Adsense

Donate


主页
Minidx的IFilter Com组件中的一个字符串处理函数

声明:可任意转载,复制,修改,以及用于任何您所希望的目的而与作者无关。 

Minidx 的 IFilter COM 组件中的一个字符串处理函数:过滤掉一些特殊字符,并将其转换为对应的标准 ASCII 字符。

因为要处理包括欧洲以及阿拉伯等语言在内的一些特殊字符,所以 case 分支比较多,有需要的可以自行选择性使用 :)

// Valid characters are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
//                      [#x10000-#x10FFFF]

// Blank out a wide character that is not acceptable in the output.
//
// Left unchanged:
//    - TAB (0x0009), LF (0x000A) and CR (0x000D)
//    - 0x0020 .. 0xD7FF
//    - 0xF900 .. 0xFFFD
// Everything else is overwritten with L' ': the remaining characters below
// the ASCII space, surrogates (0xD800-0xDFFF), the private use area
// (0xE000-0xF8FF), 0xFFFE/0xFFFF and any value above 0xFFFF.
//
// ch: character to check; modified in place.
inline static void ValidUnicode(wchar_t & ch)
{
   const bool isAllowedControl = (ch == 0x0009)       // TAB
                              || (ch == 0x000A)       // LF
                              || (ch == 0x000D);      // CR

   const bool isBelowSurrogates = (ch >= 0x0020) && (ch <= 0xD7FF);

   // The private use area ends at 0xF8FF inclusive, so the accepted range
   // has to start at 0xF900 (not 0xF8FF) to keep private use characters out.
   const bool isAbovePrivateUse = (ch >= 0xF900) && (ch <= 0xFFFD);

   if (!(isAllowedControl || isBelowSurrogates || isAbovePrivateUse))
      ch = L' ';                 // morph to blank
}

static void CleanUpCharacters(size_t chBuf, wchar_t *buf)
{
   // simplified form to make parsing easier.

   buf[chBuf] = 0;   // must be null terminated..

   for (size_t i = 0; i < chBuf; ++i)
   {
      wchar_t & ch = buf[i];

      switch (ch)
      {
         case 0:        // embedded null
         case 0x2000:   // en quad
         case 0x2001:   // em quad
         case 0x2002:   // en space
         case 0x2003:   // em space
         case 0x2004:   // three-per-em space
         case 0x2005:   // four-per-em space
         case 0x2006:   // six-per-em space
         case 0x2007:   // figure space
         case 0x2008:   // puctuation space
         case 0x2009:   // thin space
         case 0x200A:   // hair space
         case 0x200B:   // zero-width space
         case 0x200C:   // zero-width non-joiner
         case 0x200D:   // zero-width joiner
         case 0x202f:   // no-break space
         case 0x3000:   // ideographic space
            ch = L' ';
            break;

         case 0x00B6:   // pilcro
         case 0x2028:   // line seperator
         case 0x2029:   // paragraph seperator
            ch = L'\n';
            break;

         case 0x00AD:   // soft-hyphen
         case 0x00B7:   // middle dot
         case 0x2010:   // hyphen
         case 0x2011:   // non-breaking hyphen
         case 0x2012:   // figure dash
         case 0x2013:   // en dash
         case 0x2014:   // em dash
         case 0x2015:   // quote dash
         case 0x2027:   // hyphenation point
         case 0x2043:   // hyphen bullet
         case 0x208B:   // subscript minus
         case 0xFE31:   // vertical em dash
         case 0xFE32:   // vertical en dash
         case 0xFE58:   // small em dash
         case 0xFE63:   // small hyphen minus
         case 0xFF0D:   // 2007/05/08 TEI zenkaku -
            ch = L'-';
            break;

         case 0x00B0:   // degree
         case 0x2018:   // left single quote
         case 0x2019:   // right single quote
         case 0x201A:   // low right single quote
         case 0x201B:   // high left single quote
         case 0x2032:   // prime
         case 0x2035:   // reversed prime
         case 0x2039:   // left-pointing angle quotation mark
         case 0x203A:   // right-pointing angle quotation mark
            ch = L'\'';
            break;
            
         case 0x201C:   // left double quote
         case 0x201D:   // right double quote
         case 0x201E:   // low right double quote
         case 0x201F:   // high left double quote
         case 0x2033:   // double prime
         case 0x2034:   // triple prime
         case 0x2036:   // reversed double prime
         case 0x2037:   // reversed triple prime
         case 0x00AB:   // left-pointing double angle quotation mark
         case 0x00BB:   // right-pointing double angle quotation mark
         case 0x3003:   // ditto mark
         case 0x301D:   // reversed double prime quotation mark
         case 0x301E:   // double prime quotation mark
         case 0x301F:   // low double prime quotation mark
            ch = L'\"';
            break;
            
         case 0x00A7:   // section-sign
         case 0x2020:   // dagger
         case 0x2021:   // double-dagger
         case 0x2022:   // bullet
         case 0x2023:   // triangle bullet
         case 0x203B:   // reference mark
         case 0xFE55:   // small colon
         case 0xFF1A:   // chinese : 65306
            ch = L':';
            break;

         case 0x2024:   // one dot leader
         case 0x2025:   // two dot leader
         case 0x2026:   // elipsis
         case 0x3002:   // ideographic full stop
         case 0xFE30:   // two dot vertical leader
         case 0xFE52:   // small full stop
         case 0x30FB:   // 2007/05/08 ADD-- zenkaku .
         case 0xFF0E:    // 2007/05/10 ADD-- JP .
            ch = L'.';
            break;

         case 0x3001:   // ideographic comma
         case 0xFE50:   // small comma
         case 0xFE51:   // small ideographic comma
         case 0xFF0C:    // chinese 65292
            ch = L',';
            break;
            
         case 0xFE54:   // small semicolon
         case 0xFF1B:   // 2007/05/08 ADD
            ch = L';';
            break;

         case 0x00A6:   // broken-bar
         case 0x2016:   // double vertical line
            ch = L'|';
            break;

         case 0x2017:   // double low line
         case 0x203E:   // overline
         case 0x203F:   // undertie
         case 0x2040:   // character tie
         case 0xFE33:   // vertical low line
         case 0xFE49:   // dashed overline
         case 0xFE4A:   // centerline overline
         case 0xFE4D:   // dashed low line
         case 0xFE4E:   // centerline low line
            ch = L'_';
            break;
            
         case 0x301C:   // wave dash
         case 0x3030:   // wavy dash
         case 0xFE34:   // vertical wavy low line
         case 0xFE4B:   // wavy overline
         case 0xFE4C:   // double wavy overline
         case 0xFE4F:   // wavy low line
         case 0xFF5E:   // 2007/04/09 TEI  zenkaku
            ch = L'~';
            break;
            
         case 0x2038:   // caret
         case 0x2041:   // caret insertion point
            ch = L'^';
            break;

         case 0x2030:   // per-mille
         case 0x2031:   // per-ten thousand
         case 0xFE6A:   // small per-cent
         case 0xFF05:   // Asia per-cent
            ch = L'%';
            break;
            
         case 0xFE6B:   // small commercial at
            ch = L'@';
            break;
            
         case 0x00A9:   // copyright
            ch = L'c';
            break;

         case 0x00B5:   // micro
            ch = L'u';
            break;
   
         case 0x00AE:   // registered
            ch = L'r';
            break;

         case 0x207A:   // superscript plus
         case 0x208A:   // subscript plus
         case 0xFE62:   // small plus
         case 0xFF0B:   // 2007/05/13 TEI  ---- zenkaku +
            ch = L'+';
            break;
            
         case 0x2044:   // fraction slash
            ch = L'/';
            break;

         case 0x2042:   // asterism
         case 0xFE61:   // small asterisk
         case 0xFF0A:   // Asia asterisk
            ch = L'*';
            break;
            
         case 0x208C:   // subscript equal
         case 0xFE66:   // small equal
            ch = L'=';
            break;
            
         case 0xFE68:   // small reverse solidus
            ch = L'\\';
            break;
            
         case 0xFE5F:   // small number sign
         case 0xFF03:   // Asia number sign
            ch = L'#';
            break;
            
         case 0xFE60:   // small ampersand
         case 0xFF06:   // Asia ampersand
            ch = L'&';
            break;
            
         case 0xFE69:   // small dollar sign
         case 0xFF04:   // Asia dollar sign
            ch = L'$';
            break;
            
         case 0x2045:   // left square bracket with quill
         case 0x3010:   // left black lenticular bracket
         case 0x3016:   // left white lenticular bracket
         case 0x301A:   // left white square bracket
         case 0xFE3B:   // vertical left lenticular bracket
            ch = L'[';
            break;
            
         case 0x2046:   // right square bracket with quill
         case 0x3011:   // right black lenticular bracket
         case 0x3017:   // right white lenticular bracket
         case 0x301B:   // right white square bracket
         case 0xFE3C:   // vertical right lenticular bracket
            ch = L']';
            break;
            
         case 0x208D:   // subscript left parenthesis
         case 0x3014:   // left tortise-shell bracket
         case 0x3018:   // left white tortise-shell bracket
         case 0xFE35:   // vertical left parenthesis
         case 0xFE39:   // vertical left tortise-shell bracket
         case 0xFE59:   // small left parenthesis
         case 0xFE5D:   // small left tortise-shell bracket
         case 0xFF08:   // chinese (
            ch = L'(';
            break;
            
         case 0x208E:   // subscript right parenthesis
         case 0x3015:   // right tortise-shell bracket
         case 0x3019:   // right white tortise-shell bracket
         case 0xFE36:   // vertical right parenthesis
         case 0xFE3A:   // vertical right tortise-shell bracket
         case 0xFE5A:   // small right parenthesis
         case 0xFE5E:   // small right tortise-shell bracket
         case 0xFF09:   // chinese )
            ch = L')';
            break;
            
         case 0x3008:   // left angle bracket
         case 0x300A:   // left double angle bracket
         case 0xFF3D:   // vertical left double angle bracket
         case 0xFF3F:   // vertical left angle bracket
         case 0xFF64:   // small less-than
         case 0xFF1C:   // 2007/04/09  add zenkaku <
            ch = L'<';
            break;
            
         case 0x3009:   // right angle bracket
         case 0x300B:   // right double angle bracket
         case 0xFF3E:   // vertical right double angle bracket
         case 0xFF40:   // vertical right angle bracket
         case 0xFF65:   // small greater-than
         case 0xFF1E:   // 2007/04/09 add zenkaku >
            ch = L'>';
            break;
            
         case 0xFE37:   // vertical left curly bracket
         case 0xFE5B:   // small left curly bracket
            ch = L'{';
            break;
            
         case 0xFE38:   // vertical right curly bracket
         case 0xFE5C:   // small right curly bracket
            ch = L'}';
            break;
            
         case 0x00A1:   // inverted exclamation mark
         case 0x00AC:   // not
         case 0x203C:   // double exclamation mark
         case 0x203D:   // interrobang
         case 0xFE57:   // small exclamation mark
         case 0xFF01:   // chinese !
            ch = L'!';
            break;

         case 0x00BF:   // inverted question mark
         case 0xFE56:   // small question mark
         case 0xFF1F:   // Chinese/Japanese ?
            ch = L'?';
            break;

         case 0x00B9:   // superscript one
            ch = L'1';
            break;

         case 0x00B2:   // superscript two
            ch = L'2';
            break;
            
         case 0x00B3:   // superscript three
            ch = L'3';
            break;

         case 0x2070:   // superscript zero
         case 0x2074:   // superscript four
         case 0x2075:   // superscript five
         case 0x2076:   // superscript six
         case 0x2077:   // superscript seven
         case 0x2078:   // superscript eight
         case 0x2079:   // superscript nine
         case 0x2080:   // subscript zero
         case 0x2081:   // subscript one
         case 0x2082:   // subscript two
         case 0x2083:   // subscript three
         case 0x2084:   // subscript four
         case 0x2085:   // subscript five
         case 0x2086:   // subscript six
         case 0x2087:   // subscript seven
         case 0x2088:   // subscript eight
         case 0x2089:   // subscript nine
         case 0x3021:   // Hangzhou numeral one
         case 0x3022:   // Hangzhou numeral two
         case 0x3023:   // Hangzhou numeral three
         case 0x3024:   // Hangzhou numeral four
         case 0x3025:   // Hangzhou numeral five
         case 0x3026:   // Hangzhou numeral six
         case 0x3027:   // Hangzhou numeral seven
         case 0x3028:   // Hangzhou numeral eight
         case 0x3029:   // Hangzhou numeral nine
            ch = (ch & 0x000F) + L'0';
            break;

         // ONE is at ZERO location... careful
         case 0x3220:   // parenthesized ideograph one
         case 0x3221:   // parenthesized ideograph two
         case 0x3222:   // parenthesized ideograph three
         case 0x3223:   // parenthesized ideograph four
         case 0x3224:   // parenthesized ideograph five
         case 0x3225:   // parenthesized ideograph six
         case 0x3226:   // parenthesized ideograph seven
         case 0x3227:   // parenthesized ideograph eight
         case 0x3228:   // parenthesized ideograph nine
         case 0x3280:   // circled ideograph one
         case 0x3281:   // circled ideograph two
         case 0x3282:   // circled ideograph three
         case 0x3283:   // circled ideograph four
         case 0x3284:   // circled ideograph five
         case 0x3285:   // circled ideograph six
         case 0x3286:   // circled ideograph seven
         case 0x3287:   // circled ideograph eight
         case 0x3288:   // circled ideograph nine
            ch = (ch & 0x000F) + L'1';
            break;
            
         case 0x3007:   // ideographic number zero
         case 0x24EA:   // circled number zero
            ch = L'0';
            break;
            
         default:
            if (0xFF10 <= ch           // Japanese zenkaku 0
                && ch <= 0xFF19)       // Japanese zenkaku 9
            {
               ch = ch - 0xFF10 + L'0';             
            }
            // 2007/05/08 ADD BEGIN--------
            else if (0xFF21 <= ch      // zenkaku A
                     && ch <= 0xFF3A)  // zenkaku Z
            {
               ch = ch - 0xFF21 + L'A';
            }
            else if (0xFF41 <= ch      // zenkaku a
                     && ch <= 0xFF5A)  // zenkaku z
            {
               ch = ch - 0xFF41 + L'a';
            }
            // 2007/05/08 ADD END---------
            else if (0xFF01 <= ch           // fullwidth exclamation mark
                && ch <= 0xFF5E)       // fullwidth tilde
            {
               // the fullwidths line up with ASCII low subset
               ch = ch & 0xFF00 + L'!' - 1;               
            }
            else if (0x2460 <= ch      // circled one
                     && ch <= 0x2468)  // circled nine
            {
               ch = ch - 0x2460 + L'1';
            }
            else if (0x2474 <= ch      // parenthesized one
                     && ch <= 0x247C)  // parenthesized nine
            {
               ch = ch - 0x2474 + L'1';
            }
            else if (0x2488 <= ch      // one full stop
                     && ch <= 0x2490)  // nine full stop
            {
               ch = ch - 0x2488 + L'1';
            }
            else if (0x249C <= ch      // parenthesized small a
                     && ch <= 0x24B5)  // parenthesized small z
            {
               ch = ch - 0x249C + L'a';
            }
            else if (0x24B6 <= ch      // circled capital A
                     && ch <= 0x24CF)  // circled capital Z
            {
               ch = ch - 0x24B6 + L'A';
            }
            else if (0x24D0 <= ch      // circled small a
                     && ch <= 0x24E9)  // circled small z
            {
               ch = ch - 0x24D0 + L'a';
            }
            else if (0x2500 <= ch      // box drawing (begin)
                     && ch <= 0x257F)  // box drawing (end)
            {
               ch = L'|';
            }
            else if (0x2580 <= ch      // block elements (begin)
                     && ch <= 0x259F)  // block elements (end)
            {
               ch = L'#';
            }
            else if (0x25A0 <= ch      // geometric shapes (begin)
                     && ch <= 0x25FF)  // geometric shapes (end)
            {
               ch = L'*';
            }
            else if (0x2600 <= ch      // dingbats (begin)
                     && ch <= 0x267F)  // dingbats (end)
            {
               ch = L'.';
            }
            else
               ValidUnicode(ch);   // validate that it's legit Unicode
            break;
      }
   }
}

 
< Prev   Next >
© 2017 Minidx文件管理系统 | Minidx全文搜索引擎
Minidx! is a professional file management system.