Unicode編碼(二)-中文字符篩選
創新互聯公司-專業網站定制、快速模板網站建設、高性價比鼓樓網站開發、企業建站全套包干低至880元,成熟完善的模板庫,直接使用。一站式鼓樓網站制作公司更省心,省錢,快速模板網站建設找我們,業務覆蓋鼓樓地區。費用合理售后完善,十年實體公司更值得信賴。
1,UTF-8編碼中三字節中文字符的篩選方法如下:
/*
 * Decide whether a 16-bit code point is one of the "Chinese" characters this
 * file wants to keep.
 *
 * one - high byte of the code point (bits 15..8)
 * two - low byte of the code point (bits 7..0)
 *
 * Returns 1 when the code point one:two lies in any of the ranges in the
 * table below, and -1 otherwise.
 */
int chinese_filter(unsigned char one, unsigned char two )
{
    /* Inclusive [first, last] code-point ranges that are accepted.  The block
     * names are given for orientation; the E4xx-E8xx entries fall inside the
     * Unicode Private Use Area and are presumably vendor-specific ideograph
     * assignments taken from the article's reference table. */
    static const struct {
        unsigned int first;
        unsigned int last;
    } ranges[] = {
        { 0x4E00, 0x9FCB },   /* CJK Unified Ideographs */
        { 0x3400, 0x4DB5 },   /* CJK Unified Ideographs Extension A */
        { 0x2F00, 0x2FD5 },   /* Kangxi Radicals */
        { 0x2E80, 0x2EF3 },   /* CJK Radicals Supplement */
        { 0xF900, 0xFAD9 },   /* CJK Compatibility Ideographs */
        { 0xE815, 0xE86F },   /* private use */
        { 0xE400, 0xE5E8 },   /* private use */
        { 0xE600, 0xE6CF },   /* private use */
        { 0x31C0, 0x31E3 },   /* CJK Strokes */
        { 0x2FF0, 0x2FFB },   /* Ideographic Description Characters */
        { 0x3105, 0x3120 },   /* Bopomofo */
        { 0x31A0, 0x31BA }    /* Bopomofo Extended */
    };
    const unsigned int cp = ( (unsigned int)one << 8 ) | two;
    unsigned int i;

    for ( i = 0; i < sizeof ranges / sizeof ranges[0]; i++ ) {
        if ( cp >= ranges[i].first && cp <= ranges[i].last ) {
            return 1;
        }
    }
    return -1;
}
2,UTF-8編碼中四字節中文字符篩選方法如下:
/*
 * Decide whether a supplementary-plane code point, given as three bytes
 * (one = bits 23..16, two = bits 15..8, thr = bits 7..0), is one of the
 * accepted ideograph ranges.  Only values whose top byte is 0x02 can match.
 *
 * Returns 1 on a match and -1 otherwise.
 */
int chinese_filter2( unsigned char one, unsigned char two, unsigned char thr )
{
    unsigned int low16;

    /* Every accepted range starts with the byte 0x02. */
    if ( one != 0x02 ) {
        return -1;
    }

    low16 = ( (unsigned int)two << 8 ) | thr;

    if ( low16 <= 0xA6D6 ) {                        /* 20000-2A6D6 */
        return 1;
    }
    if ( low16 >= 0xA700 && low16 <= 0xB734 ) {     /* 2A700-2B734 */
        return 1;
    }
    if ( low16 >= 0xB740 && low16 <= 0xB81D ) {     /* 2B740-2B81D */
        return 1;
    }
    if ( low16 >= 0xF800 && low16 <= 0xFA1D ) {     /* 2F800-2FA1D */
        return 1;
    }
    return -1;
}
3,UTF-8字符轉Unicode編碼:
1)src為輸入的UTF-8字符串
2)unicode為UTF-8字符串轉換后輸出的Unicode編碼串
3)chs為字符串中篩選出來的中文字符
/* Length in bytes of the UTF-8 sequence announced by lead byte `lead`,
 * or 0 when `lead` cannot start a sequence. */
static int utf8_seq_len( unsigned char lead )
{
    if ( lead < 0x80 ) { return 1; }    /* 0xxxxxxx: ASCII */
    if ( lead < 0xC0 ) { return 0; }    /* 10xxxxxx: stray continuation byte */
    if ( lead < 0xE0 ) { return 2; }    /* 110xxxxx */
    if ( lead < 0xF0 ) { return 3; }    /* 1110xxxx */
    if ( lead < 0xF8 ) { return 4; }    /* 11110xxx */
    if ( lead < 0xFC ) { return 5; }    /* 111110xx (legacy form) */
    if ( lead < 0xFE ) { return 6; }    /* 1111110x (legacy form) */
    return 0;                           /* 0xFE / 0xFF never occur in UTF-8 */
}

/*
 * Decode a NUL-terminated UTF-8 string into a packed sequence of code-point
 * bytes and, in parallel, collect the UTF-8 bytes of every character accepted
 * by chinese_filter() / chinese_filter2().
 *
 * src     - NUL-terminated UTF-8 input.
 * unicode - output buffer for the decoded code points, most significant byte
 *           first.  Each character occupies a width that depends on its
 *           UTF-8 length: 1 byte for a 1-byte sequence, 2 bytes for 2- and
 *           3-byte sequences, 3 bytes for 4-byte sequences and 4 bytes for
 *           the legacy 5- and 6-byte sequences.  A terminating '\0' is
 *           appended.  The output never exceeds strlen(src) + 1 bytes.
 * chs     - output buffer receiving the original UTF-8 bytes of the 3- and
 *           4-byte characters for which the filter returned 1.  A terminating
 *           '\0' is appended; strlen(src) + 1 bytes are always sufficient.
 *
 * Returns the number of bytes written to `unicode` (terminator excluded), or
 * -1 after printing an error message when the input contains a byte that
 * cannot start a sequence or a sequence whose trail bytes are missing or
 * malformed.  On error both output buffers are terminated at the point
 * reached so far.
 */
int utf_to_unicode( unsigned char *src, unsigned char *unicode, unsigned char *chs )
{
    int size = 0;
    int ch_len = 0;

    while ( *src ) {
        const unsigned char *b = src;
        int seq_len = utf8_seq_len( b[0] );
        int keep = 0;
        int i;

        /* Every trail byte must look like 10xxxxxx.  Checking them in order
         * also stops at the NUL terminator of a truncated sequence, so the
         * decoder never reads past the end of the string. */
        for ( i = 1; i < seq_len; i++ ) {
            if ( ( b[i] & 0xC0 ) != 0x80 ) {
                seq_len = 0;
                break;
            }
        }

        if ( seq_len == 0 ) {
            printf( "Error: unknoe scope\n" );
            unicode[size] = '\0';
            chs[ch_len] = '\0';
            return -1;
        }

        /* The high-order byte is always emitted first (big-endian order). */
        switch ( seq_len ) {
        case 1:
            unicode[size++] = b[0];
            break;

        case 2:
            /* 110aaabb 10cccccc -> 00000aaa bbcccccc */
            unicode[size++] = (unsigned char)( ( b[0] & 0x1C ) >> 2 );
            unicode[size++] = (unsigned char)( ( ( b[0] & 0x03 ) << 6 ) | ( b[1] & 0x3F ) );
            break;

        case 3:
            /* 1110aaaa 10bbbbcc 10dddddd -> aaaabbbb ccdddddd */
            unicode[size++] = (unsigned char)( ( ( b[0] & 0x0F ) << 4 ) | ( ( b[1] & 0x3C ) >> 2 ) );
            unicode[size++] = (unsigned char)( ( ( b[1] & 0x03 ) << 6 ) | ( b[2] & 0x3F ) );
            keep = ( chinese_filter( unicode[size - 2], unicode[size - 1] ) == 1 );
            break;

        case 4:
            /* 11110aaa 10bbcccc 10ddddee 10ffffff -> 000aaabb ccccdddd eeffffff */
            unicode[size++] = (unsigned char)( ( ( b[0] & 0x07 ) << 2 ) | ( ( b[1] & 0x30 ) >> 4 ) );
            unicode[size++] = (unsigned char)( ( ( b[1] & 0x0F ) << 4 ) | ( ( b[2] & 0x3C ) >> 2 ) );
            unicode[size++] = (unsigned char)( ( ( b[2] & 0x03 ) << 6 ) | ( b[3] & 0x3F ) );
            keep = ( chinese_filter2( unicode[size - 3], unicode[size - 2], unicode[size - 1] ) == 1 );
            break;

        case 5:
            unicode[size++] = (unsigned char)( b[0] & 0x03 );
            unicode[size++] = (unsigned char)( ( ( b[1] & 0x3F ) << 2 ) | ( ( b[2] & 0x30 ) >> 4 ) );
            unicode[size++] = (unsigned char)( ( ( b[2] & 0x0F ) << 4 ) | ( ( b[3] & 0x3C ) >> 2 ) );
            unicode[size++] = (unsigned char)( ( ( b[3] & 0x03 ) << 6 ) | ( b[4] & 0x3F ) );
            break;

        case 6:
            unicode[size++] = (unsigned char)( ( ( b[0] & 0x01 ) << 6 ) | ( b[1] & 0x3F ) );
            unicode[size++] = (unsigned char)( ( ( b[2] & 0x3F ) << 2 ) | ( ( b[3] & 0x30 ) >> 4 ) );
            unicode[size++] = (unsigned char)( ( ( b[3] & 0x0F ) << 4 ) | ( ( b[4] & 0x3C ) >> 2 ) );
            unicode[size++] = (unsigned char)( ( ( b[4] & 0x03 ) << 6 ) | ( b[5] & 0x3F ) );
            break;

        default:
            /* Not reachable: utf8_seq_len() only returns 0..6. */
            break;
        }

        /* Accepted characters are copied through in their UTF-8 form. */
        if ( keep ) {
            for ( i = 0; i < seq_len; i++ ) {
                chs[ch_len++] = b[i];
            }
        }

        src += seq_len;
    }

    unicode[size] = '\0';
    chs[ch_len] = '\0';
    return size;
}
4,主函數測試程序和Unicode編碼打印程序
/*
 * Print the first `size` bytes of `unicode` to stdout as two-digit uppercase
 * hexadecimal values with no separators, followed by a newline.  Nothing but
 * the newline is printed when `size` is zero or negative.
 */
void unicode_print( unsigned char *unicode, int size )
{
    const unsigned char *cursor = unicode;
    int remaining = size;

    while ( remaining > 0 ) {
        printf( "%02X", *cursor );
        cursor++;
        remaining--;
    }
    printf("\n");
}
/*
 * Demo driver: decodes a fixed UTF-8 sample string, prints the decoded bytes
 * in hexadecimal and then prints the characters the filter kept.
 *
 * Returns 0 on success and 1 when utf_to_unicode() reports an error.
 */
int main( int argc, char *argv[] )
{
    unsigned char ch4[] = "一A嚴(yán)嚴(yán)·";
    size_t len = strlen( (const char *)ch4 );
    /* Both outputs are at most as long as the input, plus a terminator. */
    unsigned char unicode[len + 1];
    unsigned char china[len + 1];
    int size;

    (void)argc;
    (void)argv;

    /* Pre-zero the buffers so they are valid C strings whatever is written. */
    memset( unicode, 0x00, sizeof unicode );
    memset( china, 0x00, sizeof china );

    size = utf_to_unicode( ch4, unicode, china );
    if ( size < 0 ) {
        return 1;
    }

    unicode_print( unicode, size );
    printf( "Chinese = %s\n", (const char *)china );
    return 0;
}
本文參考文獻:http://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php?zfj=kzb