Unicode與UTF-8編碼轉換(一)
創新互聯認為,企業網站是要賦予品質、思維、人性,深入到用戶內心的細膩情感,才能真正稱得上企業網站。創新互聯根據每位用戶內心最深的需求提供網站建設服務,堅實的設計執行是品牌長期視覺塑造的重要支持。
Unicode是一個符號集合,規定了符號的二進制代碼,而UTF-8是Unicode的一種實現,具體Unicode和UTF-8的聯繫如下所示:
  | Unicode符號范圍(十六進制) | UTF-8編碼規則(二進制)
1 | 0000 0000 - 0000 007F | 0xxxxxxx
2 | 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
3 | 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
4 | 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 | 0020 0000 - 03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
(注:Unicode標準的合法碼點上限為0x10FFFF,故第4行止於0010 FFFF;第5、6行是舊版UTF-8(RFC 2279)才有的5、6字節形式,在舊版規則中0011 0000 - 001F FFFF同樣使用4字節編碼。)
由上表可清晰地知道Unicode和UTF-8間的聯繫。其中UTF-8編碼規則中的x就是用來填入Unicode碼二進制位的位置。接下來逐一舉例說明各段的編碼:
1,范圍0x00-0x7F:給定的用例Unicode碼為0x41,對應的二進制為:0100 0001,而UTF-8編碼規則為:0xxxxxxx。故有:
  0xxx xxxx
+  100 0001
  0100 0001
所以Unicode編碼0x41轉換為UTF-8后為:0x41。
所以區間0x00-0x7F內的Unicode編碼與UTF-8編碼是一致的,即與ASCII碼一致(ASCII共規定了128個字符的編碼)。
2,范圍0x80-0x7FF:給定的用例Unicode碼為0x123,對應的二進制為:0001 0010 0011,而UTF-8編碼規則為:110xxxxx 10xxxxxx。故有:
  110x xxxx 10xx xxxx
+    0 0100   10 0011
  1100 0100 1010 0011
所以Unicode編碼0x123轉換為UTF-8后為:0xC4A3
3,范圍0x800-0xFFFF:給定的用例Unicode碼為0x4E25,對應的二進制為:0100 1110 0010 0101,而UTF-8編碼規則為:1110xxxx 10xxxxxx 10xxxxxx,故有:
  1110 xxxx 10xx xxxx 10xx xxxx
+      0100   11 1000   10 0101
  1110 0100 1011 1000 1010 0101
所以Unicode編碼為0x4E25轉換為UTF-8后為:0xE4B8A5
4,范圍0x10000-0x10FFFF:給定的Unicode碼為0x23456,對應的二進制為:0010 0011 0100 0101 0110,而UTF-8編碼規則為:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:
  1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
+       000   10 0011   01 0001   01 0110
  1111 0000 1010 0011 1001 0001 1001 0110
所以Unicode編碼為0x23456轉換UTF-8后為:0xF0A39196
5,范圍0x200000-0x3FFFFFF:給定的Unicode碼為0x234567,對應的二進制為:0010 0011 0100 0101 0110 0111,UTF-8編碼規則為:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:
  1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx
+        00   00 1000   11 0100   01 0101   10 0111
  1111 1000 1000 1000 1011 0100 1001 0101 1010 0111
所以Unicode編碼為0x234567轉換UTF-8后為:0xF888B495A7
6,范圍0x4000000-0x7FFFFFFF:給定的Unicode碼為0x34561234,對應的二進制為:0011 0100 0101 0110 0001 0010 0011 0100,UTF-8編碼規則為:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。故有:
  1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx
+         0   11 0100   01 0101   10 0001   00 1000   11 0100
  1111 1100 1011 0100 1001 0101 1010 0001 1000 1000 1011 0100
所以Unicode編碼為0x34561234轉換UTF-8后為:0xFCB495A188B4
1,通過以上案例分析可得如下單字符Unicode編碼轉UTF-8程序為:
1)由于本系統采用大頭方式(Big endian),所以先打印出來的是高位的值。
2)實現思路:移動指定的位數,使該字節處于易于操作的位置或使操作完的值達到指定位置,使用與運算取得指定位上的值,使用或運算實現相加效果。
/*
 * Encode a single code point as UTF-8 using the legacy 1- to 6-byte scheme.
 *
 * unicode: code point to encode, 0x0 .. 0x7FFFFFFF.
 * utf:     output buffer; receives the encoded bytes (lead byte first)
 *          followed by a terminating '\0'.  The longest form is 6 bytes,
 *          so the buffer must hold at least 7 bytes.
 *
 * Returns the number of encoded bytes (1..6, terminator not counted), or
 * -1 after printing a diagnostic when the value is above 0x7FFFFFFF.
 *
 * The 4-byte form carries 21 payload bits, so it is used for the whole
 * range 0x10000 .. 0x1FFFFF.  Stopping it at 0x10FFFF while still accepting
 * the 5-byte form from 0x200000 upward would leave 0x110000 .. 0x1FFFFF
 * unencodable, so the ranges are kept contiguous here.
 *
 * No values inside 0x0 .. 0x7FFFFFFF are rejected (surrogates included).
 */
int unicode_to_utf( unsigned long unicode, unsigned char *utf )
{
    int size = 0;

    assert( utf );

    /* Each branch only needs an upper bound: the smaller ranges have
     * already been handled by the branches above it. */
    if ( unicode <= 0x7F ) {
        /* 0xxxxxxx */
        utf[size++] = (unsigned char)( unicode & 0x7F );
    } else if ( unicode <= 0x7FF ) {
        /* 110xxxxx 10xxxxxx */
        utf[size++] = (unsigned char)( ( ( unicode >> 6 ) & 0x1F ) | 0xC0 );
        utf[size++] = (unsigned char)( ( unicode & 0x3F ) | 0x80 );
    } else if ( unicode <= 0xFFFF ) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        utf[size++] = (unsigned char)( ( ( unicode >> 12 ) & 0x0F ) | 0xE0 );
        utf[size++] = (unsigned char)( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( unicode & 0x3F ) | 0x80 );
    } else if ( unicode <= 0x1FFFFF ) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        utf[size++] = (unsigned char)( ( ( unicode >> 18 ) & 0x07 ) | 0xF0 );
        utf[size++] = (unsigned char)( ( ( unicode >> 12 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( unicode & 0x3F ) | 0x80 );
    } else if ( unicode <= 0x3FFFFFF ) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        utf[size++] = (unsigned char)( ( ( unicode >> 24 ) & 0x03 ) | 0xF8 );
        utf[size++] = (unsigned char)( ( ( unicode >> 18 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 12 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( unicode & 0x3F ) | 0x80 );
    } else if ( unicode <= 0x7FFFFFFF ) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        utf[size++] = (unsigned char)( ( ( unicode >> 30 ) & 0x01 ) | 0xFC );
        utf[size++] = (unsigned char)( ( ( unicode >> 24 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 18 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 12 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
        utf[size++] = (unsigned char)( ( unicode & 0x3F ) | 0x80 );
    } else {
        printf( "Error : unknow scope\n" );
        return -1;
    }

    utf[size] = '\0';
    return size;
}
測試用例如下:
/*
 * Demo driver for unicode_to_utf(): encodes a list of sample code points
 * and prints the bytes of each result with utf_print().  The last sample is
 * deliberately out of range to exercise the error path.
 */
int main( int argc, char *argv[] )
{
    const unsigned long samples[] = {
        0x55,
        0x123,
        0x4E25,
        0x23456,
        0x234567,
        0x34561234,
        0x8FFFFFFF  /* error */
    };
    const unsigned int sample_count = sizeof( samples ) / sizeof( samples[0] );
    unsigned char utf[7] = { 0 };
    unsigned int i;

    for ( i = 0; i < sample_count; i++ ) {
        int size;

        /* Start every conversion from a clean buffer. */
        memset( utf, 0x00, sizeof( utf ) );
        size = unicode_to_utf( samples[i], utf );
        utf_print( utf, size );
    }

    return 0;
}
打印函數如下:
/*
 * Print the bytes of an encoded sequence as upper-case hex on one line.
 *
 * utf:  bytes to print.
 * size: number of bytes to print; -1 marks a failed conversion, in which
 *       case only a diagnostic line is printed.
 */
void utf_print( unsigned char *utf, int size )
{
    int index;

    if ( size == -1 ) {
        printf( "unknow scope\n" );
        return;
    }

    for ( index = 0; index < size; index += 1 ) {
        /* Always two digits per byte so that a byte below 0x10 cannot be
         * misread (0x05 prints as "05", not "5"). */
        printf( "%02X", (unsigned int)utf[index] );
    }
    printf( "\n" );
}
2,單字符UTF-8編碼轉Unicode編碼:
/*
 * Decode one UTF-8 sequence (legacy 1- to 6-byte scheme) into its code point.
 *
 * utf:     the bytes of the sequence packed into an integer with the lead
 *          byte in the most significant position (E4 B8 A5 is passed as
 *          0xE4B8A5).
 * unicode: output buffer; receives the code point as big-endian bytes
 *          followed by a '\0'.  At most 4 value bytes are written, so the
 *          buffer must hold at least 5 bytes.
 *
 * Returns the number of value bytes written: 1 for a 1-byte sequence, 2 for
 * 2- and 3-byte sequences, 3 for 4-byte sequences and 4 for 5- and 6-byte
 * sequences.  Returns -1 after printing a diagnostic when utf lies outside
 * every sequence range.
 *
 * Only the range of the packed value is checked; the 10xxxxxx marker bits of
 * the continuation bytes are masked off without being verified.
 *
 * 5- and 6-byte sequences need more than 32 bits, so they can only be passed
 * in where unsigned long is 64 bits wide.
 */
int utf_to_unicode( unsigned long utf, unsigned char *unicode )
{
    /* Shift in a type that is at least 64 bits wide: shifting a 32-bit
     * unsigned long by 32 or 40 would be undefined behaviour. */
    const unsigned long long seq = utf;
    unsigned long long code;
    int size;
    int i;

    if ( seq <= 0x7FULL ) {
        /* 0xxxxxxx */
        code = seq & 0x7F;
        size = 1;
    } else if ( seq >= 0xC080ULL && seq <= 0xDFBFULL ) {
        /* 110xxxxx 10xxxxxx -- lead bytes run up to 0xDF, not 0xCF */
        code = ( ( ( seq >> 8 ) & 0x1F ) << 6 )
             | ( seq & 0x3F );
        size = 2;
    } else if ( seq >= 0xE08080ULL && seq <= 0xEFBFBFULL ) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        code = ( ( ( seq >> 16 ) & 0x0F ) << 12 )
             | ( ( ( seq >> 8 ) & 0x3F ) << 6 )
             | ( seq & 0x3F );
        size = 2;
    } else if ( seq >= 0xF0808080ULL && seq <= 0xF7BFBFBFULL ) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        code = ( ( ( seq >> 24 ) & 0x07 ) << 18 )
             | ( ( ( seq >> 16 ) & 0x3F ) << 12 )
             | ( ( ( seq >> 8 ) & 0x3F ) << 6 )
             | ( seq & 0x3F );
        size = 3;
    } else if ( seq >= 0xF880808080ULL && seq <= 0xFBBFBFBFBFULL ) {
        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        code = ( ( ( seq >> 32 ) & 0x03 ) << 24 )
             | ( ( ( seq >> 24 ) & 0x3F ) << 18 )
             | ( ( ( seq >> 16 ) & 0x3F ) << 12 )
             | ( ( ( seq >> 8 ) & 0x3F ) << 6 )
             | ( seq & 0x3F );
        size = 4;
    } else if ( seq >= 0xFC8080808080ULL && seq <= 0xFDBFBFBFBFBFULL ) {
        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        code = ( ( ( seq >> 40 ) & 0x01 ) << 30 )
             | ( ( ( seq >> 32 ) & 0x3F ) << 24 )
             | ( ( ( seq >> 24 ) & 0x3F ) << 18 )
             | ( ( ( seq >> 16 ) & 0x3F ) << 12 )
             | ( ( ( seq >> 8 ) & 0x3F ) << 6 )
             | ( seq & 0x3F );
        size = 4;
    } else {
        printf( "Error : unknow scope\n" );
        return -1;
    }

    /* Emit the code point most significant byte first. */
    for ( i = 0; i < size; i++ ) {
        unicode[i] = (unsigned char)( ( code >> ( 8 * ( size - 1 - i ) ) ) & 0xFF );
    }
    unicode[size] = '\0';

    return size;
}
測試用例:
/*
 * Demo driver for utf_to_unicode(): decodes a list of packed UTF-8 sample
 * sequences and prints each resulting code point with unicode_print().
 * The last sample lies outside every range accepted by utf_to_unicode().
 */
int main( int argc, char *argv[] )
{
    const unsigned long samples[] = {
        0x55,
        0xC480,
        0xE4B8A5,
        0xF0A39196,
        0xF888B495A7,
        0xFCB495A188B4,
        0xFEBFBFBFBFBF
    };
    const unsigned int sample_count = sizeof( samples ) / sizeof( samples[0] );
    unsigned char unicode[9];
    unsigned int i;

    for ( i = 0; i < sample_count; i++ ) {
        int size = 0;

        /* Start every conversion from a clean buffer. */
        memset( unicode, 0x00, sizeof( unicode ) );
        size = utf_to_unicode( samples[i], unicode );
        unicode_print( unicode, size );
    }

    return 0;
}
打印函數如下:
/*
 * Print a decoded code point, given as a byte array, as two-digit
 * upper-case hex values on one line.
 *
 * unicode: bytes to print.
 * size:    number of bytes to print; -1 marks a failed conversion, in
 *          which case only a diagnostic line is printed.
 */
void unicode_print( unsigned char *unicode, int size )
{
    const unsigned char *cursor = unicode;
    int remaining = size;

    if ( size == -1 ) {
        printf("Error : unknow scope\n");
        return;
    }

    while ( remaining > 0 ) {
        printf( "%02X", *cursor );
        cursor++;
        remaining--;
    }
    printf("\n");
}
本文參考文獻:http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html