coreBugZJ

此 blog 已弃。

Modified UTF-8 与 UTF-32 相互转换


自己的实现,经过一定的测试。

头文件
 1/*
 2Convert Modified UTF-8  <==>  UTF-32.
 3*/

 4
 5
 6/*
 7function : Convert Modified UTF-8 to UTF-32.
 8input : str_mutf8, a null terminated string in Modified UTF-8.
 9output : str_utf32, a null terminated string in UTF-32; 
        may be NULL, in which case nothing is stored and only the 
        length is computed.
10input : str_utf32_limit, the max length(character count) 
11        of str_utf32 plus one(for 'null'), str_utf32 must have enough space 
12        for str_utf32_limit characters.
13return : -1 for errors (str_mutf8 is NULL, str_utf32_limit is negative, 
        or the input is not well-formed Modified UTF-8); 
14        else the length(character count) of str_utf32, 
15                maybe larger than (str_utf32_limit-1) if the space 
16                of str_utf32 isn't enough.
17note : 0xc080 is converted to U+0000 and does not end the string;
18        0x00 is converted to U+0000 and ends the string.
19*/

20int mutf8_to_utf32( const unsigned char *str_mutf8, 
21                unsigned int *str_utf32, int str_utf32_limit );
22
23/*
24function : Convert UTF-32 to Modified UTF-8.
25input : str_utf32, a null terminated string in UTF-32.
26output : str_mutf8, a null terminated string in Modified UTF-8; 
        may be NULL, in which case nothing is stored and only the 
        length is computed.
27input : str_mutf8_limit, the max length(byte count) 
28        of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space 
29        for str_mutf8_limit bytes.
30return : -1 for errors (str_utf32 is NULL, str_mutf8_limit is negative, 
        or a code point is above U+10FFFF); 
31        else the length(byte count) of str_mutf8, 
32                maybe larger than (str_mutf8_limit-1) if the space 
33                of str_mutf8 isn't enough.
34note : U+0000 ends the string and is converted to 0x00, not to 0xc080.
35*/

36int utf32_to_mutf8( const unsigned int *str_utf32, 
37                unsigned char *str_mutf8, int str_mutf8_limit );
38
39


C代码
  1/*
  2Convert Modified UTF-8  <==>  UTF-32.
  3*/

  4
  5
  6#include "cvt_mutf8_utf32.h"
  7#include <stdio.h> 
  8
  9
 10/*
 11A U+0001 to U+007F
 120+++ ++++ u &0x80 => 0x00
 13
 14B U+0080 to U+07FF, and null character (U+0000)
 15110+ ++++ u &0xe0 => 0xc0
 1610++ ++++ v &0xc0 => 0x80
 17((u & 0x1f) << 6) + (v & 0x3f)
 18
 19C U+0800 to U+FFFF
 201110 ++++ u &0xf0 => 0xe0
 2110++ ++++ v &0xc0 => 0x80
 2210++ ++++ w &0xc0 => 0x80
 23((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
 24
 25D above U+FFFF (U+10000 to U+10FFFF)
 261110 1101 u &0xff => 0xed
 271010 ++++ v &0xf0 => 0xa0
 2810++ ++++ w &0xc0 => 0x80
 291110 1101 x &0xff => 0xed
 301011 ++++ y &0xf0 => 0xb0
 3110++ ++++ z &0xc0 => 0x80
 320x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) 
 33*/

 34
/*
 * Convert a null terminated Modified UTF-8 string to UTF-32.
 *
 * str_mutf8       : input, null terminated Modified UTF-8 bytes.
 * str_utf32       : output buffer, or NULL to only count the characters.
 * str_utf32_limit : capacity of str_utf32 in characters, terminator included.
 *
 * Returns -1 if str_mutf8 is NULL, str_utf32_limit is negative, or the input
 * is malformed. Otherwise returns the number of characters the complete
 * conversion produces (terminator excluded); a result >= str_utf32_limit
 * means the stored output was truncated.
 *
 * On success, a non-NULL str_utf32 with str_utf32_limit > 0 is always null
 * terminated. With str_utf32_limit == 0 nothing at all is written.
 *
 * The two-byte sequence 0xc0 0x80 decodes to U+0000 without ending the
 * string; a plain 0x00 byte ends the string.
 */
int mutf8_to_utf32( const unsigned char *str_mutf8,
                unsigned int *str_utf32, int str_utf32_limit ) {
        unsigned int cod, u, v, w, x, y, z;
        int len32 = 0;

        if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
                return (-1);
        }

        for ( ; ; ) {
                u = *str_mutf8++;

                if ( 0 == u ) {
                        break;
                }

                if ( 0x00 == (0x80 & u) ) {
                        /* 0+++ ++++ : one byte, U+0001 to U+007F */
                        cod = u;
                }
                else if ( 0xc0 == (0xe0 & u) ) {
                        /* 110+ ++++  10++ ++++ : two bytes (also 0xc080) */
                        v = *str_mutf8++;
                        if ( 0x80 != (0xc0 & v) ) {
                                return (-1);
                        }
                        cod = ((u & 0x1f) << 6) | (v & 0x3f);
                }
                else if ( 0xe0 == (0xf0 & u) ) {
                        /* 1110 ++++  10++ ++++  10++ ++++ : three bytes */
                        v = *str_mutf8++;
                        if ( 0x80 != (0xc0 & v) ) {
                                return (-1);
                        }
                        w = *str_mutf8++;
                        if ( 0x80 != (0xc0 & w) ) {
                                return (-1);
                        }

                        if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                                /*
                                 * 0xed 0xa. is an encoded high surrogate; it
                                 * must be followed by an encoded low
                                 * surrogate (0xed 0xb. 10++ ++++). Each byte
                                 * is checked before the next one is read so
                                 * the terminator is never overrun.
                                 */
                                x = *str_mutf8++;
                                if ( 0xed != x ) {
                                        return (-1);
                                }
                                y = *str_mutf8++;
                                if ( 0xb0 != (0xf0 & y) ) {
                                        return (-1);
                                }
                                z = *str_mutf8++;
                                if ( 0x80 != (0xc0 & z) ) {
                                        return (-1);
                                }
                                cod = 0x10000 + (
                                        ((v & 0x0f) << 16) |
                                        ((w & 0x3f) << 10) |
                                        ((y & 0x0f) << 6) |
                                        (z & 0x3f) );
                        }
                        else {
                                cod = ((u & 0x0f) << 12) |
                                        ((v & 0x3f) << 6) |
                                        (w & 0x3f);
                        }
                }
                else {
                        /* stray continuation byte or a 4+ byte lead byte */
                        return (-1);
                }

                /* keep counting after the buffer is full so the caller
                   learns the required length */
                if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
                        str_utf32[ len32 ] = cod;
                }
                ++len32;
        }

        /* terminate inside the buffer; a zero-sized buffer gets no write */
        if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
                if ( len32 < str_utf32_limit ) {
                        str_utf32[ len32 ] = 0;
                }
                else {
                        str_utf32[ str_utf32_limit-1 ] = 0;
                }
        }

        return len32;
}
129
/*
 * Convert a null terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       : input, null terminated UTF-32 code points.
 * str_mutf8       : output buffer, or NULL to only count the bytes.
 * str_mutf8_limit : capacity of str_mutf8 in bytes, terminator included.
 *
 * Returns -1 if str_utf32 is NULL, str_mutf8_limit is negative, or a code
 * point is above U+10FFFF. Otherwise returns the number of bytes the
 * complete conversion produces (terminator excluded); a result
 * >= str_mutf8_limit means the stored output was truncated.
 *
 * On success, a non-NULL str_mutf8 with str_mutf8_limit > 0 is always null
 * terminated. With str_mutf8_limit == 0 nothing at all is written.
 *
 * U+0000 ends the input and is emitted as the single terminating 0x00 byte,
 * never as 0xc0 0x80. Code points above U+FFFF are written as a surrogate
 * pair, three bytes per surrogate.
 *
 * NOTE: truncation is byte-wise, so a truncated result may end in the
 * middle of a multi-byte sequence.
 */
int utf32_to_mutf8( const unsigned int *str_utf32,
                unsigned char *str_mutf8, int str_mutf8_limit ) {
        unsigned char seq[ 6 ];
        unsigned int cod;
        int seq_len, i;
        int len8 = 0;

        if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
                return (-1);
        }

        for ( ; ; ) {
                cod = *str_utf32++;

                if ( 0 == cod ) {
                        break;
                }

                if ( 0x007f >= cod ) {
                        seq[ 0 ] = (unsigned char)cod;
                        seq_len = 1;
                }
                else if ( 0x07ff >= cod ) {
                        seq[ 0 ] = (unsigned char)(0xc0 | ((cod >> 6) & 0x1f));
                        seq[ 1 ] = (unsigned char)(0x80 | (cod & 0x3f));
                        seq_len = 2;
                }
                else if ( 0xffff >= cod ) {
                        seq[ 0 ] = (unsigned char)(0xe0 | ((cod >> 12) & 0x0f));
                        seq[ 1 ] = (unsigned char)(0x80 | ((cod >> 6) & 0x3f));
                        seq[ 2 ] = (unsigned char)(0x80 | (cod & 0x3f));
                        seq_len = 3;
                }
                else if ( 0x10ffff >= cod ) {
                        /* supplementary plane: encode as high + low surrogate */
                        cod -= 0x10000;
                        seq[ 0 ] = 0xed;
                        seq[ 1 ] = (unsigned char)(0xa0 | ((cod >> 16) & 0x0f));
                        seq[ 2 ] = (unsigned char)(0x80 | ((cod >> 10) & 0x3f));
                        seq[ 3 ] = 0xed;
                        seq[ 4 ] = (unsigned char)(0xb0 | ((cod >> 6) & 0x0f));
                        seq[ 5 ] = (unsigned char)(0x80 | (cod & 0x3f));
                        seq_len = 6;
                }
                else {
                        return (-1);
                }

                /* keep counting after the buffer is full so the caller
                   learns the required length */
                for ( i = 0; i < seq_len; ++i ) {
                        if ( (NULL != str_mutf8) && (len8 < str_mutf8_limit) ) {
                                str_mutf8[ len8 ] = seq[ i ];
                        }
                        ++len8;
                }
        }

        /* terminate inside the buffer; a zero-sized buffer gets no write */
        if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
                if ( len8 < str_mutf8_limit ) {
                        str_mutf8[ len8 ] = 0;
                }
                else {
                        str_mutf8[ str_mutf8_limit-1 ] = 0;
                }
        }

        return len8;
}
196
197

posted on 2014-04-13 19:42 coreBugZJ 阅读(950) 评论(0)  编辑 收藏 引用 所属分类: 技术视野


只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   博问   Chat2DB   管理