00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Per-first-byte length table for plain Big5 (256 entries, indexed by the
 * first byte of a character).  Entries 0xA1-0xFE hold 2; every other entry
 * holds 1.
 */
static const int EncLen_BIG5[] = {
  /*       x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Ax */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
/*
 * Per-first-byte length table for Big5-HKSCS (256 entries, indexed by the
 * first byte of a character).  Entries 0x87-0xFE hold 2; every other entry
 * holds 1.
 */
static const int EncLen_BIG5_HKSCS[] = {
  /*       x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8x */ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
};
00069
/*
 * Per-first-byte length table for Big5-UAO (256 entries, indexed by the
 * first byte of a character).  Entries 0x81-0xFE hold 2; every other entry
 * holds 1.
 */
static const int EncLen_BIG5_UAO[] = {
  /*       x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8x */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
};
00089
/*
 * States of the byte-sequence recognizer.  The two negative values are
 * terminal results; the non-negative values are row indices into trans[].
 */
typedef enum {
  FAILURE = -2,   /* byte is not acceptable in the current state */
  ACCEPT  = -1,   /* byte completes a character */
  S0      = 0,    /* row 0 of trans[] */
  S1              /* row 1 of trans[]: the second-byte row */
} state_t;

/* Short spellings used only inside the table below; removed again after it. */
#define A ACCEPT
#define F FAILURE

/*
 * Transition table: trans[row][byte] gives the next state.
 *   row 0 - first byte, plain Big5   (0xA1-0xFE continue to row 1)
 *   row 1 - second byte              (0x40-0x7E and 0xA1-0xFE accept)
 *   row 2 - first byte, alternative  (0x87-0xFE continue to row 1)
 * In rows 0 and 2 the bytes 0x00-0x7F accept immediately.
 */
static const signed char trans[][0x100] = {
  { /* row 0 x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    /* 0x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* Ax */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Ex */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Fx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* row 1 x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    /* 0x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 8x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9x */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* Ax */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* Bx */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* Cx */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* Dx */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* Ex */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* Fx */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* row 2 x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    /* 0x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7x */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8x */ F, F, F, F, F, F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Ex */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* Fx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  }
};
#undef A
#undef F
00151
00152 static int
00153 big5_mbc_enc_len0(const UChar* p, const UChar* e, int tridx, const int tbl[])
00154 {
00155 int firstbyte = *p++;
00156 state_t s = trans[tridx][firstbyte];
00157 #define RETURN(n) \
00158 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
00159 ONIGENC_CONSTRUCT_MBCLEN_INVALID()
00160 if (s < 0) RETURN(1);
00161 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(tbl[firstbyte]-1);
00162 s = trans[s][*p++];
00163 RETURN(2);
00164 #undef RETURN
00165 }
00166
00167 static int
00168 big5_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00169 {
00170 return big5_mbc_enc_len0(p, e, 0, EncLen_BIG5);
00171 }
00172
00173 static int
00174 big5_hkscs_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00175 {
00176 return big5_mbc_enc_len0(p, e, 2, EncLen_BIG5_HKSCS);
00177 }
00178
00179 static int
00180 big5_uao_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00181 {
00182 return big5_mbc_enc_len0(p, e, 2, EncLen_BIG5_UAO);
00183 }
00184
00185 static OnigCodePoint
00186 big5_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00187 {
00188 return onigenc_mbn_mbc_to_code(enc, p, end);
00189 }
00190
00191 static int
00192 big5_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00193 {
00194 return onigenc_mb2_code_to_mbc(enc, code, buf);
00195 }
00196
00197 static int
00198 big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
00199 UChar* lower, OnigEncoding enc)
00200 {
00201 return onigenc_mbn_mbc_case_fold(enc, flag,
00202 pp, end, lower);
00203 }
00204
/*
 * Disabled code: everything between #if 0 and #endif is compiled out.
 * The function would forward to onigenc_mbn_is_mbc_ambiguous.
 */
#if 0
static int
big5_is_mbc_ambiguous(OnigCaseFoldType flag,
const UChar** pp, const UChar* end, OnigEncoding enc)
{
return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
}
#endif
00213
00214 static int
00215 big5_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00216 {
00217 return onigenc_mb2_is_code_ctype(enc, code, ctype);
00218 }
00219
/*
 * Non-zero for every byte value that can be the second (trail) byte of a
 * two-byte character: 0x40-0x7E and 0xA1-0xFE.
 *
 * This mirrors the second-byte row of the trans[] state table, which
 * accepts exactly those ranges.  Bytes 0x80-0xA0 are never accepted as a
 * second byte there, so they are 0 here as well; flagging them as possible
 * trail bytes only caused needless backward scans in
 * big5_left_adjust_char_head and needlessly refused reverse matches in
 * big5_is_allowed_reverse_match.
 */
static const char BIG5_CAN_BE_TRAIL_TABLE[256] = {
  /*       x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  /* 0x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 1x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 2x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 3x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 8x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 9x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* Ax */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Ex */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Fx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
00238
/*
 * Variant predicates: an encoding object is recognized by the function
 * stored in its precise_mbc_enc_len slot.
 */
#define BIG5_HKSCS_P(enc) ((enc)->precise_mbc_enc_len == big5_hkscs_mbc_enc_len)
#define BIG5_UAO_P(enc) ((enc)->precise_mbc_enc_len == big5_uao_mbc_enc_len)

/*
 * Non-zero when `byte` is the first byte of a two-byte character in the
 * current encoding.  The macro reads a variable named `enc` from the scope
 * where it is expanded (it is not a macro parameter).
 *
 * Each variant is checked against its own length table.  Big5-UAO
 * previously fell through to the plain Big5 table, so its first bytes
 * 0x81-0xA0 (two-byte in EncLen_BIG5_UAO) were not recognized as leads.
 * Only one arm of the conditional is evaluated, so `byte` is evaluated
 * exactly once.
 */
#define BIG5_ISMB_FIRST(byte) ( \
    BIG5_HKSCS_P(enc) ? EncLen_BIG5_HKSCS[(byte)] > 1 : \
    BIG5_UAO_P(enc)   ? EncLen_BIG5_UAO[(byte)]   > 1 : \
                        EncLen_BIG5[(byte)]       > 1 \
    )
/* Non-zero when `byte` is flagged in BIG5_CAN_BE_TRAIL_TABLE. */
#define BIG5_ISMB_TRAIL(byte) BIG5_CAN_BE_TRAIL_TABLE[(byte)]
00247
00248 static UChar*
00249 big5_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00250 {
00251 const UChar *p;
00252 int len;
00253
00254 if (s <= start) return (UChar* )s;
00255 p = s;
00256
00257 if (BIG5_ISMB_TRAIL(*p)) {
00258 while (p > start) {
00259 if (! BIG5_ISMB_FIRST(*--p)) {
00260 p++;
00261 break;
00262 }
00263 }
00264 }
00265 len = enclen(enc, p, end);
00266 if (p + len > s) return (UChar* )p;
00267 p += len;
00268 return (UChar* )(p + ((s - p) & ~1));
00269 }
00270
00271 static int
00272 big5_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
00273 {
00274 const UChar c = *s;
00275
00276 return (BIG5_ISMB_TRAIL(c) ? FALSE : TRUE);
00277 }
00278
00279
00280
00281
00282
00283
00284
00285
/*
 * Encoding object for plain Big5, filled in positionally.
 * NOTE(review): slot meanings below are inferred from the values; confirm
 * against the encoding structure declared in regenc.h.
 */
OnigEncodingDefine(big5, BIG5) = {
big5_mbc_enc_len, /* length/validity callback (the precise_mbc_enc_len slot tested by BIG5_HKSCS_P) */
"Big5", /* encoding name */
2, /* presumably the maximum bytes per character - confirm */
1, /* presumably the minimum bytes per character - confirm */
onigenc_is_mbc_newline_0x0a,
big5_mbc_to_code,
onigenc_mb2_code_to_mbclen,
big5_code_to_mbc,
big5_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
big5_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
big5_left_adjust_char_head,
big5_is_allowed_reverse_match
};
00304
00305
00306
00307
00308
/* NOTE(review): presumably declares "CP950" as an encoding copied from "Big5" - confirm against the ENC_REPLICATE definition. */
ENC_REPLICATE("CP950", "Big5")
00310
00311
00312
00313
00314
00315
00316
00317
/*
 * Encoding object for Big5-HKSCS, filled in positionally.  It differs from
 * the plain Big5 object only in the first two entries.
 * NOTE(review): slot meanings below are inferred from the values; confirm
 * against the encoding structure declared in regenc.h.
 */
OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = {
big5_hkscs_mbc_enc_len, /* length/validity callback (the precise_mbc_enc_len slot tested by BIG5_HKSCS_P) */
"Big5-HKSCS", /* encoding name */
2, /* presumably the maximum bytes per character - confirm */
1, /* presumably the minimum bytes per character - confirm */
onigenc_is_mbc_newline_0x0a,
big5_mbc_to_code,
onigenc_mb2_code_to_mbclen,
big5_code_to_mbc,
big5_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
big5_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
big5_left_adjust_char_head,
big5_is_allowed_reverse_match
};
/* NOTE(review): presumably makes "Big5-HKSCS:2008" another name for "Big5-HKSCS" - confirm against the ENC_ALIAS definition. */
ENC_ALIAS("Big5-HKSCS:2008", "Big5-HKSCS")
00337
00338
00339
00340
00341
00342
00343
/* NOTE(review): presumably declares "CP951" as an encoding copied from "Big5-HKSCS" - confirm against the ENC_REPLICATE definition. */
ENC_REPLICATE("CP951", "Big5-HKSCS")
00345
00346
00347
00348
00349
/*
 * Encoding object for Big5-UAO, filled in positionally.  It differs from
 * the plain Big5 object only in the first two entries.
 * NOTE(review): slot meanings below are inferred from the values; confirm
 * against the encoding structure declared in regenc.h.
 */
OnigEncodingDefine(big5_uao, BIG5_UAO) = {
big5_uao_mbc_enc_len, /* length/validity callback (the precise_mbc_enc_len slot tested by BIG5_UAO_P) */
"Big5-UAO", /* encoding name */
2, /* presumably the maximum bytes per character - confirm */
1, /* presumably the minimum bytes per character - confirm */
onigenc_is_mbc_newline_0x0a,
big5_mbc_to_code,
onigenc_mb2_code_to_mbclen,
big5_code_to_mbc,
big5_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
big5_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
big5_left_adjust_char_head,
big5_is_allowed_reverse_match
};
00368