00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regint.h"
00031
00032
/*
 * Non-zero when byte `c` lies OUTSIDE the range 0xa1..0xfe.
 * The subtraction is truncated to UChar so that values below 0xa1 wrap
 * around to large numbers; a single unsigned comparison then covers both
 * ends of the range.  `c` is evaluated exactly once.
 */
#define eucjp_islead(c)    (((UChar )((c) - 0xa1)) > (0xfe - 0xa1))
00034
/*
 * Byte length of a character, indexed by its first byte.
 * One row per high nibble (16 entries each):
 *   0x8e        -> 2
 *   0x8f        -> 3
 *   0xa1..0xfe  -> 2
 *   all others  -> 1
 * mbc_enc_len() uses this table to compute how many bytes are still
 * missing when the input ends in the middle of a character.
 */
static const int EncLen_EUCJP[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00053
/*
 * States of the byte-sequence validator.  Negative values are terminal
 * verdicts; non-negative values are row indexes into the transition table.
 */
typedef enum {
  FAILURE = -2,   /* byte sequence is not a valid character */
  ACCEPT  = -1,   /* byte sequence forms a complete character */
  S0      = 0,    /* start state: looking at the first byte */
  S1,             /* row 1 of the transition table */
  S2              /* row 2 of the transition table */
} state_t;
/*
 * State transition table of the validator, indexed as trans[state][byte].
 * Each entry is either a terminal verdict (A = ACCEPT, F = FAILURE) or the
 * number of the row to consult for the next byte.
 *
 *   row 0 (first byte):  0x00..0x7f -> A, 0x8e -> row 1, 0x8f -> row 2,
 *                        0xa1..0xfe -> row 1, anything else -> F
 *   row 1:               0xa1..0xfe -> A, anything else -> F
 *   row 2:               0xa1..0xfe -> row 1, anything else -> F
 *
 * A and F are short local aliases used only to keep the table readable;
 * they are undefined again right after it.
 */
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* row 0: first byte of a character */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* row 1: last byte of a character */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x50 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x60 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x70 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* row 2: middle byte of a three-byte character */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x50 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x60 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x70 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },

};
#undef A
#undef F
00116
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120 int firstbyte = *p++;
00121 state_t s;
00122 s = trans[0][firstbyte];
00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
00126 s = trans[s][*p++];
00127 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
00130 s = trans[s][*p++];
00131 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00132 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00133 }
00134
00135 static OnigCodePoint
00136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00137 {
00138 int c, i, len;
00139 OnigCodePoint n;
00140
00141 len = enclen(enc, p, end);
00142 n = (OnigCodePoint )*p++;
00143 if (len == 1) return n;
00144
00145 for (i = 1; i < len; i++) {
00146 if (p >= end) break;
00147 c = *p++;
00148 n <<= 8; n += c;
00149 }
00150 return n;
00151 }
00152
00153 static int
00154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00155 {
00156 if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00157 else if (code > 0xffffff)
00158 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00159 else if (code & 0x800000) return 3;
00160 else if (code & 0x8000) return 2;
00161 else
00162 return ONIGERR_INVALID_CODE_POINT_VALUE;
00163 }
00164
#if 0
/*
 * Compiled out.  Returns the first byte `code` would be encoded with:
 * bits 16..23 when any of them is set, else bits 8..15 when any of them is
 * set, else `code` itself.
 */
static int
code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0) return (int )((code >> 16) & 0xff);
  if ((code & 0xff00) != 0) return (int )((code >> 8) & 0xff);
  return (int )code;
}
#endif
00183
00184 static int
00185 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00186 {
00187 UChar *p = buf;
00188
00189 if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00190 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
00191 *p++ = (UChar )(code & 0xff);
00192
00193 #if 1
00194 if (enclen(enc, buf, p) != (p - buf))
00195 return ONIGERR_INVALID_CODE_POINT_VALUE;
00196 #endif
00197 return (int)(p - buf);
00198 }
00199
00200 static int
00201 mbc_case_fold(OnigCaseFoldType flag,
00202 const UChar** pp, const UChar* end, UChar* lower,
00203 OnigEncoding enc)
00204 {
00205 int len;
00206 const UChar* p = *pp;
00207
00208 if (ONIGENC_IS_MBC_ASCII(p)) {
00209 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00210 (*pp)++;
00211 return 1;
00212 }
00213 else {
00214 int i;
00215
00216 len = enclen(enc, p, end);
00217 for (i = 0; i < len; i++) {
00218 *lower++ = *p++;
00219 }
00220 (*pp) += len;
00221 return len;
00222 }
00223 }
00224
00225 static UChar*
00226 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00227 {
00228
00229
00230
00231 const UChar *p;
00232 int len;
00233
00234 if (s <= start) return (UChar* )s;
00235 p = s;
00236
00237 while (!eucjp_islead(*p) && p > start) p--;
00238 len = enclen(enc, p, end);
00239 if (p + len > s) return (UChar* )p;
00240 p += len;
00241 return (UChar* )(p + ((s - p) & ~1));
00242 }
00243
00244 static int
00245 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00246 {
00247 const UChar c = *s;
00248 if (c <= 0x7e || c == 0x8e || c == 0x8f)
00249 return TRUE;
00250 else
00251 return FALSE;
00252 }
00253
00254
/*
 * File-local state of the property registry.
 * PropertyInited is set to 1 at the end of init_property_list();
 * PropertyList is indexed by (ctype - (ONIGENC_MAX_STD_CTYPE + 1)) and
 * bounded by PropertyListNum in is_code_ctype() / get_ctype_code_range();
 * PropertyNameTable is the table searched by property_name_to_ctype().
 * NOTE(review): none of these is assigned directly in this file apart from
 * PropertyInited - presumably the PROPERTY_LIST_* macros maintain them,
 * including PropertyListSize; confirm against their definitions.
 */
static int PropertyInited = 0;
static const OnigCodePoint** PropertyList;
static int PropertyListNum;
static int PropertyListSize;
static hash_table_type* PropertyNameTable;
00260
/*
 * Code range list registered under the property name "hiragana".
 * Layout: a leading count (1) followed by that many pairs of values.
 * NOTE(review): the pairs are presumably inclusive (from, to) bounds as
 * consumed by onig_is_in_code_range() - confirm.
 */
static const OnigCodePoint CR_Hiragana[] = {
  1,
  0xa4a1, 0xa4f3
}; /* CR_Hiragana */
00265
/*
 * Code range list registered under the property name "katakana".
 * Layout: a leading count (3) followed by that many pairs of values.
 * NOTE(review): the pairs are presumably inclusive (from, to) bounds as
 * consumed by onig_is_in_code_range() - confirm.
 */
static const OnigCodePoint CR_Katakana[] = {
  3,
  0xa5a1, 0xa5f6,
  0xaaa6, 0xaaaf,
  0xaab1, 0xaadd
}; /* CR_Katakana */
00272
/*
 * Registers the "hiragana" and "katakana" properties and then marks the
 * property list as initialised.
 *
 * Returns `r`.
 * NOTE(review): `r` is never assigned in the visible code and the `end`
 * label has no visible jump to it; PROPERTY_LIST_ADD_PROP (defined
 * elsewhere) presumably stores its status in `r` and does `goto end` on
 * failure, skipping the PropertyInited assignment - confirm against the
 * macro definition.
 */
static int
init_property_list(void)
{
  int r;

  PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
  PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
  PropertyInited = 1;

 end:
  return r;
}
00285
00286 static int
00287 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00288 {
00289 st_data_t ctype;
00290 UChar *s, *e;
00291
00292 PROPERTY_LIST_INIT_CHECK;
00293
00294 s = e = ALLOCA_N(UChar, end-p+1);
00295 for (; p < end; p++) {
00296 *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00297 }
00298
00299 if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
00300 return onigenc_minimum_property_name_to_ctype(enc, s, e);
00301 }
00302
00303 return (int)ctype;
00304 }
00305
00306 static int
00307 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00308 {
00309 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00310 if (code < 128)
00311 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00312 else {
00313 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00314 return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
00315 }
00316 }
00317 }
00318 else {
00319 PROPERTY_LIST_INIT_CHECK;
00320
00321 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00322 if (ctype >= (unsigned int )PropertyListNum)
00323 return ONIGERR_TYPE_BUG;
00324
00325 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00326 }
00327
00328 return FALSE;
00329 }
00330
00331 static int
00332 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00333 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00334 {
00335 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00336 return ONIG_NO_SUPPORT_CONFIG;
00337 }
00338 else {
00339 *sb_out = 0x80;
00340
00341 PROPERTY_LIST_INIT_CHECK;
00342
00343 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00344 if (ctype >= (OnigCtype )PropertyListNum)
00345 return ONIGERR_TYPE_BUG;
00346
00347 *ranges = PropertyList[ctype];
00348 return 0;
00349 }
00350 }
00351
00352
/*
 * Encoding descriptor for "EUC-JP": wires this file's static callbacks and
 * the shared onigenc_* helpers into the encoding structure.
 * NOTE(review): the initializer is positional and the structure layout is
 * declared elsewhere; the literals 3 and 1 are presumably the maximum and
 * minimum bytes per character, and the trailing 0 a flags/reserved field -
 * confirm against the structure definition.
 */
OnigEncodingDefine(euc_jp, EUC_JP) = {
  mbc_enc_len,
  "EUC-JP",   /* encoding name */
  3,          /* presumably max bytes per character - confirm */
  1,          /* presumably min bytes per character - confirm */
  onigenc_is_mbc_newline_0x0a,
  mbc_to_code,
  code_to_mbclen,
  code_to_mbc,
  mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  property_name_to_ctype,
  is_code_ctype,
  get_ctype_code_range,
  left_adjust_char_head,
  is_allowed_reverse_match,
  0
};
00372
00373
00374
00375
00376
00377
00378
/*
 * Declares "eucJP" as an alias of "EUC-JP".
 * NOTE(review): ENC_ALIAS is defined outside this file; how the
 * declaration is consumed is not visible here.
 */
ENC_ALIAS("eucJP", "EUC-JP")
00380
00381
00382
00383
00384
00385
00386
/*
 * Declares "eucJP-ms" as a replica of "EUC-JP", with "euc-jp-ms" as an
 * alias of that replica.
 * NOTE(review): ENC_REPLICATE / ENC_ALIAS are defined outside this file;
 * presumably a replica is a distinct encoding sharing this implementation -
 * confirm.
 */
ENC_REPLICATE("eucJP-ms", "EUC-JP")
ENC_ALIAS("euc-jp-ms", "eucJP-ms")
00389
00390
00391
00392
00393
00394
00395
00396
00397
/*
 * Declares "CP51932" as a replica of "EUC-JP".
 * NOTE(review): ENC_REPLICATE is defined outside this file; presumably a
 * replica is a distinct encoding sharing this implementation - confirm.
 */
ENC_REPLICATE("CP51932", "EUC-JP")
00399