00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regint.h"
00031
/*
 * EncLen_SJIS: character length in bytes, indexed by the value of the
 * first byte.  Entries 0x81-0x9f and 0xe0-0xfc are 2 (a second byte
 * follows); every other entry is 1.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
00050
/*
 * SJIS_CAN_BE_TRAIL_TABLE: 1 when the byte value used as index may be the
 * second byte of a two-byte character, 0 otherwise.  The 1 entries cover
 * 0x40-0x7e and 0x80-0xfc.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00069
/* True when `byte` starts a two-byte character (its EncLen_SJIS entry is > 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Table lookup: nonzero when `byte` may be the second byte of a character. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
00072
/*
 * Byte-validation state machine used by mbc_enc_len().
 *
 *   S0       start state; its row is indexed by the first byte
 *   S1       a second byte is expected; its row is indexed by that byte
 *   ACCEPT   the character is complete
 *   FAILURE  the byte is not allowed in this position
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;

/* Short aliases that keep the table rows readable; removed again below. */
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0: first byte */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1,
    /* 0x90 */ S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1,
    /* 0xa0 */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1,
    /* 0xf0 */ S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, F, F, F
  },
  { /* S1: second byte */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x80 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x90 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xa0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
#undef A
#undef F
00116
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120 int firstbyte = *p++;
00121 state_t s;
00122 s = trans[0][firstbyte];
00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
00126 s = trans[s][*p++];
00127 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 }
00130
00131 static int
00132 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00133 {
00134 if (code < 256) {
00135 if (EncLen_SJIS[(int )code] == 1)
00136 return 1;
00137 else
00138 return ONIGERR_INVALID_CODE_POINT_VALUE;
00139 }
00140 else if (code <= 0xffff) {
00141 return 2;
00142 }
00143 else
00144 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00145 }
00146
00147 static OnigCodePoint
00148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00149 {
00150 int c, i, len;
00151 OnigCodePoint n;
00152
00153 len = enclen(enc, p, end);
00154 c = *p++;
00155 n = c;
00156 if (len == 1) return n;
00157
00158 for (i = 1; i < len; i++) {
00159 if (p >= end) break;
00160 c = *p++;
00161 n <<= 8; n += c;
00162 }
00163 return n;
00164 }
00165
00166 static int
00167 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00168 {
00169 UChar *p = buf;
00170
00171 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
00172 *p++ = (UChar )(code & 0xff);
00173
00174 #if 0
00175 if (enclen(enc, buf) != (p - buf))
00176 return REGERR_INVALID_CODE_POINT_VALUE;
00177 #endif
00178 return (int)(p - buf);
00179 }
00180
00181 static int
00182 mbc_case_fold(OnigCaseFoldType flag,
00183 const UChar** pp, const UChar* end, UChar* lower,
00184 OnigEncoding enc)
00185 {
00186 const UChar* p = *pp;
00187
00188 if (ONIGENC_IS_MBC_ASCII(p)) {
00189 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00190 (*pp)++;
00191 return 1;
00192 }
00193 else {
00194 int i;
00195 int len = enclen(enc, p, end);
00196
00197 for (i = 0; i < len; i++) {
00198 *lower++ = *p++;
00199 }
00200 (*pp) += len;
00201 return len;
00202 }
00203 }
00204
/*
 * Disabled code (never compiled): is_mbc_ambiguous() would simply forward
 * to onigenc_mbn_is_mbc_ambiguous().
 * NOTE(review): the call passes `enc`, but this function declares no such
 * parameter, so the block would need fixing before it could be re-enabled.
 */
00205 #if 0
00206 static int
00207 is_mbc_ambiguous(OnigCaseFoldType flag,
00208 const UChar** pp, const UChar* end)
00209 {
00210 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
00211
00212 }
00213 #endif
00214
/*
 * Disabled code (never compiled): a two-argument is_code_ctype() that
 * handles only the standard ctypes.  ASCII codes use the ASCII ctype
 * test; other codes count as word/graph/print when code_to_mbclen()
 * reports more than one byte.
 * NOTE(review): a function of the same name with three parameters is
 * defined further down in this file, and code_to_mbclen() is called here
 * with one argument although its definition in this file takes two.
 */
00215 #if 0
00216 static int
00217 is_code_ctype(OnigCodePoint code, unsigned int ctype)
00218 {
00219 if (code < 128)
00220 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00221 else {
00222 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00223 return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
00224 }
00225 }
00226
00227 return FALSE;
00228 }
00229 #endif
00230
00231 static UChar*
00232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00233 {
00234 const UChar *p;
00235 int len;
00236
00237 if (s <= start) return (UChar* )s;
00238 p = s;
00239
00240 if (SJIS_ISMB_TRAIL(*p)) {
00241 while (p > start) {
00242 if (! SJIS_ISMB_FIRST(*--p)) {
00243 p++;
00244 break;
00245 }
00246 }
00247 }
00248 len = enclen(enc, p, end);
00249 if (p + len > s) return (UChar* )p;
00250 p += len;
00251 return (UChar* )(p + ((s - p) & ~1));
00252 }
00253
00254 static int
00255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00256 {
00257 const UChar c = *s;
00258 return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
00259 }
00260
00261
/*
 * File-local state for the named character properties.
 * NOTE(review): nothing in this file writes these directly except
 * PropertyInited; they are presumably filled in by the
 * PROPERTY_LIST_ADD_PROP / PROPERTY_LIST_INIT_CHECK macros -- confirm
 * against their definitions.
 */
/* Set to 1 by init_property_list() once both properties were added. */
00262 static int PropertyInited = 0;
/* Code-range tables, indexed by (ctype - (ONIGENC_MAX_STD_CTYPE + 1)). */
00263 static const OnigCodePoint** PropertyList;
/* Upper bound used when validating an index into PropertyList. */
00264 static int PropertyListNum;
/* Not referenced directly in this file. */
00265 static int PropertyListSize;
/* Looked up by lower-cased property name in property_name_to_ctype(). */
00266 static hash_table_type* PropertyNameTable;
00267
/*
 * Code range table registered under the property name "hiragana".
 * Layout as it appears here: a count of ranges followed by that many
 * (first, last) pairs.
 */
00268 static const OnigCodePoint CR_Hiragana[] = {
00269 1,
00270 0x829f, 0x82f1
00271 };
00272
/*
 * Code range table registered under the property name "katakana".
 * Layout as it appears here: a count of ranges followed by that many
 * (first, last) pairs.  The first two ranges are one-byte codes, the
 * last two are two-byte codes; 0x00b0 and 0x837f are not covered.
 */
00273 static const OnigCodePoint CR_Katakana[] = {
00274 4,
00275 0x00a6, 0x00af,
00276 0x00b1, 0x00dd,
00277 0x8340, 0x837e,
00278 0x8380, 0x8396,
00279 };
00280
/*
 * Register the "hiragana" and "katakana" properties and mark the
 * property list as initialised.  Returns r.
 *
 * NOTE(review): neither the assignment to `r` nor a jump to `end` is
 * visible in this function; presumably PROPERTY_LIST_ADD_PROP stores its
 * status in `r` and does `goto end` on failure -- confirm against the
 * macro definition.  The names `r` and `end` should not be changed
 * without checking it.
 */
00281 static int
00282 init_property_list(void)
00283 {
00284 int r;
00285
00286 PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
00287 PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
/* Reached only when control falls through both registrations. */
00288 PropertyInited = 1;
00289
00290 end:
00291 return r;
00292 }
00293
00294 static int
00295 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00296 {
00297 hash_data_type ctype;
00298 UChar *s, *e;
00299
00300 PROPERTY_LIST_INIT_CHECK;
00301
00302 s = e = ALLOCA_N(UChar, end-p+1);
00303 for (; p < end; p++) {
00304 *e++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00305 }
00306
00307 if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
00308 return onigenc_minimum_property_name_to_ctype(enc, s, e);
00309 }
00310
00311 return (int)ctype;
00312 }
00313
00314 static int
00315 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00316 {
00317 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00318 if (code < 128)
00319 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00320 else {
00321 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00322 return TRUE;
00323 }
00324 }
00325 }
00326 else {
00327 PROPERTY_LIST_INIT_CHECK;
00328
00329 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00330 if (ctype >= (unsigned int )PropertyListNum)
00331 return ONIGERR_TYPE_BUG;
00332
00333 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00334 }
00335
00336 return FALSE;
00337 }
00338
00339 static int
00340 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00341 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00342 {
00343 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00344 return ONIG_NO_SUPPORT_CONFIG;
00345 }
00346 else {
00347 *sb_out = 0x80;
00348
00349 PROPERTY_LIST_INIT_CHECK;
00350
00351 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00352 if (ctype >= (OnigCtype )PropertyListNum)
00353 return ONIGERR_TYPE_BUG;
00354
00355 *ranges = PropertyList[ctype];
00356 return 0;
00357 }
00358 }
00359
/*
 * Encoding descriptor for Shift_JIS.  The initializer is positional; the
 * notes below are inferred from the values themselves.
 * NOTE(review): confirm field meanings against the declaration behind
 * OnigEncodingDefine, which is not visible in this file.
 */
00360 OnigEncodingDefine(shift_jis, Shift_JIS) = {
00361 mbc_enc_len, /* classify/measure the character at a position */
00362 "Shift_JIS", /* encoding name */
00363 2, /* presumably the maximum bytes per character -- confirm */
00364 1, /* presumably the minimum bytes per character -- confirm */
00365 onigenc_is_mbc_newline_0x0a,
00366 mbc_to_code, /* bytes -> code point */
00367 code_to_mbclen, /* code point -> encoded length or error */
00368 code_to_mbc, /* code point -> bytes */
00369 mbc_case_fold, /* ASCII-only lower-casing */
00370 onigenc_ascii_apply_all_case_fold,
00371 onigenc_ascii_get_case_fold_codes_by_str,
00372 property_name_to_ctype, /* property name -> ctype */
00373 is_code_ctype, /* ctype membership test */
00374 get_ctype_code_range, /* code range table for property ctypes */
00375 left_adjust_char_head, /* find the start of the character containing s */
00376 is_allowed_reverse_match,
00377 0 /* NOTE(review): meaning of this last field is not visible here */
00378 };
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
/*
 * Additional encoding names.  ENC_REPLICATE and ENC_ALIAS are defined
 * outside this file; presumably the first declares "Windows-31J" as a
 * separate encoding that reuses the Shift_JIS definition, and the others
 * declare alternative names for it -- confirm against the macro
 * definitions.
 */
00398 ENC_REPLICATE("Windows-31J", "Shift_JIS")
00399 ENC_ALIAS("CP932", "Windows-31J")
00400 ENC_ALIAS("csWindows31J", "Windows-31J")
00401 ENC_ALIAS("SJIS", "Windows-31J")
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
/* Presumably one more name for "Windows-31J" (ENC_ALIAS is defined elsewhere -- confirm). */
00413 ENC_ALIAS("PCK", "Windows-31J")
00414
00415
00416
00417
00418
00419
/*
 * Presumably declares "MacJapanese" as another encoding that reuses the
 * Shift_JIS definition, with "MacJapan" as an alternative name for it
 * (ENC_REPLICATE / ENC_ALIAS are defined elsewhere -- confirm).
 */
00420 ENC_REPLICATE("MacJapanese", "Shift_JIS")
00421 ENC_ALIAS("MacJapan", "MacJapanese")
00422