00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Surrogate classification for UTF-16.  The argument is the high-order
 * byte of a 16-bit code unit (the second byte of the unit in this
 * little-endian encoding).
 *
 *   FIRST  : 0xd8..0xdb  (high / leading surrogate)
 *   SECOND : 0xdc..0xdf  (low / trailing surrogate)
 *   any    : 0xd8..0xdf
 */
#define UTF16_IS_SURROGATE_FIRST(c)    (0xd8 == ((c) & 0xfc))
#define UTF16_IS_SURROGATE_SECOND(c)   (0xdc == ((c) & 0xfc))
#define UTF16_IS_SURROGATE(c)          (0xd8 == ((c) & 0xf8))
00035
/*
 * Byte length of a UTF-16 character, looked up by the high-order byte of
 * its first 16-bit code unit.  Every entry is 2 except 0xd8..0xdb (the
 * high-surrogate range), which start a 4-byte surrogate pair.
 */
#define UTF16_LEN_ROW_ALL2  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
static const int EncLen_UTF16[] = {
  UTF16_LEN_ROW_ALL2,                              /* 0x00 - 0x0f */
  UTF16_LEN_ROW_ALL2,                              /* 0x10 - 0x1f */
  UTF16_LEN_ROW_ALL2,                              /* 0x20 - 0x2f */
  UTF16_LEN_ROW_ALL2,                              /* 0x30 - 0x3f */
  UTF16_LEN_ROW_ALL2,                              /* 0x40 - 0x4f */
  UTF16_LEN_ROW_ALL2,                              /* 0x50 - 0x5f */
  UTF16_LEN_ROW_ALL2,                              /* 0x60 - 0x6f */
  UTF16_LEN_ROW_ALL2,                              /* 0x70 - 0x7f */
  UTF16_LEN_ROW_ALL2,                              /* 0x80 - 0x8f */
  UTF16_LEN_ROW_ALL2,                              /* 0x90 - 0x9f */
  UTF16_LEN_ROW_ALL2,                              /* 0xa0 - 0xaf */
  UTF16_LEN_ROW_ALL2,                              /* 0xb0 - 0xbf */
  UTF16_LEN_ROW_ALL2,                              /* 0xc0 - 0xcf */
  2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,  /* 0xd0 - 0xdf */
  UTF16_LEN_ROW_ALL2,                              /* 0xe0 - 0xef */
  UTF16_LEN_ROW_ALL2                               /* 0xf0 - 0xff */
};
#undef UTF16_LEN_ROW_ALL2
00054
00055 static int
00056 utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
00057 OnigEncoding enc ARG_UNUSED)
00058 {
00059 int len = (int)(e - p);
00060 UChar byte;
00061 if (len < 2)
00062 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00063 byte = p[1];
00064 if (!UTF16_IS_SURROGATE(byte)) {
00065 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00066 }
00067 if (UTF16_IS_SURROGATE_FIRST(byte)) {
00068 if (len < 4)
00069 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
00070 if (UTF16_IS_SURROGATE_SECOND(p[3]))
00071 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00072 }
00073 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00074 }
00075
00076 static int
00077 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
00078 OnigEncoding enc ARG_UNUSED)
00079 {
00080 if (p + 1 < end) {
00081 if (*p == 0x0a && *(p+1) == 0x00)
00082 return 1;
00083 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00084 if ((
00085 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00086 *p == 0x0d ||
00087 #endif
00088 *p == 0x85) && *(p+1) == 0x00)
00089 return 1;
00090 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
00091 return 1;
00092 #endif
00093 }
00094 return 0;
00095 }
00096
00097 static OnigCodePoint
00098 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00099 OnigEncoding enc ARG_UNUSED)
00100 {
00101 OnigCodePoint code;
00102 UChar c0 = *p;
00103 UChar c1 = *(p+1);
00104
00105 if (UTF16_IS_SURROGATE_FIRST(c1)) {
00106 code = ((((c1 << 8) + c0) & 0x03ff) << 10)
00107 + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
00108 }
00109 else {
00110 code = c1 * 256 + p[0];
00111 }
00112 return code;
00113 }
00114
00115 static int
00116 utf16le_code_to_mbclen(OnigCodePoint code,
00117 OnigEncoding enc ARG_UNUSED)
00118 {
00119 return (code > 0xffff ? 4 : 2);
00120 }
00121
00122 static int
00123 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf,
00124 OnigEncoding enc ARG_UNUSED)
00125 {
00126 UChar* p = buf;
00127
00128 if (code > 0xffff) {
00129 unsigned int high = (code >> 10) + 0xD7C0;
00130 unsigned int low = (code & 0x3FF) + 0xDC00;
00131 *p++ = high & 0xFF;
00132 *p++ = (high >> 8) & 0xFF;
00133 *p++ = low & 0xFF;
00134 *p++ = (low >> 8) & 0xFF;
00135 return 4;
00136 }
00137 else {
00138 *p++ = (UChar )(code & 0xff);
00139 *p++ = (UChar )((code & 0xff00) >> 8);
00140 return 2;
00141 }
00142 }
00143
00144 static int
00145 utf16le_mbc_case_fold(OnigCaseFoldType flag,
00146 const UChar** pp, const UChar* end, UChar* fold,
00147 OnigEncoding enc)
00148 {
00149 const UChar* p = *pp;
00150
00151 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
00152 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00153 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00154 if (*p == 0x49) {
00155 *fold++ = 0x31;
00156 *fold = 0x01;
00157 (*pp) += 2;
00158 return 2;
00159 }
00160 }
00161 #endif
00162
00163 *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00164 *fold = 0;
00165 *pp += 2;
00166 return 2;
00167 }
00168 else
00169 return onigenc_unicode_mbc_case_fold(enc, flag, pp,
00170 end, fold);
00171 }
00172
#if 0
/*
 * Disabled code, kept for reference only.
 *
 * Advances *pp by the EncLen_UTF16 length of the character at *pp, then
 * reports TRUE/FALSE for units whose high byte is 0 based on their
 * ISO-8859-1 upper/lower ctype bits; units with a non-zero high byte
 * yield FALSE.
 *
 * NOTE(review): the test `(bits | BIT_CTYPE_LOWER) != 0` is true whenever
 * BIT_CTYPE_LOWER is non-zero, which would make the final return
 * unreachable.  `&` may have been intended -- confirm against the ctype
 * macro's return value before re-enabling this function.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* cur = *pp;
  int ch, bits;

  (*pp) += EncLen_UTF16[cur[1]];

  if (cur[1] != 0)
    return FALSE;

  if (*cur == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
    return TRUE;

  ch = *cur;
  bits = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(ch,
                   (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

  if ((bits | BIT_CTYPE_LOWER) != 0)
    return (ch >= 0xaa && ch <= 0xba) ? FALSE : TRUE;

  return (bits != 0 ? TRUE : FALSE);
}
#endif
00205
00206 static UChar*
00207 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00208 OnigEncoding enc ARG_UNUSED)
00209 {
00210 if (s <= start) return (UChar* )s;
00211
00212 if ((s - start) % 2 == 1) {
00213 s--;
00214 }
00215
00216 if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
00217 s -= 2;
00218
00219 return (UChar* )s;
00220 }
00221
00222 static int
00223 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00224 const OnigUChar* p, const OnigUChar* end,
00225 OnigCaseFoldCodeItem items[],
00226 OnigEncoding enc)
00227 {
00228 return onigenc_unicode_get_case_fold_codes_by_str(enc,
00229 flag, p, end, items);
00230 }
00231
00232 OnigEncodingDefine(utf_16le, UTF_16LE) = {
00233 utf16le_mbc_enc_len,
00234 "UTF-16LE",
00235 4,
00236 2,
00237 utf16le_is_mbc_newline,
00238 utf16le_mbc_to_code,
00239 utf16le_code_to_mbclen,
00240 utf16le_code_to_mbc,
00241 utf16le_mbc_case_fold,
00242 onigenc_unicode_apply_all_case_fold,
00243 utf16le_get_case_fold_codes_by_str,
00244 onigenc_unicode_property_name_to_ctype,
00245 onigenc_unicode_is_code_ctype,
00246 onigenc_utf16_32_get_ctype_code_range,
00247 utf16le_left_adjust_char_head,
00248 onigenc_always_false_is_allowed_reverse_match
00249 };
00250