00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "regparse.h"
00032
/* Buffer size constant; presumably for formatted warning text (the user of
   this constant is outside this view -- TODO confirm). */
#define WARN_BUFSIZE                                   256

/* Compile-time feature switch; the code it guards is outside this view. */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
00036
00037
00038 const OnigSyntaxType OnigSyntaxRuby = {
00039 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
00040 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
00041 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
00042 ONIG_SYN_OP_ESC_C_CONTROL )
00043 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
00044 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
00045 ONIG_SYN_OP2_OPTION_RUBY |
00046 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
00047 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
00048 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
00049 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
00050 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
00051 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
00052 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
00053 ONIG_SYN_OP2_ESC_H_XDIGIT )
00054 , ( SYN_GNU_REGEX_BV |
00055 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
00056 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
00057 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
00058 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
00059 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
00060 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
00061 ONIG_SYN_WARN_CC_DUP |
00062 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
00063 , ONIG_OPTION_NONE
00064 ,
00065 {
00066 (OnigCodePoint )'\\'
00067 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00068 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00069 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00070 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00071 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR
00072 }
00073 };
00074
00075 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
00076
00077 extern void onig_null_warn(const char* s ARG_UNUSED) { }
00078
00079 #ifdef DEFAULT_WARN_FUNCTION
00080 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
00081 #else
00082 static OnigWarnFunc onig_warn = onig_null_warn;
00083 #endif
00084
00085 #ifdef DEFAULT_VERB_WARN_FUNCTION
00086 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
00087 #else
00088 static OnigWarnFunc onig_verb_warn = onig_null_warn;
00089 #endif
00090
00091 extern void onig_set_warn_func(OnigWarnFunc f)
00092 {
00093 onig_warn = f;
00094 }
00095
00096 extern void onig_set_verb_warn_func(OnigWarnFunc f)
00097 {
00098 onig_verb_warn = f;
00099 }
00100
00101 static void CC_DUP_WARN(ScanEnv *env);
00102
00103 static void
00104 bbuf_free(BBuf* bbuf)
00105 {
00106 if (IS_NOT_NULL(bbuf)) {
00107 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
00108 xfree(bbuf);
00109 }
00110 }
00111
00112 static int
00113 bbuf_clone(BBuf** rto, BBuf* from)
00114 {
00115 int r;
00116 BBuf *to;
00117
00118 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
00119 CHECK_NULL_RETURN_MEMERR(to);
00120 r = BBUF_INIT(to, from->alloc);
00121 if (r != 0) return r;
00122 to->used = from->used;
00123 xmemcpy(to->p, from->p, from->used);
00124 return 0;
00125 }
00126
/* Absolute group number for a relative reference:
   (env)->num_mem + 1 + rel_no. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clears the bits f in v when `negative` is non-zero, sets them otherwise.
   The whole expansion is parenthesized so it can be used safely inside a
   larger expression. */
#define ONOFF(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/* 0 when the encoding's minimum character length exceeds one byte,
   0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  ((OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80))

/* Adds the range [MBCODE_START_POS(enc), ~0] to pbuf.
   NOTE: relies on a variable named `env` at the expansion site. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For non-single-byte encodings, adds the full multi-byte range to mbuf.
   NOTE: assigns to a variable named `r` at the expansion site and returns
   it from the enclosing function when it is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)


/* Sets bit `pos`, calling CC_DUP_WARN(env) first when it was already set.
   NOTE: relies on a variable named `env` at the expansion site. */
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
  if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \
  BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)

/* Stores 1 in `empty` when every word of bs is zero, 0 otherwise.
   NOTE: declares a local `i`; do not pass an expression that uses a
   variable of that name. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < (int )BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
00160
00161 static void
00162 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
00163 {
00164 int i;
00165 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
00166 BITSET_SET_BIT_CHKDUP(bs, i);
00167 }
00168 }
00169
00170 #if 0
00171 static void
00172 bitset_set_all(BitSetRef bs)
00173 {
00174 int i;
00175 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
00176 }
00177 #endif
00178
00179 static void
00180 bitset_invert(BitSetRef bs)
00181 {
00182 int i;
00183 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
00184 }
00185
00186 static void
00187 bitset_invert_to(BitSetRef from, BitSetRef to)
00188 {
00189 int i;
00190 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
00191 }
00192
00193 static void
00194 bitset_and(BitSetRef dest, BitSetRef bs)
00195 {
00196 int i;
00197 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
00198 }
00199
00200 static void
00201 bitset_or(BitSetRef dest, BitSetRef bs)
00202 {
00203 int i;
00204 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
00205 }
00206
00207 static void
00208 bitset_copy(BitSetRef dest, BitSetRef bs)
00209 {
00210 int i;
00211 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
00212 }
00213
00214 extern int
00215 onig_strncmp(const UChar* s1, const UChar* s2, int n)
00216 {
00217 int x;
00218
00219 while (n-- > 0) {
00220 x = *s2++ - *s1++;
00221 if (x) return x;
00222 }
00223 return 0;
00224 }
00225
00226 extern void
00227 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
00228 {
00229 ptrdiff_t len = end - src;
00230 if (len > 0) {
00231 xmemcpy(dest, src, len);
00232 dest[len] = (UChar )0;
00233 }
00234 }
00235
00236 #ifdef USE_NAMED_GROUP
00237 static UChar*
00238 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
00239 {
00240 ptrdiff_t slen;
00241 int term_len, i;
00242 UChar *r;
00243
00244 slen = end - s;
00245 term_len = ONIGENC_MBC_MINLEN(enc);
00246
00247 r = (UChar* )xmalloc(slen + term_len);
00248 CHECK_NULL_RETURN(r);
00249 xmemcpy(r, s, slen);
00250
00251 for (i = 0; i < term_len; i++)
00252 r[slen + i] = (UChar )0;
00253
00254 return r;
00255 }
00256 #endif
00257
00258
/*
 * Pattern-scanning helpers.  They all rely on variables named `p`, `end`
 * and `enc` (and, after PFETCH_READY, `pfetch_prev`) at the expansion
 * site.
 */

/* Value PPEEK yields when the input is exhausted. */
#define PEND_VALUE   0

/* Declares the variable that remembers the previous position. */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when p has reached end, 0 otherwise. */
#define PEND         (p < end ?  0 : 1)
/* Steps back to the position remembered by the last PINC/PFETCH. */
#define PUNFETCH     p = pfetch_prev
/* Skips one character, remembering where it started. */
#define PINC       do { \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)
/* Reads one character into c and advances past it. */
#define PFETCH(c)  do { \
  (c) = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* Code point at p without advancing, or PEND_VALUE at end of input. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* Non-zero when the next code point equals c.  The argument is
   parenthesized so that an expression argument is cast as a whole. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
00276
00277 static UChar*
00278 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
00279 size_t capa)
00280 {
00281 UChar* r;
00282
00283 if (dest)
00284 r = (UChar* )xrealloc(dest, capa + 1);
00285 else
00286 r = (UChar* )xmalloc(capa + 1);
00287
00288 CHECK_NULL_RETURN(r);
00289 onig_strcpy(r + (dest_end - dest), src, src_end);
00290 return r;
00291 }
00292
00293
00294 static UChar*
00295 strcat_capa_from_static(UChar* dest, UChar* dest_end,
00296 const UChar* src, const UChar* src_end, size_t capa)
00297 {
00298 UChar* r;
00299
00300 r = (UChar* )xmalloc(capa + 1);
00301 CHECK_NULL_RETURN(r);
00302 onig_strcpy(r, dest, dest_end);
00303 onig_strcpy(r + (dest_end - dest), src, src_end);
00304 return r;
00305 }
00306
00307
00308 #ifdef USE_ST_LIBRARY
00309
00310 #include "ruby/st.h"
00311
00312 typedef struct {
00313 const UChar* s;
00314 const UChar* end;
00315 } st_str_end_key;
00316
00317 static int
00318 str_end_cmp(st_data_t xp, st_data_t yp)
00319 {
00320 const st_str_end_key *x, *y;
00321 const UChar *p, *q;
00322 int c;
00323
00324 x = (const st_str_end_key *)xp;
00325 y = (const st_str_end_key *)yp;
00326 if ((x->end - x->s) != (y->end - y->s))
00327 return 1;
00328
00329 p = x->s;
00330 q = y->s;
00331 while (p < x->end) {
00332 c = (int )*p - (int )*q;
00333 if (c != 0) return c;
00334
00335 p++; q++;
00336 }
00337
00338 return 0;
00339 }
00340
00341 static st_index_t
00342 str_end_hash(st_data_t xp)
00343 {
00344 const st_str_end_key *x = (const st_str_end_key *)xp;
00345 const UChar *p;
00346 st_index_t val = 0;
00347
00348 p = x->s;
00349 while (p < x->end) {
00350 val = val * 997 + (int )*p++;
00351 }
00352
00353 return val + (val >> 5);
00354 }
00355
00356 extern hash_table_type*
00357 onig_st_init_strend_table_with_size(st_index_t size)
00358 {
00359 static const struct st_hash_type hashType = {
00360 str_end_cmp,
00361 str_end_hash,
00362 };
00363
00364 return (hash_table_type* )
00365 onig_st_init_table_with_size(&hashType, size);
00366 }
00367
00368 extern int
00369 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
00370 const UChar* end_key, hash_data_type *value)
00371 {
00372 st_str_end_key key;
00373
00374 key.s = (UChar* )str_key;
00375 key.end = (UChar* )end_key;
00376
00377 return onig_st_lookup(table, (st_data_t )(&key), value);
00378 }
00379
00380 extern int
00381 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
00382 const UChar* end_key, hash_data_type value)
00383 {
00384 st_str_end_key* key;
00385 int result;
00386
00387 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
00388 key->s = (UChar* )str_key;
00389 key->end = (UChar* )end_key;
00390 result = onig_st_insert(table, (st_data_t )key, value);
00391 if (result) {
00392 xfree(key);
00393 }
00394 return result;
00395 }
00396
00397 #endif
00398
00399
00400 #ifdef USE_NAMED_GROUP
00401
00402 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
00403
00404 typedef struct {
00405 UChar* name;
00406 size_t name_len;
00407 int back_num;
00408 int back_alloc;
00409 int back_ref1;
00410 int* back_refs;
00411 } NameEntry;
00412
00413 #ifdef USE_ST_LIBRARY
00414
00415 typedef st_table NameTable;
00416 typedef st_data_t HashDataType;
00417
00418 #define NAMEBUF_SIZE 24
00419 #define NAMEBUF_SIZE_1 25
00420
00421 #ifdef ONIG_DEBUG
00422 static int
00423 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
00424 {
00425 int i;
00426 FILE* fp = (FILE* )arg;
00427
00428 fprintf(fp, "%s: ", e->name);
00429 if (e->back_num == 0)
00430 fputs("-", fp);
00431 else if (e->back_num == 1)
00432 fprintf(fp, "%d", e->back_ref1);
00433 else {
00434 for (i = 0; i < e->back_num; i++) {
00435 if (i > 0) fprintf(fp, ", ");
00436 fprintf(fp, "%d", e->back_refs[i]);
00437 }
00438 }
00439 fputs("\n", fp);
00440 return ST_CONTINUE;
00441 }
00442
00443 extern int
00444 onig_print_names(FILE* fp, regex_t* reg)
00445 {
00446 NameTable* t = (NameTable* )reg->name_table;
00447
00448 if (IS_NOT_NULL(t)) {
00449 fprintf(fp, "name table\n");
00450 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
00451 fputs("\n", fp);
00452 }
00453 return 0;
00454 }
00455 #endif
00456
00457 static int
00458 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
00459 {
00460 xfree(e->name);
00461 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00462 xfree(key);
00463 xfree(e);
00464 return ST_DELETE;
00465 }
00466
00467 static int
00468 names_clear(regex_t* reg)
00469 {
00470 NameTable* t = (NameTable* )reg->name_table;
00471
00472 if (IS_NOT_NULL(t)) {
00473 onig_st_foreach(t, i_free_name_entry, 0);
00474 }
00475 return 0;
00476 }
00477
00478 extern int
00479 onig_names_free(regex_t* reg)
00480 {
00481 int r;
00482 NameTable* t;
00483
00484 r = names_clear(reg);
00485 if (r) return r;
00486
00487 t = (NameTable* )reg->name_table;
00488 if (IS_NOT_NULL(t)) onig_st_free_table(t);
00489 reg->name_table = (void* )NULL;
00490 return 0;
00491 }
00492
00493 static NameEntry*
00494 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
00495 {
00496 NameEntry* e;
00497 NameTable* t = (NameTable* )reg->name_table;
00498
00499 e = (NameEntry* )NULL;
00500 if (IS_NOT_NULL(t)) {
00501 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
00502 }
00503 return e;
00504 }
00505
00506 typedef struct {
00507 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
00508 regex_t* reg;
00509 void* arg;
00510 int ret;
00511 OnigEncoding enc;
00512 } INamesArg;
00513
00514 static int
00515 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
00516 {
00517 int r = (*(arg->func))(e->name,
00518 e->name + e->name_len,
00519 e->back_num,
00520 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00521 arg->reg, arg->arg);
00522 if (r != 0) {
00523 arg->ret = r;
00524 return ST_STOP;
00525 }
00526 return ST_CONTINUE;
00527 }
00528
00529 extern int
00530 onig_foreach_name(regex_t* reg,
00531 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00532 {
00533 INamesArg narg;
00534 NameTable* t = (NameTable* )reg->name_table;
00535
00536 narg.ret = 0;
00537 if (IS_NOT_NULL(t)) {
00538 narg.func = func;
00539 narg.reg = reg;
00540 narg.arg = arg;
00541 narg.enc = reg->enc;
00542 onig_st_foreach(t, i_names, (HashDataType )&narg);
00543 }
00544 return narg.ret;
00545 }
00546
00547 static int
00548 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
00549 {
00550 int i;
00551
00552 if (e->back_num > 1) {
00553 for (i = 0; i < e->back_num; i++) {
00554 e->back_refs[i] = map[e->back_refs[i]].new_val;
00555 }
00556 }
00557 else if (e->back_num == 1) {
00558 e->back_ref1 = map[e->back_ref1].new_val;
00559 }
00560
00561 return ST_CONTINUE;
00562 }
00563
00564 extern int
00565 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
00566 {
00567 NameTable* t = (NameTable* )reg->name_table;
00568
00569 if (IS_NOT_NULL(t)) {
00570 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
00571 }
00572 return 0;
00573 }
00574
00575
00576 extern int
00577 onig_number_of_names(regex_t* reg)
00578 {
00579 NameTable* t = (NameTable* )reg->name_table;
00580
00581 if (IS_NOT_NULL(t))
00582 return (int)t->num_entries;
00583 else
00584 return 0;
00585 }
00586
00587 #else
00588
00589 #define INIT_NAMES_ALLOC_NUM 8
00590
00591 typedef struct {
00592 NameEntry* e;
00593 int num;
00594 int alloc;
00595 } NameTable;
00596
00597 #ifdef ONIG_DEBUG
00598 extern int
00599 onig_print_names(FILE* fp, regex_t* reg)
00600 {
00601 int i, j;
00602 NameEntry* e;
00603 NameTable* t = (NameTable* )reg->name_table;
00604
00605 if (IS_NOT_NULL(t) && t->num > 0) {
00606 fprintf(fp, "name table\n");
00607 for (i = 0; i < t->num; i++) {
00608 e = &(t->e[i]);
00609 fprintf(fp, "%s: ", e->name);
00610 if (e->back_num == 0) {
00611 fputs("-", fp);
00612 }
00613 else if (e->back_num == 1) {
00614 fprintf(fp, "%d", e->back_ref1);
00615 }
00616 else {
00617 for (j = 0; j < e->back_num; j++) {
00618 if (j > 0) fprintf(fp, ", ");
00619 fprintf(fp, "%d", e->back_refs[j]);
00620 }
00621 }
00622 fputs("\n", fp);
00623 }
00624 fputs("\n", fp);
00625 }
00626 return 0;
00627 }
00628 #endif
00629
00630 static int
00631 names_clear(regex_t* reg)
00632 {
00633 int i;
00634 NameEntry* e;
00635 NameTable* t = (NameTable* )reg->name_table;
00636
00637 if (IS_NOT_NULL(t)) {
00638 for (i = 0; i < t->num; i++) {
00639 e = &(t->e[i]);
00640 if (IS_NOT_NULL(e->name)) {
00641 xfree(e->name);
00642 e->name = NULL;
00643 e->name_len = 0;
00644 e->back_num = 0;
00645 e->back_alloc = 0;
00646 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
00647 e->back_refs = (int* )NULL;
00648 }
00649 }
00650 if (IS_NOT_NULL(t->e)) {
00651 xfree(t->e);
00652 t->e = NULL;
00653 }
00654 t->num = 0;
00655 }
00656 return 0;
00657 }
00658
00659 extern int
00660 onig_names_free(regex_t* reg)
00661 {
00662 int r;
00663 NameTable* t;
00664
00665 r = names_clear(reg);
00666 if (r) return r;
00667
00668 t = (NameTable* )reg->name_table;
00669 if (IS_NOT_NULL(t)) xfree(t);
00670 reg->name_table = NULL;
00671 return 0;
00672 }
00673
00674 static NameEntry*
00675 name_find(regex_t* reg, UChar* name, UChar* name_end)
00676 {
00677 int i, len;
00678 NameEntry* e;
00679 NameTable* t = (NameTable* )reg->name_table;
00680
00681 if (IS_NOT_NULL(t)) {
00682 len = name_end - name;
00683 for (i = 0; i < t->num; i++) {
00684 e = &(t->e[i]);
00685 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
00686 return e;
00687 }
00688 }
00689 return (NameEntry* )NULL;
00690 }
00691
00692 extern int
00693 onig_foreach_name(regex_t* reg,
00694 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00695 {
00696 int i, r;
00697 NameEntry* e;
00698 NameTable* t = (NameTable* )reg->name_table;
00699
00700 if (IS_NOT_NULL(t)) {
00701 for (i = 0; i < t->num; i++) {
00702 e = &(t->e[i]);
00703 r = (*func)(e->name, e->name + e->name_len, e->back_num,
00704 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
00705 reg, arg);
00706 if (r != 0) return r;
00707 }
00708 }
00709 return 0;
00710 }
00711
00712 extern int
00713 onig_number_of_names(regex_t* reg)
00714 {
00715 NameTable* t = (NameTable* )reg->name_table;
00716
00717 if (IS_NOT_NULL(t))
00718 return t->num;
00719 else
00720 return 0;
00721 }
00722
00723 #endif
00724
00725 static int
00726 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
00727 {
00728 int alloc;
00729 NameEntry* e;
00730 NameTable* t = (NameTable* )reg->name_table;
00731
00732 if (name_end - name <= 0)
00733 return ONIGERR_EMPTY_GROUP_NAME;
00734
00735 e = name_find(reg, name, name_end);
00736 if (IS_NULL(e)) {
00737 #ifdef USE_ST_LIBRARY
00738 if (IS_NULL(t)) {
00739 t = onig_st_init_strend_table_with_size(5);
00740 reg->name_table = (void* )t;
00741 }
00742 e = (NameEntry* )xmalloc(sizeof(NameEntry));
00743 CHECK_NULL_RETURN_MEMERR(e);
00744
00745 e->name = strdup_with_null(reg->enc, name, name_end);
00746 if (IS_NULL(e->name)) {
00747 xfree(e);
00748 return ONIGERR_MEMORY;
00749 }
00750 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
00751 (HashDataType )e);
00752
00753 e->name_len = name_end - name;
00754 e->back_num = 0;
00755 e->back_alloc = 0;
00756 e->back_refs = (int* )NULL;
00757
00758 #else
00759
00760 if (IS_NULL(t)) {
00761 alloc = INIT_NAMES_ALLOC_NUM;
00762 t = (NameTable* )xmalloc(sizeof(NameTable));
00763 CHECK_NULL_RETURN_MEMERR(t);
00764 t->e = NULL;
00765 t->alloc = 0;
00766 t->num = 0;
00767
00768 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
00769 if (IS_NULL(t->e)) {
00770 xfree(t);
00771 return ONIGERR_MEMORY;
00772 }
00773 t->alloc = alloc;
00774 reg->name_table = t;
00775 goto clear;
00776 }
00777 else if (t->num == t->alloc) {
00778 int i;
00779
00780 alloc = t->alloc * 2;
00781 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
00782 CHECK_NULL_RETURN_MEMERR(t->e);
00783 t->alloc = alloc;
00784
00785 clear:
00786 for (i = t->num; i < t->alloc; i++) {
00787 t->e[i].name = NULL;
00788 t->e[i].name_len = 0;
00789 t->e[i].back_num = 0;
00790 t->e[i].back_alloc = 0;
00791 t->e[i].back_refs = (int* )NULL;
00792 }
00793 }
00794 e = &(t->e[t->num]);
00795 t->num++;
00796 e->name = strdup_with_null(reg->enc, name, name_end);
00797 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
00798 e->name_len = name_end - name;
00799 #endif
00800 }
00801
00802 if (e->back_num >= 1 &&
00803 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
00804 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
00805 name, name_end);
00806 return ONIGERR_MULTIPLEX_DEFINED_NAME;
00807 }
00808
00809 e->back_num++;
00810 if (e->back_num == 1) {
00811 e->back_ref1 = backref;
00812 }
00813 else {
00814 if (e->back_num == 2) {
00815 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
00816 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
00817 CHECK_NULL_RETURN_MEMERR(e->back_refs);
00818 e->back_alloc = alloc;
00819 e->back_refs[0] = e->back_ref1;
00820 e->back_refs[1] = backref;
00821 }
00822 else {
00823 if (e->back_num > e->back_alloc) {
00824 alloc = e->back_alloc * 2;
00825 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
00826 CHECK_NULL_RETURN_MEMERR(e->back_refs);
00827 e->back_alloc = alloc;
00828 }
00829 e->back_refs[e->back_num - 1] = backref;
00830 }
00831 }
00832
00833 return 0;
00834 }
00835
00836 extern int
00837 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00838 const UChar* name_end, int** nums)
00839 {
00840 NameEntry* e = name_find(reg, name, name_end);
00841
00842 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
00843
00844 switch (e->back_num) {
00845 case 0:
00846 *nums = 0;
00847 break;
00848 case 1:
00849 *nums = &(e->back_ref1);
00850 break;
00851 default:
00852 *nums = e->back_refs;
00853 break;
00854 }
00855 return e->back_num;
00856 }
00857
00858 extern int
00859 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00860 const UChar* name_end, OnigRegion *region)
00861 {
00862 int i, n, *nums;
00863
00864 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
00865 if (n < 0)
00866 return n;
00867 else if (n == 0)
00868 return ONIGERR_PARSER_BUG;
00869 else if (n == 1)
00870 return nums[0];
00871 else {
00872 if (IS_NOT_NULL(region)) {
00873 for (i = n - 1; i >= 0; i--) {
00874 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
00875 return nums[i];
00876 }
00877 }
00878 return nums[n - 1];
00879 }
00880 }
00881
00882 #else
00883
00884 extern int
00885 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
00886 const UChar* name_end, int** nums)
00887 {
00888 return ONIG_NO_SUPPORT_CONFIG;
00889 }
00890
00891 extern int
00892 onig_name_to_backref_number(regex_t* reg, const UChar* name,
00893 const UChar* name_end, OnigRegion* region)
00894 {
00895 return ONIG_NO_SUPPORT_CONFIG;
00896 }
00897
00898 extern int
00899 onig_foreach_name(regex_t* reg,
00900 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
00901 {
00902 return ONIG_NO_SUPPORT_CONFIG;
00903 }
00904
00905 extern int
00906 onig_number_of_names(regex_t* reg)
00907 {
00908 return 0;
00909 }
00910 #endif
00911
00912 extern int
00913 onig_noname_group_capture_is_active(regex_t* reg)
00914 {
00915 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
00916 return 0;
00917
00918 #ifdef USE_NAMED_GROUP
00919 if (onig_number_of_names(reg) > 0 &&
00920 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
00921 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
00922 return 0;
00923 }
00924 #endif
00925
00926 return 1;
00927 }
00928
00929
/* Capacity of the first heap-allocated mem-node array
   (see scan_env_add_mem_entry()). */
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE               16
00931
00932 static void
00933 scan_env_clear(ScanEnv* env)
00934 {
00935 int i;
00936
00937 BIT_STATUS_CLEAR(env->capture_history);
00938 BIT_STATUS_CLEAR(env->bt_mem_start);
00939 BIT_STATUS_CLEAR(env->bt_mem_end);
00940 BIT_STATUS_CLEAR(env->backrefed_mem);
00941 env->error = (UChar* )NULL;
00942 env->error_end = (UChar* )NULL;
00943 env->num_call = 0;
00944 env->num_mem = 0;
00945 #ifdef USE_NAMED_GROUP
00946 env->num_named = 0;
00947 #endif
00948 env->mem_alloc = 0;
00949 env->mem_nodes_dynamic = (Node** )NULL;
00950
00951 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
00952 env->mem_nodes_static[i] = NULL_NODE;
00953
00954 #ifdef USE_COMBINATION_EXPLOSION_CHECK
00955 env->num_comb_exp_check = 0;
00956 env->comb_exp_max_regnum = 0;
00957 env->curr_max_regnum = 0;
00958 env->has_recursion = 0;
00959 #endif
00960 env->warnings_flag = 0;
00961 }
00962
00963 static int
00964 scan_env_add_mem_entry(ScanEnv* env)
00965 {
00966 int i, need, alloc;
00967 Node** p;
00968
00969 need = env->num_mem + 1;
00970 if (need >= SCANENV_MEMNODES_SIZE) {
00971 if (env->mem_alloc <= need) {
00972 if (IS_NULL(env->mem_nodes_dynamic)) {
00973 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
00974 p = (Node** )xmalloc(sizeof(Node*) * alloc);
00975 xmemcpy(p, env->mem_nodes_static,
00976 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
00977 }
00978 else {
00979 alloc = env->mem_alloc * 2;
00980 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
00981 }
00982 CHECK_NULL_RETURN_MEMERR(p);
00983
00984 for (i = env->num_mem + 1; i < alloc; i++)
00985 p[i] = NULL_NODE;
00986
00987 env->mem_nodes_dynamic = p;
00988 env->mem_alloc = alloc;
00989 }
00990 }
00991
00992 env->num_mem++;
00993 return env->num_mem;
00994 }
00995
00996 static int
00997 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
00998 {
00999 if (env->num_mem >= num)
01000 SCANENV_MEM_NODES(env)[num] = node;
01001 else
01002 return ONIGERR_PARSER_BUG;
01003 return 0;
01004 }
01005
01006
01007 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01008 typedef struct _FreeNode {
01009 struct _FreeNode* next;
01010 } FreeNode;
01011
01012 static FreeNode* FreeNodeList = (FreeNode* )NULL;
01013 #endif
01014
01015 extern void
01016 onig_node_free(Node* node)
01017 {
01018 start:
01019 if (IS_NULL(node)) return ;
01020
01021 switch (NTYPE(node)) {
01022 case NT_STR:
01023 if (NSTR(node)->capa != 0 &&
01024 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01025 xfree(NSTR(node)->s);
01026 }
01027 break;
01028
01029 case NT_LIST:
01030 case NT_ALT:
01031 onig_node_free(NCAR(node));
01032 {
01033 Node* next_node = NCDR(node);
01034
01035 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01036 {
01037 FreeNode* n = (FreeNode* )node;
01038
01039 THREAD_ATOMIC_START;
01040 n->next = FreeNodeList;
01041 FreeNodeList = n;
01042 THREAD_ATOMIC_END;
01043 }
01044 #else
01045 xfree(node);
01046 #endif
01047 node = next_node;
01048 goto start;
01049 }
01050 break;
01051
01052 case NT_CCLASS:
01053 {
01054 CClassNode* cc = NCCLASS(node);
01055
01056 if (IS_NCCLASS_SHARE(cc)) return ;
01057 if (cc->mbuf)
01058 bbuf_free(cc->mbuf);
01059 }
01060 break;
01061
01062 case NT_QTFR:
01063 if (NQTFR(node)->target)
01064 onig_node_free(NQTFR(node)->target);
01065 break;
01066
01067 case NT_ENCLOSE:
01068 if (NENCLOSE(node)->target)
01069 onig_node_free(NENCLOSE(node)->target);
01070 break;
01071
01072 case NT_BREF:
01073 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
01074 xfree(NBREF(node)->back_dynamic);
01075 break;
01076
01077 case NT_ANCHOR:
01078 if (NANCHOR(node)->target)
01079 onig_node_free(NANCHOR(node)->target);
01080 break;
01081 }
01082
01083 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01084 {
01085 FreeNode* n = (FreeNode* )node;
01086
01087 THREAD_ATOMIC_START;
01088 n->next = FreeNodeList;
01089 FreeNodeList = n;
01090 THREAD_ATOMIC_END;
01091 }
01092 #else
01093 xfree(node);
01094 #endif
01095 }
01096
01097 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01098 extern int
01099 onig_free_node_list(void)
01100 {
01101 FreeNode* n;
01102
01103
01104 while (IS_NOT_NULL(FreeNodeList)) {
01105 n = FreeNodeList;
01106 FreeNodeList = FreeNodeList->next;
01107 xfree(n);
01108 }
01109
01110 return 0;
01111 }
01112 #endif
01113
01114 static Node*
01115 node_new(void)
01116 {
01117 Node* node;
01118
01119 #ifdef USE_PARSE_TREE_NODE_RECYCLE
01120 THREAD_ATOMIC_START;
01121 if (IS_NOT_NULL(FreeNodeList)) {
01122 node = (Node* )FreeNodeList;
01123 FreeNodeList = FreeNodeList->next;
01124 THREAD_ATOMIC_END;
01125 return node;
01126 }
01127 THREAD_ATOMIC_END;
01128 #endif
01129
01130 node = (Node* )xmalloc(sizeof(Node));
01131
01132 return node;
01133 }
01134
01135
01136 static void
01137 initialize_cclass(CClassNode* cc)
01138 {
01139 BITSET_CLEAR(cc->bs);
01140
01141 cc->flags = 0;
01142 cc->mbuf = NULL;
01143 }
01144
01145 static Node*
01146 node_new_cclass(void)
01147 {
01148 Node* node = node_new();
01149 CHECK_NULL_RETURN(node);
01150
01151 SET_NTYPE(node, NT_CCLASS);
01152 initialize_cclass(NCCLASS(node));
01153 return node;
01154 }
01155
01156 static Node*
01157 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
01158 const OnigCodePoint ranges[])
01159 {
01160 int n, i;
01161 CClassNode* cc;
01162 OnigCodePoint j;
01163
01164 Node* node = node_new_cclass();
01165 CHECK_NULL_RETURN(node);
01166
01167 cc = NCCLASS(node);
01168 if (not != 0) NCCLASS_SET_NOT(cc);
01169
01170 BITSET_CLEAR(cc->bs);
01171 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
01172 n = ONIGENC_CODE_RANGE_NUM(ranges);
01173 for (i = 0; i < n; i++) {
01174 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
01175 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
01176 if (j >= sb_out) goto sb_end;
01177
01178 BITSET_SET_BIT(cc->bs, j);
01179 }
01180 }
01181 }
01182
01183 sb_end:
01184 if (IS_NULL(ranges)) {
01185 is_null:
01186 cc->mbuf = NULL;
01187 }
01188 else {
01189 BBuf* bbuf;
01190
01191 n = ONIGENC_CODE_RANGE_NUM(ranges);
01192 if (n == 0) goto is_null;
01193
01194 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
01195 CHECK_NULL_RETURN(bbuf);
01196 bbuf->alloc = n + 1;
01197 bbuf->used = n + 1;
01198 bbuf->p = (UChar* )((void* )ranges);
01199
01200 cc->mbuf = bbuf;
01201 }
01202
01203 return node;
01204 }
01205
01206 static Node*
01207 node_new_ctype(int type, int not)
01208 {
01209 Node* node = node_new();
01210 CHECK_NULL_RETURN(node);
01211
01212 SET_NTYPE(node, NT_CTYPE);
01213 NCTYPE(node)->ctype = type;
01214 NCTYPE(node)->not = not;
01215 return node;
01216 }
01217
01218 static Node*
01219 node_new_anychar(void)
01220 {
01221 Node* node = node_new();
01222 CHECK_NULL_RETURN(node);
01223
01224 SET_NTYPE(node, NT_CANY);
01225 return node;
01226 }
01227
01228 static Node*
01229 node_new_list(Node* left, Node* right)
01230 {
01231 Node* node = node_new();
01232 CHECK_NULL_RETURN(node);
01233
01234 SET_NTYPE(node, NT_LIST);
01235 NCAR(node) = left;
01236 NCDR(node) = right;
01237 return node;
01238 }
01239
01240 extern Node*
01241 onig_node_new_list(Node* left, Node* right)
01242 {
01243 return node_new_list(left, right);
01244 }
01245
01246 extern Node*
01247 onig_node_list_add(Node* list, Node* x)
01248 {
01249 Node *n;
01250
01251 n = onig_node_new_list(x, NULL);
01252 if (IS_NULL(n)) return NULL_NODE;
01253
01254 if (IS_NOT_NULL(list)) {
01255 while (IS_NOT_NULL(NCDR(list)))
01256 list = NCDR(list);
01257
01258 NCDR(list) = n;
01259 }
01260
01261 return n;
01262 }
01263
01264 extern Node*
01265 onig_node_new_alt(Node* left, Node* right)
01266 {
01267 Node* node = node_new();
01268 CHECK_NULL_RETURN(node);
01269
01270 SET_NTYPE(node, NT_ALT);
01271 NCAR(node) = left;
01272 NCDR(node) = right;
01273 return node;
01274 }
01275
01276 extern Node*
01277 onig_node_new_anchor(int type)
01278 {
01279 Node* node = node_new();
01280 CHECK_NULL_RETURN(node);
01281
01282 SET_NTYPE(node, NT_ANCHOR);
01283 NANCHOR(node)->type = type;
01284 NANCHOR(node)->target = NULL;
01285 NANCHOR(node)->char_len = -1;
01286 return node;
01287 }
01288
01289 static Node*
01290 node_new_backref(int back_num, int* backrefs, int by_name,
01291 #ifdef USE_BACKREF_WITH_LEVEL
01292 int exist_level, int nest_level,
01293 #endif
01294 ScanEnv* env)
01295 {
01296 int i;
01297 Node* node = node_new();
01298
01299 CHECK_NULL_RETURN(node);
01300
01301 SET_NTYPE(node, NT_BREF);
01302 NBREF(node)->state = 0;
01303 NBREF(node)->back_num = back_num;
01304 NBREF(node)->back_dynamic = (int* )NULL;
01305 if (by_name != 0)
01306 NBREF(node)->state |= NST_NAME_REF;
01307
01308 #ifdef USE_BACKREF_WITH_LEVEL
01309 if (exist_level != 0) {
01310 NBREF(node)->state |= NST_NEST_LEVEL;
01311 NBREF(node)->nest_level = nest_level;
01312 }
01313 #endif
01314
01315 for (i = 0; i < back_num; i++) {
01316 if (backrefs[i] <= env->num_mem &&
01317 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
01318 NBREF(node)->state |= NST_RECURSION;
01319 break;
01320 }
01321 }
01322
01323 if (back_num <= NODE_BACKREFS_SIZE) {
01324 for (i = 0; i < back_num; i++)
01325 NBREF(node)->back_static[i] = backrefs[i];
01326 }
01327 else {
01328 int* p = (int* )xmalloc(sizeof(int) * back_num);
01329 if (IS_NULL(p)) {
01330 onig_node_free(node);
01331 return NULL;
01332 }
01333 NBREF(node)->back_dynamic = p;
01334 for (i = 0; i < back_num; i++)
01335 p[i] = backrefs[i];
01336 }
01337 return node;
01338 }
01339
#ifdef USE_SUBEXP_CALL
/*
 * Create a subexpression-call node (NT_CALL).  The callee is recorded by
 * name range [name, name_end) and by group number `gnum`; the target is
 * left as NULL_NODE.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);
  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
01356
01357 static Node*
01358 node_new_quantifier(int lower, int upper, int by_number)
01359 {
01360 Node* node = node_new();
01361 CHECK_NULL_RETURN(node);
01362
01363 SET_NTYPE(node, NT_QTFR);
01364 NQTFR(node)->state = 0;
01365 NQTFR(node)->target = NULL;
01366 NQTFR(node)->lower = lower;
01367 NQTFR(node)->upper = upper;
01368 NQTFR(node)->greedy = 1;
01369 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
01370 NQTFR(node)->head_exact = NULL_NODE;
01371 NQTFR(node)->next_head_exact = NULL_NODE;
01372 NQTFR(node)->is_refered = 0;
01373 if (by_number != 0)
01374 NQTFR(node)->state |= NST_BY_NUMBER;
01375
01376 #ifdef USE_COMBINATION_EXPLOSION_CHECK
01377 NQTFR(node)->comb_exp_check_num = 0;
01378 #endif
01379
01380 return node;
01381 }
01382
01383 static Node*
01384 node_new_enclose(int type)
01385 {
01386 Node* node = node_new();
01387 CHECK_NULL_RETURN(node);
01388
01389 SET_NTYPE(node, NT_ENCLOSE);
01390 NENCLOSE(node)->type = type;
01391 NENCLOSE(node)->state = 0;
01392 NENCLOSE(node)->regnum = 0;
01393 NENCLOSE(node)->option = 0;
01394 NENCLOSE(node)->target = NULL;
01395 NENCLOSE(node)->call_addr = -1;
01396 NENCLOSE(node)->opt_count = 0;
01397 return node;
01398 }
01399
01400 extern Node*
01401 onig_node_new_enclose(int type)
01402 {
01403 return node_new_enclose(type);
01404 }
01405
01406 static Node*
01407 node_new_enclose_memory(OnigOptionType option, int is_named)
01408 {
01409 Node* node = node_new_enclose(ENCLOSE_MEMORY);
01410 CHECK_NULL_RETURN(node);
01411 if (is_named != 0)
01412 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
01413
01414 #ifdef USE_SUBEXP_CALL
01415 NENCLOSE(node)->option = option;
01416 #endif
01417 return node;
01418 }
01419
01420 static Node*
01421 node_new_option(OnigOptionType option)
01422 {
01423 Node* node = node_new_enclose(ENCLOSE_OPTION);
01424 CHECK_NULL_RETURN(node);
01425 NENCLOSE(node)->option = option;
01426 return node;
01427 }
01428
01429 extern int
01430 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
01431 {
01432 ptrdiff_t addlen = end - s;
01433
01434 if (addlen > 0) {
01435 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
01436
01437 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
01438 UChar* p;
01439 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
01440
01441 if (capa <= NSTR(node)->capa) {
01442 onig_strcpy(NSTR(node)->s + len, s, end);
01443 }
01444 else {
01445 if (NSTR(node)->s == NSTR(node)->buf)
01446 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
01447 s, end, capa);
01448 else
01449 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
01450
01451 CHECK_NULL_RETURN_MEMERR(p);
01452 NSTR(node)->s = p;
01453 NSTR(node)->capa = (int)capa;
01454 }
01455 }
01456 else {
01457 onig_strcpy(NSTR(node)->s + len, s, end);
01458 }
01459 NSTR(node)->end = NSTR(node)->s + len + addlen;
01460 }
01461
01462 return 0;
01463 }
01464
01465 extern int
01466 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
01467 {
01468 onig_node_str_clear(node);
01469 return onig_node_str_cat(node, s, end);
01470 }
01471
01472 static int
01473 node_str_cat_char(Node* node, UChar c)
01474 {
01475 UChar s[1];
01476
01477 s[0] = c;
01478 return onig_node_str_cat(node, s, s + 1);
01479 }
01480
01481 extern void
01482 onig_node_conv_to_str_node(Node* node, int flag)
01483 {
01484 SET_NTYPE(node, NT_STR);
01485 NSTR(node)->flag = flag;
01486 NSTR(node)->capa = 0;
01487 NSTR(node)->s = NSTR(node)->buf;
01488 NSTR(node)->end = NSTR(node)->buf;
01489 }
01490
01491 extern void
01492 onig_node_str_clear(Node* node)
01493 {
01494 if (NSTR(node)->capa != 0 &&
01495 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
01496 xfree(NSTR(node)->s);
01497 }
01498
01499 NSTR(node)->capa = 0;
01500 NSTR(node)->flag = 0;
01501 NSTR(node)->s = NSTR(node)->buf;
01502 NSTR(node)->end = NSTR(node)->buf;
01503 }
01504
01505 static Node*
01506 node_new_str(const UChar* s, const UChar* end)
01507 {
01508 Node* node = node_new();
01509 CHECK_NULL_RETURN(node);
01510
01511 SET_NTYPE(node, NT_STR);
01512 NSTR(node)->capa = 0;
01513 NSTR(node)->flag = 0;
01514 NSTR(node)->s = NSTR(node)->buf;
01515 NSTR(node)->end = NSTR(node)->buf;
01516 if (onig_node_str_cat(node, s, end)) {
01517 onig_node_free(node);
01518 return NULL;
01519 }
01520 return node;
01521 }
01522
01523 extern Node*
01524 onig_node_new_str(const UChar* s, const UChar* end)
01525 {
01526 return node_new_str(s, end);
01527 }
01528
01529 static Node*
01530 node_new_str_raw(UChar* s, UChar* end)
01531 {
01532 Node* node = node_new_str(s, end);
01533 NSTRING_SET_RAW(node);
01534 return node;
01535 }
01536
01537 static Node*
01538 node_new_empty(void)
01539 {
01540 return node_new_str(NULL, NULL);
01541 }
01542
01543 static Node*
01544 node_new_str_raw_char(UChar c)
01545 {
01546 UChar p[1];
01547
01548 p[0] = c;
01549 return node_new_str_raw(p, p + 1);
01550 }
01551
01552 static Node*
01553 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
01554 {
01555 const UChar *p;
01556 Node* n = NULL_NODE;
01557
01558 if (sn->end > sn->s) {
01559 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
01560 if (p && p > sn->s) {
01561 n = node_new_str(p, sn->end);
01562 if ((sn->flag & NSTR_RAW) != 0)
01563 NSTRING_SET_RAW(n);
01564 sn->end = (UChar* )p;
01565 }
01566 }
01567 return n;
01568 }
01569
01570 static int
01571 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
01572 {
01573 if (sn->end > sn->s) {
01574 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
01575 }
01576 return 0;
01577 }
01578
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert `num` copies of the byte `val` in front of the string `sn`.
 * The current contents are staged in a local buffer of NODE_STR_BUF_SIZE
 * bytes, copied back `num` bytes further on, and the gap is filled.
 *
 * Always returns 0.  The function is declared `int` but previously had no
 * return statement, so a caller reading the result invoked undefined
 * behaviour.
 *
 * NOTE(review): nothing here checks that the current length fits the local
 * buffer or that sn->s has room for `num` more bytes; presumably callers
 * only pass short in-node strings -- confirm.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i;
  ptrdiff_t len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
01596
01597 extern int
01598 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
01599 {
01600 unsigned int num, val;
01601 OnigCodePoint c;
01602 UChar* p = *src;
01603 PFETCH_READY;
01604
01605 num = 0;
01606 while (!PEND) {
01607 PFETCH(c);
01608 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
01609 val = (unsigned int )DIGITVAL(c);
01610 if ((INT_MAX_LIMIT - val) / 10UL < num)
01611 return -1;
01612
01613 num = num * 10 + val;
01614 }
01615 else {
01616 PUNFETCH;
01617 break;
01618 }
01619 }
01620 *src = p;
01621 return num;
01622 }
01623
01624 static int
01625 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
01626 OnigEncoding enc)
01627 {
01628 OnigCodePoint c;
01629 unsigned int num, val;
01630 UChar* p = *src;
01631 PFETCH_READY;
01632
01633 num = 0;
01634 while (!PEND && maxlen-- != 0) {
01635 PFETCH(c);
01636 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
01637 val = (unsigned int )XDIGITVAL(enc,c);
01638 if ((INT_MAX_LIMIT - val) / 16UL < num)
01639 return -1;
01640
01641 num = (num << 4) + XDIGITVAL(enc,c);
01642 }
01643 else {
01644 PUNFETCH;
01645 break;
01646 }
01647 }
01648 *src = p;
01649 return num;
01650 }
01651
01652 static int
01653 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
01654 OnigEncoding enc)
01655 {
01656 OnigCodePoint c;
01657 unsigned int num, val;
01658 UChar* p = *src;
01659 PFETCH_READY;
01660
01661 num = 0;
01662 while (!PEND && maxlen-- != 0) {
01663 PFETCH(c);
01664 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
01665 val = ODIGITVAL(c);
01666 if ((INT_MAX_LIMIT - val) / 8UL < num)
01667 return -1;
01668
01669 num = (num << 3) + val;
01670 }
01671 else {
01672 PUNFETCH;
01673 break;
01674 }
01675 }
01676 *src = p;
01677 return num;
01678 }
01679
01680
01681 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
01682 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
01683
01684
01685
01686
01687
01688 static int
01689 new_code_range(BBuf** pbuf)
01690 {
01691 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
01692 int r;
01693 OnigCodePoint n;
01694 BBuf* bbuf;
01695
01696 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
01697 CHECK_NULL_RETURN_MEMERR(*pbuf);
01698 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
01699 if (r) return r;
01700
01701 n = 0;
01702 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01703 return 0;
01704 }
01705
01706 static int
01707 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
01708 int checkdup)
01709 {
01710 int r, inc_n, pos;
01711 int low, high, bound, x;
01712 OnigCodePoint n, *data;
01713 BBuf* bbuf;
01714
01715 if (from > to) {
01716 n = from; from = to; to = n;
01717 }
01718
01719 if (IS_NULL(*pbuf)) {
01720 r = new_code_range(pbuf);
01721 if (r) return r;
01722 bbuf = *pbuf;
01723 n = 0;
01724 }
01725 else {
01726 bbuf = *pbuf;
01727 GET_CODE_POINT(n, bbuf->p);
01728 }
01729 data = (OnigCodePoint* )(bbuf->p);
01730 data++;
01731
01732 for (low = 0, bound = n; low < bound; ) {
01733 x = (low + bound) >> 1;
01734 if (from > data[x*2 + 1])
01735 low = x + 1;
01736 else
01737 bound = x;
01738 }
01739
01740 for (high = low, bound = n; high < bound; ) {
01741 x = (high + bound) >> 1;
01742 if (to >= data[x*2] - 1)
01743 high = x + 1;
01744 else
01745 bound = x;
01746 }
01747
01748
01749
01750
01751 inc_n = low + 1 - high;
01752 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
01753 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
01754
01755 if (inc_n != 1) {
01756 if (checkdup && from <= data[low*2+1]
01757 && (data[low*2] <= from || data[low*2+1] <= to))
01758 CC_DUP_WARN(env);
01759 if (from > data[low*2])
01760 from = data[low*2];
01761 if (to < data[(high - 1)*2 + 1])
01762 to = data[(high - 1)*2 + 1];
01763 }
01764
01765 if (inc_n != 0 && (OnigCodePoint )high < n) {
01766 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
01767 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
01768 int size = (n - high) * 2 * SIZE_CODE_POINT;
01769
01770 if (inc_n > 0) {
01771 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
01772 }
01773 else {
01774 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
01775 }
01776 }
01777
01778 pos = SIZE_CODE_POINT * (1 + low * 2);
01779 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
01780 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
01781 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
01782 n += inc_n;
01783 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
01784
01785 return 0;
01786 }
01787
01788 static int
01789 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01790 {
01791 return add_code_range_to_buf0(pbuf, env, from, to, 1);
01792 }
01793
01794 static int
01795 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
01796 {
01797 if (from > to) {
01798 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
01799 return 0;
01800 else
01801 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
01802 }
01803
01804 return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
01805 }
01806
01807 static int
01808 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
01809 {
01810 return add_code_range0(pbuf, env, from, to, 1);
01811 }
01812
01813 static int
01814 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
01815 {
01816 int r, i, n;
01817 OnigCodePoint pre, from, *data, to = 0;
01818
01819 *pbuf = (BBuf* )NULL;
01820 if (IS_NULL(bbuf)) {
01821 set_all:
01822 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01823 }
01824
01825 data = (OnigCodePoint* )(bbuf->p);
01826 GET_CODE_POINT(n, data);
01827 data++;
01828 if (n <= 0) goto set_all;
01829
01830 r = 0;
01831 pre = MBCODE_START_POS(enc);
01832 for (i = 0; i < n; i++) {
01833 from = data[i*2];
01834 to = data[i*2+1];
01835 if (pre <= from - 1) {
01836 r = add_code_range_to_buf(pbuf, env, pre, from - 1);
01837 if (r != 0) return r;
01838 }
01839 if (to == ~((OnigCodePoint )0)) break;
01840 pre = to + 1;
01841 }
01842 if (to < ~((OnigCodePoint )0)) {
01843 r = add_code_range_to_buf(pbuf, env, to + 1, ~((OnigCodePoint )0));
01844 }
01845 return r;
01846 }
01847
01848 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
01849 BBuf *tbuf; \
01850 int tnot; \
01851 tnot = not1; not1 = not2; not2 = tnot; \
01852 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
01853 } while (0)
01854
01855 static int
01856 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
01857 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01858 {
01859 int r;
01860 OnigCodePoint i, n1, *data1;
01861 OnigCodePoint from, to;
01862
01863 *pbuf = (BBuf* )NULL;
01864 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
01865 if (not1 != 0 || not2 != 0)
01866 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01867 return 0;
01868 }
01869
01870 r = 0;
01871 if (IS_NULL(bbuf2))
01872 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01873
01874 if (IS_NULL(bbuf1)) {
01875 if (not1 != 0) {
01876 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
01877 }
01878 else {
01879 if (not2 == 0) {
01880 return bbuf_clone(pbuf, bbuf2);
01881 }
01882 else {
01883 return not_code_range_buf(enc, bbuf2, pbuf, env);
01884 }
01885 }
01886 }
01887
01888 if (not1 != 0)
01889 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01890
01891 data1 = (OnigCodePoint* )(bbuf1->p);
01892 GET_CODE_POINT(n1, data1);
01893 data1++;
01894
01895 if (not2 == 0 && not1 == 0) {
01896 r = bbuf_clone(pbuf, bbuf2);
01897 }
01898 else if (not1 == 0) {
01899 r = not_code_range_buf(enc, bbuf2, pbuf, env);
01900 }
01901 if (r != 0) return r;
01902
01903 for (i = 0; i < n1; i++) {
01904 from = data1[i*2];
01905 to = data1[i*2+1];
01906 r = add_code_range_to_buf(pbuf, env, from, to);
01907 if (r != 0) return r;
01908 }
01909 return 0;
01910 }
01911
01912 static int
01913 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
01914 OnigCodePoint* data, int n)
01915 {
01916 int i, r;
01917 OnigCodePoint from2, to2;
01918
01919 for (i = 0; i < n; i++) {
01920 from2 = data[i*2];
01921 to2 = data[i*2+1];
01922 if (from2 < from1) {
01923 if (to2 < from1) continue;
01924 else {
01925 from1 = to2 + 1;
01926 }
01927 }
01928 else if (from2 <= to1) {
01929 if (to2 < to1) {
01930 if (from1 <= from2 - 1) {
01931 r = add_code_range_to_buf(pbuf, env, from1, from2-1);
01932 if (r != 0) return r;
01933 }
01934 from1 = to2 + 1;
01935 }
01936 else {
01937 to1 = from2 - 1;
01938 }
01939 }
01940 else {
01941 from1 = from2;
01942 }
01943 if (from1 > to1) break;
01944 }
01945 if (from1 <= to1) {
01946 r = add_code_range_to_buf(pbuf, env, from1, to1);
01947 if (r != 0) return r;
01948 }
01949 return 0;
01950 }
01951
01952 static int
01953 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
01954 {
01955 int r;
01956 OnigCodePoint i, j, n1, n2, *data1, *data2;
01957 OnigCodePoint from, to, from1, to1, from2, to2;
01958
01959 *pbuf = (BBuf* )NULL;
01960 if (IS_NULL(bbuf1)) {
01961 if (not1 != 0 && IS_NOT_NULL(bbuf2))
01962 return bbuf_clone(pbuf, bbuf2);
01963 return 0;
01964 }
01965 else if (IS_NULL(bbuf2)) {
01966 if (not2 != 0)
01967 return bbuf_clone(pbuf, bbuf1);
01968 return 0;
01969 }
01970
01971 if (not1 != 0)
01972 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
01973
01974 data1 = (OnigCodePoint* )(bbuf1->p);
01975 data2 = (OnigCodePoint* )(bbuf2->p);
01976 GET_CODE_POINT(n1, data1);
01977 GET_CODE_POINT(n2, data2);
01978 data1++;
01979 data2++;
01980
01981 if (not2 == 0 && not1 == 0) {
01982 for (i = 0; i < n1; i++) {
01983 from1 = data1[i*2];
01984 to1 = data1[i*2+1];
01985 for (j = 0; j < n2; j++) {
01986 from2 = data2[j*2];
01987 to2 = data2[j*2+1];
01988 if (from2 > to1) break;
01989 if (to2 < from1) continue;
01990 from = MAX(from1, from2);
01991 to = MIN(to1, to2);
01992 r = add_code_range_to_buf(pbuf, env, from, to);
01993 if (r != 0) return r;
01994 }
01995 }
01996 }
01997 else if (not1 == 0) {
01998 for (i = 0; i < n1; i++) {
01999 from1 = data1[i*2];
02000 to1 = data1[i*2+1];
02001 r = and_code_range1(pbuf, env, from1, to1, data2, n2);
02002 if (r != 0) return r;
02003 }
02004 }
02005
02006 return 0;
02007 }
02008
02009 static int
02010 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02011 {
02012 OnigEncoding enc = env->enc;
02013 int r, not1, not2;
02014 BBuf *buf1, *buf2, *pbuf = 0;
02015 BitSetRef bsr1, bsr2;
02016 BitSet bs1, bs2;
02017
02018 not1 = IS_NCCLASS_NOT(dest);
02019 bsr1 = dest->bs;
02020 buf1 = dest->mbuf;
02021 not2 = IS_NCCLASS_NOT(cc);
02022 bsr2 = cc->bs;
02023 buf2 = cc->mbuf;
02024
02025 if (not1 != 0) {
02026 bitset_invert_to(bsr1, bs1);
02027 bsr1 = bs1;
02028 }
02029 if (not2 != 0) {
02030 bitset_invert_to(bsr2, bs2);
02031 bsr2 = bs2;
02032 }
02033 bitset_and(bsr1, bsr2);
02034 if (bsr1 != dest->bs) {
02035 bitset_copy(dest->bs, bsr1);
02036 bsr1 = dest->bs;
02037 }
02038 if (not1 != 0) {
02039 bitset_invert(dest->bs);
02040 }
02041
02042 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02043 if (not1 != 0 && not2 != 0) {
02044 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
02045 }
02046 else {
02047 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
02048 if (r == 0 && not1 != 0) {
02049 BBuf *tbuf = 0;
02050 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02051 bbuf_free(pbuf);
02052 pbuf = tbuf;
02053 }
02054 }
02055 if (r != 0) {
02056 bbuf_free(pbuf);
02057 return r;
02058 }
02059
02060 dest->mbuf = pbuf;
02061 bbuf_free(buf1);
02062 return r;
02063 }
02064 return 0;
02065 }
02066
02067 static int
02068 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
02069 {
02070 OnigEncoding enc = env->enc;
02071 int r, not1, not2;
02072 BBuf *buf1, *buf2, *pbuf = 0;
02073 BitSetRef bsr1, bsr2;
02074 BitSet bs1, bs2;
02075
02076 not1 = IS_NCCLASS_NOT(dest);
02077 bsr1 = dest->bs;
02078 buf1 = dest->mbuf;
02079 not2 = IS_NCCLASS_NOT(cc);
02080 bsr2 = cc->bs;
02081 buf2 = cc->mbuf;
02082
02083 if (not1 != 0) {
02084 bitset_invert_to(bsr1, bs1);
02085 bsr1 = bs1;
02086 }
02087 if (not2 != 0) {
02088 bitset_invert_to(bsr2, bs2);
02089 bsr2 = bs2;
02090 }
02091 bitset_or(bsr1, bsr2);
02092 if (bsr1 != dest->bs) {
02093 bitset_copy(dest->bs, bsr1);
02094 bsr1 = dest->bs;
02095 }
02096 if (not1 != 0) {
02097 bitset_invert(dest->bs);
02098 }
02099
02100 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
02101 if (not1 != 0 && not2 != 0) {
02102 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
02103 }
02104 else {
02105 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
02106 if (r == 0 && not1 != 0) {
02107 BBuf *tbuf = 0;
02108 r = not_code_range_buf(enc, pbuf, &tbuf, env);
02109 bbuf_free(pbuf);
02110 pbuf = tbuf;
02111 }
02112 }
02113 if (r != 0) {
02114 bbuf_free(pbuf);
02115 return r;
02116 }
02117
02118 dest->mbuf = pbuf;
02119 bbuf_free(buf1);
02120 return r;
02121 }
02122 else
02123 return 0;
02124 }
02125
02126 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
02127
02128 static int
02129 conv_backslash_value(int c, ScanEnv* env)
02130 {
02131 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
02132 switch (c) {
02133 case 'n': return '\n';
02134 case 't': return '\t';
02135 case 'r': return '\r';
02136 case 'f': return '\f';
02137 case 'a': return '\007';
02138 case 'b': return '\010';
02139 case 'e': return '\033';
02140 case 'v':
02141 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
02142 return '\v';
02143 break;
02144
02145 default:
02146 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
02147 UNKNOWN_ESC_WARN(env, c);
02148 break;
02149 }
02150 }
02151 return c;
02152 }
02153
#if 0
/*
 * Disabled implementation, kept for reference only.  The build uses the
 * macro in the #else branch, which reports every node as a valid
 * quantifier target.
 */
static int
is_invalid_quantifier_target(Node* node)
{
  switch (NTYPE(node)) {
  case NT_ANCHOR:
    return 1;

  case NT_ENCLOSE:
    break;

  case NT_LIST:
    do {
      if (! is_invalid_quantifier_target(NCAR(node))) return 0;
    } while (IS_NOT_NULL(node = NCDR(node)));
    return 0;

  case NT_ALT:
    do {
      if (is_invalid_quantifier_target(NCAR(node))) return 1;
    } while (IS_NOT_NULL(node = NCDR(node)));
    break;

  default:
    break;
  }
  return 0;
}
#else
/* Every node is accepted as a quantifier target. */
#define is_invalid_quantifier_target(node) 0
#endif
02189
02190
02191 static int
02192 popular_quantifier_num(QtfrNode* q)
02193 {
02194 if (q->greedy) {
02195 if (q->lower == 0) {
02196 if (q->upper == 1) return 0;
02197 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
02198 }
02199 else if (q->lower == 1) {
02200 if (IS_REPEAT_INFINITE(q->upper)) return 2;
02201 }
02202 }
02203 else {
02204 if (q->lower == 0) {
02205 if (q->upper == 1) return 3;
02206 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
02207 }
02208 else if (q->lower == 1) {
02209 if (IS_REPEAT_INFINITE(q->upper)) return 5;
02210 }
02211 }
02212 return -1;
02213 }
02214
02215
/* Actions onig_reduce_nested_quantifier() can take for a nested pair. */
enum ReduceType {
  RQ_ASIS  = 0,  /* keep both quantifiers; child becomes parent's target  */
  RQ_DEL   = 1,  /* parent is overwritten with the child                   */
  RQ_A     = 2,  /* parent becomes greedy {0,inf} over child's target      */
  RQ_AQ    = 3,  /* parent becomes non-greedy {0,inf} over child's target  */
  RQ_QQ    = 4,  /* parent becomes non-greedy {0,1} over child's target    */
  RQ_P_QQ  = 5,  /* child greedy {1,inf}, parent non-greedy {0,1}          */
  RQ_PQ_Q  = 6   /* child non-greedy {1,inf}, parent greedy {0,1}          */
};

/*
 * Reduction to apply, looked up as ReduceTypeTable[child][parent].  Both
 * indexes are popular_quantifier_num() results:
 *   0: ?   1: *   2: +   3: ??   4: *?   5: +?
 */
static enum ReduceType const ReduceTypeTable[6][6] = {
  /* child ?  */ { RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS },
  /* child *  */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL  },
  /* child +  */ { RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL  },
  /* child ?? */ { RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ   },
  /* child *? */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL  },
  /* child +? */ { RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL  }
};
02234
02235 extern void
02236 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
02237 {
02238 int pnum, cnum;
02239 QtfrNode *p, *c;
02240
02241 p = NQTFR(pnode);
02242 c = NQTFR(cnode);
02243 pnum = popular_quantifier_num(p);
02244 cnum = popular_quantifier_num(c);
02245 if (pnum < 0 || cnum < 0) return ;
02246
02247 switch(ReduceTypeTable[cnum][pnum]) {
02248 case RQ_DEL:
02249 *pnode = *cnode;
02250 break;
02251 case RQ_A:
02252 p->target = c->target;
02253 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
02254 break;
02255 case RQ_AQ:
02256 p->target = c->target;
02257 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
02258 break;
02259 case RQ_QQ:
02260 p->target = c->target;
02261 p->lower = 0; p->upper = 1; p->greedy = 0;
02262 break;
02263 case RQ_P_QQ:
02264 p->target = cnode;
02265 p->lower = 0; p->upper = 1; p->greedy = 0;
02266 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
02267 return ;
02268 break;
02269 case RQ_PQ_Q:
02270 p->target = cnode;
02271 p->lower = 0; p->upper = 1; p->greedy = 1;
02272 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
02273 return ;
02274 break;
02275 case RQ_ASIS:
02276 p->target = cnode;
02277 return ;
02278 break;
02279 }
02280
02281 c->target = NULL_NODE;
02282 onig_node_free(cnode);
02283 }
02284
02285
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,

  /* second group, named after character-class (CC) constructs */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,
  TK_CC_CC_OPEN         = 23
};
02313
02314 typedef struct {
02315 enum TokenSyms type;
02316 int escaped;
02317 int base;
02318 UChar* backp;
02319 union {
02320 UChar* s;
02321 int c;
02322 OnigCodePoint code;
02323 int anchor;
02324 int subtype;
02325 struct {
02326 int lower;
02327 int upper;
02328 int greedy;
02329 int possessive;
02330 } repeat;
02331 struct {
02332 int num;
02333 int ref1;
02334 int* refs;
02335 int by_name;
02336 #ifdef USE_BACKREF_WITH_LEVEL
02337 int exist_level;
02338 int level;
02339 #endif
02340 } backref;
02341 struct {
02342 UChar* name;
02343 UChar* name_end;
02344 int gnum;
02345 } call;
02346 struct {
02347 int ctype;
02348 int not;
02349 } prop;
02350 } u;
02351 } OnigToken;
02352
02353
02354 static int
02355 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
02356 {
02357 int low, up, syn_allow, non_low = 0;
02358 int r = 0;
02359 OnigCodePoint c;
02360 OnigEncoding enc = env->enc;
02361 UChar* p = *src;
02362 PFETCH_READY;
02363
02364 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
02365
02366 if (PEND) {
02367 if (syn_allow)
02368 return 1;
02369 else
02370 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02371 }
02372
02373 if (! syn_allow) {
02374 c = PPEEK;
02375 if (c == ')' || c == '(' || c == '|') {
02376 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
02377 }
02378 }
02379
02380 low = onig_scan_unsigned_number(&p, end, env->enc);
02381 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02382 if (low > ONIG_MAX_REPEAT_NUM)
02383 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02384
02385 if (p == *src) {
02386 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
02387
02388 low = 0;
02389 non_low = 1;
02390 }
02391 else
02392 goto invalid;
02393 }
02394
02395 if (PEND) goto invalid;
02396 PFETCH(c);
02397 if (c == ',') {
02398 UChar* prev = p;
02399 up = onig_scan_unsigned_number(&p, end, env->enc);
02400 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02401 if (up > ONIG_MAX_REPEAT_NUM)
02402 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
02403
02404 if (p == prev) {
02405 if (non_low != 0)
02406 goto invalid;
02407 up = REPEAT_INFINITE;
02408 }
02409 }
02410 else {
02411 if (non_low != 0)
02412 goto invalid;
02413
02414 PUNFETCH;
02415 up = low;
02416 r = 2;
02417 }
02418
02419 if (PEND) goto invalid;
02420 PFETCH(c);
02421 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
02422 if (c != MC_ESC(env->syntax)) goto invalid;
02423 PFETCH(c);
02424 }
02425 if (c != '}') goto invalid;
02426
02427 if (!IS_REPEAT_INFINITE(up) && low > up) {
02428 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
02429 }
02430
02431 tok->type = TK_INTERVAL;
02432 tok->u.repeat.lower = low;
02433 tok->u.repeat.upper = up;
02434 *src = p;
02435 return r;
02436
02437 invalid:
02438 if (syn_allow)
02439 return 1;
02440 else
02441 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
02442 }
02443
02444
02445 static int
02446 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
02447 {
02448 int v;
02449 OnigCodePoint c;
02450 OnigEncoding enc = env->enc;
02451 UChar* p = *src;
02452 PFETCH_READY;
02453
02454 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
02455
02456 PFETCH(c);
02457 switch (c) {
02458 case 'M':
02459 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
02460 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02461 PFETCH(c);
02462 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
02463 if (PEND) return ONIGERR_END_PATTERN_AT_META;
02464 PFETCH(c);
02465 if (c == MC_ESC(env->syntax)) {
02466 v = fetch_escaped_value(&p, end, env);
02467 if (v < 0) return v;
02468 c = (OnigCodePoint )v;
02469 }
02470 c = ((c & 0xff) | 0x80);
02471 }
02472 else
02473 goto backslash;
02474 break;
02475
02476 case 'C':
02477 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
02478 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02479 PFETCH(c);
02480 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
02481 goto control;
02482 }
02483 else
02484 goto backslash;
02485
02486 case 'c':
02487 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
02488 control:
02489 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
02490 PFETCH(c);
02491 if (c == '?') {
02492 c = 0177;
02493 }
02494 else {
02495 if (c == MC_ESC(env->syntax)) {
02496 v = fetch_escaped_value(&p, end, env);
02497 if (v < 0) return v;
02498 c = (OnigCodePoint )v;
02499 }
02500 c &= 0x9f;
02501 }
02502 break;
02503 }
02504
02505
02506 default:
02507 {
02508 backslash:
02509 c = conv_backslash_value(c, env);
02510 }
02511 break;
02512 }
02513
02514 *src = p;
02515 return c;
02516 }
02517
02518 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
02519
02520 static OnigCodePoint
02521 get_name_end_code_point(OnigCodePoint start)
02522 {
02523 switch (start) {
02524 case '<': return (OnigCodePoint )'>'; break;
02525 case '\'': return (OnigCodePoint )'\''; break;
02526 default:
02527 break;
02528 }
02529
02530 return (OnigCodePoint )0;
02531 }
02532
02533 #ifdef USE_NAMED_GROUP
02534 #ifdef USE_BACKREF_WITH_LEVEL
02535
02536
02537
02538
02539
02540 static int
02541 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
02542 UChar** rname_end, ScanEnv* env,
02543 int* rback_num, int* rlevel)
02544 {
02545 int r, sign, is_num, exist_level;
02546 OnigCodePoint end_code;
02547 OnigCodePoint c = 0;
02548 OnigEncoding enc = env->enc;
02549 UChar *name_end;
02550 UChar *pnum_head;
02551 UChar *p = *src;
02552 PFETCH_READY;
02553
02554 *rback_num = 0;
02555 is_num = exist_level = 0;
02556 sign = 1;
02557 pnum_head = *src;
02558
02559 end_code = get_name_end_code_point(start_code);
02560
02561 name_end = end;
02562 r = 0;
02563 if (PEND) {
02564 return ONIGERR_EMPTY_GROUP_NAME;
02565 }
02566 else {
02567 PFETCH(c);
02568 if (c == end_code)
02569 return ONIGERR_EMPTY_GROUP_NAME;
02570
02571 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02572 is_num = 1;
02573 }
02574 else if (c == '-') {
02575 is_num = 2;
02576 sign = -1;
02577 pnum_head = p;
02578 }
02579 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02580 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02581 }
02582 }
02583
02584 while (!PEND) {
02585 name_end = p;
02586 PFETCH(c);
02587 if (c == end_code || c == ')' || c == '+' || c == '-') {
02588 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
02589 break;
02590 }
02591
02592 if (is_num != 0) {
02593 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02594 is_num = 1;
02595 }
02596 else {
02597 r = ONIGERR_INVALID_GROUP_NAME;
02598 is_num = 0;
02599 }
02600 }
02601 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02602 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02603 }
02604 }
02605
02606 if (r == 0 && c != end_code) {
02607 if (c == '+' || c == '-') {
02608 int level;
02609 int flag = (c == '-' ? -1 : 1);
02610
02611 PFETCH(c);
02612 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
02613 PUNFETCH;
02614 level = onig_scan_unsigned_number(&p, end, enc);
02615 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
02616 *rlevel = (level * flag);
02617 exist_level = 1;
02618
02619 PFETCH(c);
02620 if (c == end_code)
02621 goto end;
02622 }
02623
02624 err:
02625 r = ONIGERR_INVALID_GROUP_NAME;
02626 name_end = end;
02627 }
02628
02629 end:
02630 if (r == 0) {
02631 if (is_num != 0) {
02632 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02633 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02634 else if (*rback_num == 0) goto err;
02635
02636 *rback_num *= sign;
02637 }
02638
02639 *rname_end = name_end;
02640 *src = p;
02641 return (exist_level ? 1 : 0);
02642 }
02643 else {
02644 onig_scan_env_set_error_string(env, r, *src, name_end);
02645 return r;
02646 }
02647 }
02648 #endif
02649
02650
02651
02652
02653
02654 static int
02655 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02656 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02657 {
02658 int r, is_num, sign;
02659 OnigCodePoint end_code;
02660 OnigCodePoint c = 0;
02661 OnigEncoding enc = env->enc;
02662 UChar *name_end;
02663 UChar *pnum_head;
02664 UChar *p = *src;
02665 PFETCH_READY;
02666
02667 *rback_num = 0;
02668
02669 end_code = get_name_end_code_point(start_code);
02670
02671 name_end = end;
02672 pnum_head = *src;
02673 r = 0;
02674 is_num = 0;
02675 sign = 1;
02676 if (PEND) {
02677 return ONIGERR_EMPTY_GROUP_NAME;
02678 }
02679 else {
02680 PFETCH(c);
02681 if (c == end_code)
02682 return ONIGERR_EMPTY_GROUP_NAME;
02683
02684 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02685 if (ref == 1)
02686 is_num = 1;
02687 else {
02688 r = ONIGERR_INVALID_GROUP_NAME;
02689 is_num = 0;
02690 }
02691 }
02692 else if (c == '-') {
02693 if (ref == 1) {
02694 is_num = 2;
02695 sign = -1;
02696 pnum_head = p;
02697 }
02698 else {
02699 r = ONIGERR_INVALID_GROUP_NAME;
02700 is_num = 0;
02701 }
02702 }
02703 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02704 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02705 }
02706 }
02707
02708 if (r == 0) {
02709 while (!PEND) {
02710 name_end = p;
02711 PFETCH(c);
02712 if (c == end_code || c == ')') {
02713 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
02714 break;
02715 }
02716
02717 if (is_num != 0) {
02718 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02719 is_num = 1;
02720 }
02721 else {
02722 if (!ONIGENC_IS_CODE_WORD(enc, c))
02723 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02724 else
02725 r = ONIGERR_INVALID_GROUP_NAME;
02726
02727 is_num = 0;
02728 }
02729 }
02730 else {
02731 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
02732 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02733 }
02734 }
02735 }
02736
02737 if (c != end_code) {
02738 r = ONIGERR_INVALID_GROUP_NAME;
02739 name_end = end;
02740 }
02741
02742 if (is_num != 0) {
02743 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02744 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02745 else if (*rback_num == 0) {
02746 r = ONIGERR_INVALID_GROUP_NAME;
02747 goto err;
02748 }
02749
02750 *rback_num *= sign;
02751 }
02752
02753 *rname_end = name_end;
02754 *src = p;
02755 return 0;
02756 }
02757 else {
02758 while (!PEND) {
02759 name_end = p;
02760 PFETCH(c);
02761 if (c == end_code || c == ')')
02762 break;
02763 }
02764 if (PEND)
02765 name_end = end;
02766
02767 err:
02768 onig_scan_env_set_error_string(env, r, *src, name_end);
02769 return r;
02770 }
02771 }
02772 #else
02773 static int
02774 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
02775 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
02776 {
02777 int r, is_num, sign;
02778 OnigCodePoint end_code;
02779 OnigCodePoint c = 0;
02780 UChar *name_end;
02781 OnigEncoding enc = env->enc;
02782 UChar *pnum_head;
02783 UChar *p = *src;
02784 PFETCH_READY;
02785
02786 *rback_num = 0;
02787
02788 end_code = get_name_end_code_point(start_code);
02789
02790 *rname_end = name_end = end;
02791 r = 0;
02792 pnum_head = *src;
02793 is_num = 0;
02794 sign = 1;
02795
02796 if (PEND) {
02797 return ONIGERR_EMPTY_GROUP_NAME;
02798 }
02799 else {
02800 PFETCH(c);
02801 if (c == end_code)
02802 return ONIGERR_EMPTY_GROUP_NAME;
02803
02804 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
02805 is_num = 1;
02806 }
02807 else if (c == '-') {
02808 is_num = 2;
02809 sign = -1;
02810 pnum_head = p;
02811 }
02812 else {
02813 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02814 }
02815 }
02816
02817 while (!PEND) {
02818 name_end = p;
02819
02820 PFETCH(c);
02821 if (c == end_code || c == ')') break;
02822 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
02823 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
02824 }
02825 if (r == 0 && c != end_code) {
02826 r = ONIGERR_INVALID_GROUP_NAME;
02827 name_end = end;
02828 }
02829
02830 if (r == 0) {
02831 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
02832 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
02833 else if (*rback_num == 0) {
02834 r = ONIGERR_INVALID_GROUP_NAME;
02835 goto err;
02836 }
02837 *rback_num *= sign;
02838
02839 *rname_end = name_end;
02840 *src = p;
02841 return 0;
02842 }
02843 else {
02844 err:
02845 onig_scan_env_set_error_string(env, r, *src, name_end);
02846 return r;
02847 }
02848 }
02849 #endif
02850
02851 void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
02852 UChar* pat, UChar* pat_end, const UChar *fmt, va_list args);
02853
02854 static void
02855 onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
02856 {
02857 va_list args;
02858 UChar buf[WARN_BUFSIZE];
02859 va_start(args, fmt);
02860 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
02861 env->pattern, env->pattern_end,
02862 (const UChar *)fmt, args);
02863 va_end(args);
02864 if (env->sourcefile == NULL)
02865 rb_warn("%s", (char *)buf);
02866 else
02867 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
02868 }
02869
02870 static void
02871 CC_ESC_WARN(ScanEnv *env, UChar *c)
02872 {
02873 if (onig_warn == onig_null_warn) return ;
02874
02875 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
02876 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
02877 onig_syntax_warn(env, "character class has '%s' without escape", c);
02878 }
02879 }
02880
02881 static void
02882 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
02883 {
02884 if (onig_warn == onig_null_warn) return ;
02885
02886 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
02887 onig_syntax_warn(env, "regular expression has '%s' without escape", c);
02888 }
02889 }
02890
02891 static void
02892 CC_DUP_WARN(ScanEnv *env)
02893 {
02894 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02895
02896 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_DUP) &&
02897 !((env)->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
02898 (env)->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
02899 onig_syntax_warn(env, "character class has duplicated range");
02900 }
02901 }
02902
02903 static void
02904 UNKNOWN_ESC_WARN(ScanEnv *env, int c)
02905 {
02906 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
02907 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
02908 }
02909
02910 static UChar*
02911 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
02912 UChar **next, OnigEncoding enc)
02913 {
02914 int i;
02915 OnigCodePoint x;
02916 UChar *q;
02917 UChar *p = from;
02918
02919 while (p < to) {
02920 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02921 q = p + enclen(enc, p, to);
02922 if (x == s[0]) {
02923 for (i = 1; i < n && q < to; i++) {
02924 x = ONIGENC_MBC_TO_CODE(enc, q, to);
02925 if (x != s[i]) break;
02926 q += enclen(enc, q, to);
02927 }
02928 if (i >= n) {
02929 if (IS_NOT_NULL(next))
02930 *next = q;
02931 return p;
02932 }
02933 }
02934 p = q;
02935 }
02936 return NULL_UCHARP;
02937 }
02938
02939 static int
02940 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
02941 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
02942 {
02943 int i, in_esc;
02944 OnigCodePoint x;
02945 UChar *q;
02946 UChar *p = from;
02947
02948 in_esc = 0;
02949 while (p < to) {
02950 if (in_esc) {
02951 in_esc = 0;
02952 p += enclen(enc, p, to);
02953 }
02954 else {
02955 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02956 q = p + enclen(enc, p, to);
02957 if (x == s[0]) {
02958 for (i = 1; i < n && q < to; i++) {
02959 x = ONIGENC_MBC_TO_CODE(enc, q, to);
02960 if (x != s[i]) break;
02961 q += enclen(enc, q, to);
02962 }
02963 if (i >= n) return 1;
02964 p += enclen(enc, p, to);
02965 }
02966 else {
02967 x = ONIGENC_MBC_TO_CODE(enc, p, to);
02968 if (x == bad) return 0;
02969 else if (x == MC_ESC(syn)) in_esc = 1;
02970 p = q;
02971 }
02972 }
02973 }
02974 return 0;
02975 }
02976
02977 static int
02978 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
02979 {
02980 int num;
02981 OnigCodePoint c, c2;
02982 const OnigSyntaxType* syn = env->syntax;
02983 OnigEncoding enc = env->enc;
02984 UChar* prev;
02985 UChar* p = *src;
02986 PFETCH_READY;
02987
02988 if (PEND) {
02989 tok->type = TK_EOT;
02990 return tok->type;
02991 }
02992
02993 PFETCH(c);
02994 tok->type = TK_CHAR;
02995 tok->base = 0;
02996 tok->u.c = c;
02997 tok->escaped = 0;
02998
02999 if (c == ']') {
03000 tok->type = TK_CC_CLOSE;
03001 }
03002 else if (c == '-') {
03003 tok->type = TK_CC_RANGE;
03004 }
03005 else if (c == MC_ESC(syn)) {
03006 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
03007 goto end;
03008
03009 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03010
03011 PFETCH(c);
03012 tok->escaped = 1;
03013 tok->u.c = c;
03014 switch (c) {
03015 case 'w':
03016 tok->type = TK_CHAR_TYPE;
03017 tok->u.prop.ctype = ONIGENC_CTYPE_W;
03018 tok->u.prop.not = 0;
03019 break;
03020 case 'W':
03021 tok->type = TK_CHAR_TYPE;
03022 tok->u.prop.ctype = ONIGENC_CTYPE_W;
03023 tok->u.prop.not = 1;
03024 break;
03025 case 'd':
03026 tok->type = TK_CHAR_TYPE;
03027 tok->u.prop.ctype = ONIGENC_CTYPE_D;
03028 tok->u.prop.not = 0;
03029 break;
03030 case 'D':
03031 tok->type = TK_CHAR_TYPE;
03032 tok->u.prop.ctype = ONIGENC_CTYPE_D;
03033 tok->u.prop.not = 1;
03034 break;
03035 case 's':
03036 tok->type = TK_CHAR_TYPE;
03037 tok->u.prop.ctype = ONIGENC_CTYPE_S;
03038 tok->u.prop.not = 0;
03039 break;
03040 case 'S':
03041 tok->type = TK_CHAR_TYPE;
03042 tok->u.prop.ctype = ONIGENC_CTYPE_S;
03043 tok->u.prop.not = 1;
03044 break;
03045 case 'h':
03046 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03047 tok->type = TK_CHAR_TYPE;
03048 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03049 tok->u.prop.not = 0;
03050 break;
03051 case 'H':
03052 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03053 tok->type = TK_CHAR_TYPE;
03054 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03055 tok->u.prop.not = 1;
03056 break;
03057
03058 case 'p':
03059 case 'P':
03060 c2 = PPEEK;
03061 if (c2 == '{' &&
03062 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03063 PINC;
03064 tok->type = TK_CHAR_PROPERTY;
03065 tok->u.prop.not = (c == 'P' ? 1 : 0);
03066
03067 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03068 PFETCH(c2);
03069 if (c2 == '^') {
03070 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03071 }
03072 else
03073 PUNFETCH;
03074 }
03075 }
03076 else {
03077 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03078 }
03079 break;
03080
03081 case 'x':
03082 if (PEND) break;
03083
03084 prev = p;
03085 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03086 PINC;
03087 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
03088 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03089 if (!PEND) {
03090 c2 = PPEEK;
03091 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
03092 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03093 }
03094
03095 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
03096 PINC;
03097 tok->type = TK_CODE_POINT;
03098 tok->base = 16;
03099 tok->u.code = (OnigCodePoint )num;
03100 }
03101 else {
03102
03103 p = prev;
03104 }
03105 }
03106 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03107 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
03108 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03109 if (p == prev) {
03110 num = 0;
03111 }
03112 tok->type = TK_RAW_BYTE;
03113 tok->base = 16;
03114 tok->u.c = num;
03115 }
03116 break;
03117
03118 case 'u':
03119 if (PEND) break;
03120
03121 prev = p;
03122 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03123 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
03124 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03125 if (p == prev) {
03126 num = 0;
03127 }
03128 tok->type = TK_CODE_POINT;
03129 tok->base = 16;
03130 tok->u.code = (OnigCodePoint )num;
03131 }
03132 break;
03133
03134 case '0':
03135 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
03136 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03137 PUNFETCH;
03138 prev = p;
03139 num = scan_unsigned_octal_number(&p, end, 3, enc);
03140 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03141 if (p == prev) {
03142 num = 0;
03143 }
03144 tok->type = TK_RAW_BYTE;
03145 tok->base = 8;
03146 tok->u.c = num;
03147 }
03148 break;
03149
03150 default:
03151 PUNFETCH;
03152 num = fetch_escaped_value(&p, end, env);
03153 if (num < 0) return num;
03154 if (tok->u.c != num) {
03155 tok->u.code = (OnigCodePoint )num;
03156 tok->type = TK_CODE_POINT;
03157 }
03158 break;
03159 }
03160 }
03161 else if (c == '[') {
03162 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
03163 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
03164 tok->backp = p;
03165 PINC;
03166 if (str_exist_check_with_esc(send, 2, p, end,
03167 (OnigCodePoint )']', enc, syn)) {
03168 tok->type = TK_POSIX_BRACKET_OPEN;
03169 }
03170 else {
03171 PUNFETCH;
03172 goto cc_in_cc;
03173 }
03174 }
03175 else {
03176 cc_in_cc:
03177 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
03178 tok->type = TK_CC_CC_OPEN;
03179 }
03180 else {
03181 CC_ESC_WARN(env, (UChar* )"[");
03182 }
03183 }
03184 }
03185 else if (c == '&') {
03186 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
03187 !PEND && (PPEEK_IS('&'))) {
03188 PINC;
03189 tok->type = TK_CC_AND;
03190 }
03191 }
03192
03193 end:
03194 *src = p;
03195 return tok->type;
03196 }
03197
03198 static int
03199 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
03200 {
03201 int r, num;
03202 OnigCodePoint c;
03203 OnigEncoding enc = env->enc;
03204 const OnigSyntaxType* syn = env->syntax;
03205 UChar* prev;
03206 UChar* p = *src;
03207 PFETCH_READY;
03208
03209 start:
03210 if (PEND) {
03211 tok->type = TK_EOT;
03212 return tok->type;
03213 }
03214
03215 tok->type = TK_STRING;
03216 tok->base = 0;
03217 tok->backp = p;
03218
03219 PFETCH(c);
03220 if (IS_MC_ESC_CODE(c, syn)) {
03221 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
03222
03223 tok->backp = p;
03224 PFETCH(c);
03225
03226 tok->u.c = c;
03227 tok->escaped = 1;
03228 switch (c) {
03229 case '*':
03230 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
03231 tok->type = TK_OP_REPEAT;
03232 tok->u.repeat.lower = 0;
03233 tok->u.repeat.upper = REPEAT_INFINITE;
03234 goto greedy_check;
03235 break;
03236
03237 case '+':
03238 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
03239 tok->type = TK_OP_REPEAT;
03240 tok->u.repeat.lower = 1;
03241 tok->u.repeat.upper = REPEAT_INFINITE;
03242 goto greedy_check;
03243 break;
03244
03245 case '?':
03246 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
03247 tok->type = TK_OP_REPEAT;
03248 tok->u.repeat.lower = 0;
03249 tok->u.repeat.upper = 1;
03250 greedy_check:
03251 if (!PEND && PPEEK_IS('?') &&
03252 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
03253 PFETCH(c);
03254 tok->u.repeat.greedy = 0;
03255 tok->u.repeat.possessive = 0;
03256 }
03257 else {
03258 possessive_check:
03259 if (!PEND && PPEEK_IS('+') &&
03260 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
03261 tok->type != TK_INTERVAL) ||
03262 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
03263 tok->type == TK_INTERVAL))) {
03264 PFETCH(c);
03265 tok->u.repeat.greedy = 1;
03266 tok->u.repeat.possessive = 1;
03267 }
03268 else {
03269 tok->u.repeat.greedy = 1;
03270 tok->u.repeat.possessive = 0;
03271 }
03272 }
03273 break;
03274
03275 case '{':
03276 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
03277 r = fetch_range_quantifier(&p, end, tok, env);
03278 if (r < 0) return r;
03279 if (r == 0) goto greedy_check;
03280 else if (r == 2) {
03281 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03282 goto possessive_check;
03283
03284 goto greedy_check;
03285 }
03286
03287 break;
03288
03289 case '|':
03290 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
03291 tok->type = TK_ALT;
03292 break;
03293
03294 case '(':
03295 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03296 tok->type = TK_SUBEXP_OPEN;
03297 break;
03298
03299 case ')':
03300 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
03301 tok->type = TK_SUBEXP_CLOSE;
03302 break;
03303
03304 case 'w':
03305 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03306 tok->type = TK_CHAR_TYPE;
03307 tok->u.prop.ctype = ONIGENC_CTYPE_W;
03308 tok->u.prop.not = 0;
03309 break;
03310
03311 case 'W':
03312 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
03313 tok->type = TK_CHAR_TYPE;
03314 tok->u.prop.ctype = ONIGENC_CTYPE_W;
03315 tok->u.prop.not = 1;
03316 break;
03317
03318 case 'b':
03319 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03320 tok->type = TK_ANCHOR;
03321 tok->u.anchor = ANCHOR_WORD_BOUND;
03322 break;
03323
03324 case 'B':
03325 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
03326 tok->type = TK_ANCHOR;
03327 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
03328 break;
03329
03330 #ifdef USE_WORD_BEGIN_END
03331 case '<':
03332 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03333 tok->type = TK_ANCHOR;
03334 tok->u.anchor = ANCHOR_WORD_BEGIN;
03335 break;
03336
03337 case '>':
03338 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
03339 tok->type = TK_ANCHOR;
03340 tok->u.anchor = ANCHOR_WORD_END;
03341 break;
03342 #endif
03343
03344 case 's':
03345 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03346 tok->type = TK_CHAR_TYPE;
03347 tok->u.prop.ctype = ONIGENC_CTYPE_S;
03348 tok->u.prop.not = 0;
03349 break;
03350
03351 case 'S':
03352 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
03353 tok->type = TK_CHAR_TYPE;
03354 tok->u.prop.ctype = ONIGENC_CTYPE_S;
03355 tok->u.prop.not = 1;
03356 break;
03357
03358 case 'd':
03359 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03360 tok->type = TK_CHAR_TYPE;
03361 tok->u.prop.ctype = ONIGENC_CTYPE_D;
03362 tok->u.prop.not = 0;
03363 break;
03364
03365 case 'D':
03366 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
03367 tok->type = TK_CHAR_TYPE;
03368 tok->u.prop.ctype = ONIGENC_CTYPE_D;
03369 tok->u.prop.not = 1;
03370 break;
03371
03372 case 'h':
03373 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03374 tok->type = TK_CHAR_TYPE;
03375 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03376 tok->u.prop.not = 0;
03377 break;
03378
03379 case 'H':
03380 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
03381 tok->type = TK_CHAR_TYPE;
03382 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
03383 tok->u.prop.not = 1;
03384 break;
03385
03386 case 'A':
03387 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03388 begin_buf:
03389 tok->type = TK_ANCHOR;
03390 tok->u.subtype = ANCHOR_BEGIN_BUF;
03391 break;
03392
03393 case 'Z':
03394 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03395 tok->type = TK_ANCHOR;
03396 tok->u.subtype = ANCHOR_SEMI_END_BUF;
03397 break;
03398
03399 case 'z':
03400 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
03401 end_buf:
03402 tok->type = TK_ANCHOR;
03403 tok->u.subtype = ANCHOR_END_BUF;
03404 break;
03405
03406 case 'G':
03407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
03408 tok->type = TK_ANCHOR;
03409 tok->u.subtype = ANCHOR_BEGIN_POSITION;
03410 break;
03411
03412 case '`':
03413 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03414 goto begin_buf;
03415 break;
03416
03417 case '\'':
03418 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
03419 goto end_buf;
03420 break;
03421
03422 case 'x':
03423 if (PEND) break;
03424
03425 prev = p;
03426 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
03427 PINC;
03428 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
03429 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
03430 if (!PEND) {
03431 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
03432 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
03433 }
03434
03435 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
03436 PINC;
03437 tok->type = TK_CODE_POINT;
03438 tok->u.code = (OnigCodePoint )num;
03439 }
03440 else {
03441
03442 p = prev;
03443 }
03444 }
03445 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
03446 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
03447 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03448 if (p == prev) {
03449 num = 0;
03450 }
03451 tok->type = TK_RAW_BYTE;
03452 tok->base = 16;
03453 tok->u.c = num;
03454 }
03455 break;
03456
03457 case 'u':
03458 if (PEND) break;
03459
03460 prev = p;
03461 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
03462 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
03463 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03464 if (p == prev) {
03465 num = 0;
03466 }
03467 tok->type = TK_CODE_POINT;
03468 tok->base = 16;
03469 tok->u.code = (OnigCodePoint )num;
03470 }
03471 break;
03472
03473 case '1': case '2': case '3': case '4':
03474 case '5': case '6': case '7': case '8': case '9':
03475 PUNFETCH;
03476 prev = p;
03477 num = onig_scan_unsigned_number(&p, end, enc);
03478 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
03479 goto skip_backref;
03480 }
03481
03482 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
03483 (num <= env->num_mem || num <= 9)) {
03484 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03485 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
03486 return ONIGERR_INVALID_BACKREF;
03487 }
03488
03489 tok->type = TK_BACKREF;
03490 tok->u.backref.num = 1;
03491 tok->u.backref.ref1 = num;
03492 tok->u.backref.by_name = 0;
03493 #ifdef USE_BACKREF_WITH_LEVEL
03494 tok->u.backref.exist_level = 0;
03495 #endif
03496 break;
03497 }
03498
03499 skip_backref:
03500 if (c == '8' || c == '9') {
03501
03502 p = prev; PINC;
03503 break;
03504 }
03505
03506 p = prev;
03507
03508 case '0':
03509 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
03510 prev = p;
03511 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
03512 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
03513 if (p == prev) {
03514 num = 0;
03515 }
03516 tok->type = TK_RAW_BYTE;
03517 tok->base = 8;
03518 tok->u.c = num;
03519 }
03520 else if (c != '0') {
03521 PINC;
03522 }
03523 break;
03524
03525 #ifdef USE_NAMED_GROUP
03526 case 'k':
03527 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
03528 PFETCH(c);
03529 if (c == '<' || c == '\'') {
03530 UChar* name_end;
03531 int* backs;
03532 int back_num;
03533
03534 prev = p;
03535
03536 #ifdef USE_BACKREF_WITH_LEVEL
03537 name_end = NULL_UCHARP;
03538 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
03539 env, &back_num, &tok->u.backref.level);
03540 if (r == 1) tok->u.backref.exist_level = 1;
03541 else tok->u.backref.exist_level = 0;
03542 #else
03543 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
03544 #endif
03545 if (r < 0) return r;
03546
03547 if (back_num != 0) {
03548 if (back_num < 0) {
03549 back_num = BACKREF_REL_TO_ABS(back_num, env);
03550 if (back_num <= 0)
03551 return ONIGERR_INVALID_BACKREF;
03552 }
03553
03554 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03555 if (back_num > env->num_mem ||
03556 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
03557 return ONIGERR_INVALID_BACKREF;
03558 }
03559 tok->type = TK_BACKREF;
03560 tok->u.backref.by_name = 0;
03561 tok->u.backref.num = 1;
03562 tok->u.backref.ref1 = back_num;
03563 }
03564 else {
03565 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
03566 if (num <= 0) {
03567 onig_scan_env_set_error_string(env,
03568 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
03569 return ONIGERR_UNDEFINED_NAME_REFERENCE;
03570 }
03571 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
03572 int i;
03573 for (i = 0; i < num; i++) {
03574 if (backs[i] > env->num_mem ||
03575 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
03576 return ONIGERR_INVALID_BACKREF;
03577 }
03578 }
03579
03580 tok->type = TK_BACKREF;
03581 tok->u.backref.by_name = 1;
03582 if (num == 1) {
03583 tok->u.backref.num = 1;
03584 tok->u.backref.ref1 = backs[0];
03585 }
03586 else {
03587 tok->u.backref.num = num;
03588 tok->u.backref.refs = backs;
03589 }
03590 }
03591 }
03592 else {
03593 PUNFETCH;
03594 onig_syntax_warn(env, "invalid back reference");
03595 }
03596 }
03597 break;
03598 #endif
03599
03600 #ifdef USE_SUBEXP_CALL
03601 case 'g':
03602 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
03603 PFETCH(c);
03604 if (c == '<' || c == '\'') {
03605 int gnum;
03606 UChar* name_end;
03607
03608 prev = p;
03609 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
03610 if (r < 0) return r;
03611
03612 tok->type = TK_CALL;
03613 tok->u.call.name = prev;
03614 tok->u.call.name_end = name_end;
03615 tok->u.call.gnum = gnum;
03616 }
03617 else {
03618 onig_syntax_warn(env, "invalid subexp call");
03619 PUNFETCH;
03620 }
03621 }
03622 break;
03623 #endif
03624
03625 case 'Q':
03626 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
03627 tok->type = TK_QUOTE_OPEN;
03628 }
03629 break;
03630
03631 case 'p':
03632 case 'P':
03633 if (PPEEK_IS('{') &&
03634 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
03635 PINC;
03636 tok->type = TK_CHAR_PROPERTY;
03637 tok->u.prop.not = (c == 'P' ? 1 : 0);
03638
03639 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
03640 PFETCH(c);
03641 if (c == '^') {
03642 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
03643 }
03644 else
03645 PUNFETCH;
03646 }
03647 }
03648 else {
03649 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
03650 }
03651 break;
03652
03653 default:
03654 PUNFETCH;
03655 num = fetch_escaped_value(&p, end, env);
03656 if (num < 0) return num;
03657
03658 if (tok->u.c != num) {
03659 tok->type = TK_CODE_POINT;
03660 tok->u.code = (OnigCodePoint )num;
03661 }
03662 else {
03663 p = tok->backp + enclen(enc, tok->backp, end);
03664 }
03665 break;
03666 }
03667 }
03668 else {
03669 tok->u.c = c;
03670 tok->escaped = 0;
03671
03672 #ifdef USE_VARIABLE_META_CHARS
03673 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
03674 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
03675 if (c == MC_ANYCHAR(syn))
03676 goto any_char;
03677 else if (c == MC_ANYTIME(syn))
03678 goto anytime;
03679 else if (c == MC_ZERO_OR_ONE_TIME(syn))
03680 goto zero_or_one_time;
03681 else if (c == MC_ONE_OR_MORE_TIME(syn))
03682 goto one_or_more_time;
03683 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
03684 tok->type = TK_ANYCHAR_ANYTIME;
03685 goto out;
03686 }
03687 }
03688 #endif
03689
03690 switch (c) {
03691 case '.':
03692 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
03693 #ifdef USE_VARIABLE_META_CHARS
03694 any_char:
03695 #endif
03696 tok->type = TK_ANYCHAR;
03697 break;
03698
03699 case '*':
03700 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
03701 #ifdef USE_VARIABLE_META_CHARS
03702 anytime:
03703 #endif
03704 tok->type = TK_OP_REPEAT;
03705 tok->u.repeat.lower = 0;
03706 tok->u.repeat.upper = REPEAT_INFINITE;
03707 goto greedy_check;
03708 break;
03709
03710 case '+':
03711 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
03712 #ifdef USE_VARIABLE_META_CHARS
03713 one_or_more_time:
03714 #endif
03715 tok->type = TK_OP_REPEAT;
03716 tok->u.repeat.lower = 1;
03717 tok->u.repeat.upper = REPEAT_INFINITE;
03718 goto greedy_check;
03719 break;
03720
03721 case '?':
03722 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
03723 #ifdef USE_VARIABLE_META_CHARS
03724 zero_or_one_time:
03725 #endif
03726 tok->type = TK_OP_REPEAT;
03727 tok->u.repeat.lower = 0;
03728 tok->u.repeat.upper = 1;
03729 goto greedy_check;
03730 break;
03731
03732 case '{':
03733 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
03734 r = fetch_range_quantifier(&p, end, tok, env);
03735 if (r < 0) return r;
03736 if (r == 0) goto greedy_check;
03737 else if (r == 2) {
03738 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
03739 goto possessive_check;
03740
03741 goto greedy_check;
03742 }
03743
03744 break;
03745
03746 case '|':
03747 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
03748 tok->type = TK_ALT;
03749 break;
03750
03751 case '(':
03752 if (PPEEK_IS('?') &&
03753 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
03754 PINC;
03755 if (PPEEK_IS('#')) {
03756 PFETCH(c);
03757 while (1) {
03758 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
03759 PFETCH(c);
03760 if (c == MC_ESC(syn)) {
03761 if (!PEND) PFETCH(c);
03762 }
03763 else {
03764 if (c == ')') break;
03765 }
03766 }
03767 goto start;
03768 }
03769 PUNFETCH;
03770 }
03771
03772 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03773 tok->type = TK_SUBEXP_OPEN;
03774 break;
03775
03776 case ')':
03777 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
03778 tok->type = TK_SUBEXP_CLOSE;
03779 break;
03780
03781 case '^':
03782 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03783 tok->type = TK_ANCHOR;
03784 tok->u.subtype = (IS_SINGLELINE(env->option)
03785 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
03786 break;
03787
03788 case '$':
03789 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
03790 tok->type = TK_ANCHOR;
03791 tok->u.subtype = (IS_SINGLELINE(env->option)
03792 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
03793 break;
03794
03795 case '[':
03796 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
03797 tok->type = TK_CC_OPEN;
03798 break;
03799
03800 case ']':
03801 if (*src > env->pattern)
03802 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
03803 break;
03804
03805 case '#':
03806 if (IS_EXTEND(env->option)) {
03807 while (!PEND) {
03808 PFETCH(c);
03809 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
03810 break;
03811 }
03812 goto start;
03813 break;
03814 }
03815 break;
03816
03817 case ' ': case '\t': case '\n': case '\r': case '\f':
03818 if (IS_EXTEND(env->option))
03819 goto start;
03820 break;
03821
03822 default:
03823
03824 break;
03825 }
03826 }
03827
03828 #ifdef USE_VARIABLE_META_CHARS
03829 out:
03830 #endif
03831 *src = p;
03832 return tok->type;
03833 }
03834
03835 static int
03836 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
03837 ScanEnv* env,
03838 OnigCodePoint sb_out, const OnigCodePoint mbr[])
03839 {
03840 int i, r;
03841 OnigCodePoint j;
03842
03843 int n = ONIGENC_CODE_RANGE_NUM(mbr);
03844
03845 if (not == 0) {
03846 for (i = 0; i < n; i++) {
03847 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
03848 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
03849 if (j >= sb_out) {
03850 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
03851 r = add_code_range_to_buf(&(cc->mbuf), env, j,
03852 ONIGENC_CODE_RANGE_TO(mbr, i));
03853 if (r != 0) return r;
03854 i++;
03855 }
03856
03857 goto sb_end;
03858 }
03859 BITSET_SET_BIT_CHKDUP(cc->bs, j);
03860 }
03861 }
03862
03863 sb_end:
03864 for ( ; i < n; i++) {
03865 r = add_code_range_to_buf(&(cc->mbuf), env,
03866 ONIGENC_CODE_RANGE_FROM(mbr, i),
03867 ONIGENC_CODE_RANGE_TO(mbr, i));
03868 if (r != 0) return r;
03869 }
03870 }
03871 else {
03872 OnigCodePoint prev = 0;
03873
03874 for (i = 0; i < n; i++) {
03875 for (j = prev;
03876 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
03877 if (j >= sb_out) {
03878 goto sb_end2;
03879 }
03880 BITSET_SET_BIT_CHKDUP(cc->bs, j);
03881 }
03882 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
03883 }
03884 for (j = prev; j < sb_out; j++) {
03885 BITSET_SET_BIT_CHKDUP(cc->bs, j);
03886 }
03887
03888 sb_end2:
03889 prev = sb_out;
03890
03891 for (i = 0; i < n; i++) {
03892 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
03893 r = add_code_range_to_buf(&(cc->mbuf), env, prev,
03894 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
03895 if (r != 0) return r;
03896 }
03897 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
03898 }
03899 if (prev < 0x7fffffff) {
03900 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
03901 if (r != 0) return r;
03902 }
03903 }
03904
03905 return 0;
03906 }
03907
03908 static int
03909 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
03910 {
03911 int c, r;
03912 const OnigCodePoint *ranges;
03913 OnigCodePoint sb_out;
03914 OnigEncoding enc = env->enc;
03915
03916 switch (ctype) {
03917 case ONIGENC_CTYPE_D:
03918 case ONIGENC_CTYPE_S:
03919 case ONIGENC_CTYPE_W:
03920 ctype ^= ONIGENC_CTYPE_SPECIAL_MASK;
03921 if (not != 0) {
03922 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03923 if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
03924 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03925 }
03926 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
03927 }
03928 else {
03929 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03930 if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
03931 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03932 }
03933 }
03934 return 0;
03935 break;
03936 }
03937
03938 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
03939 if (r == 0) {
03940 return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
03941 }
03942 else if (r != ONIG_NO_SUPPORT_CONFIG) {
03943 return r;
03944 }
03945
03946 r = 0;
03947 switch (ctype) {
03948 case ONIGENC_CTYPE_ALPHA:
03949 case ONIGENC_CTYPE_BLANK:
03950 case ONIGENC_CTYPE_CNTRL:
03951 case ONIGENC_CTYPE_DIGIT:
03952 case ONIGENC_CTYPE_LOWER:
03953 case ONIGENC_CTYPE_PUNCT:
03954 case ONIGENC_CTYPE_SPACE:
03955 case ONIGENC_CTYPE_UPPER:
03956 case ONIGENC_CTYPE_XDIGIT:
03957 case ONIGENC_CTYPE_ASCII:
03958 case ONIGENC_CTYPE_ALNUM:
03959 if (not != 0) {
03960 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03961 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
03962 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03963 }
03964 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
03965 }
03966 else {
03967 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03968 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
03969 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03970 }
03971 }
03972 break;
03973
03974 case ONIGENC_CTYPE_GRAPH:
03975 case ONIGENC_CTYPE_PRINT:
03976 if (not != 0) {
03977 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03978 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
03979 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03980 }
03981 }
03982 else {
03983 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03984 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
03985 BITSET_SET_BIT_CHKDUP(cc->bs, c);
03986 }
03987 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
03988 }
03989 break;
03990
03991 case ONIGENC_CTYPE_WORD:
03992 if (not == 0) {
03993 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
03994 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
03995 }
03996 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
03997 }
03998 else {
03999 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
04000 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0)
04001 && ! ONIGENC_IS_CODE_WORD(enc, c))
04002 BITSET_SET_BIT_CHKDUP(cc->bs, c);
04003 }
04004 }
04005 break;
04006
04007 default:
04008 return ONIGERR_PARSER_BUG;
04009 break;
04010 }
04011
04012 return r;
04013 }
04014
04015 static int
04016 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
04017 {
04018 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
04019 #define POSIX_BRACKET_NAME_MIN_LEN 4
04020
04021 static const PosixBracketEntryType PBS[] = {
04022 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
04023 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
04024 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
04025 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
04026 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
04027 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
04028 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
04029 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
04030 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
04031 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
04032 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
04033 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
04034 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
04035 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
04036 { (UChar* )NULL, -1, 0 }
04037 };
04038
04039 const PosixBracketEntryType *pb;
04040 int not, i, r;
04041 OnigCodePoint c;
04042 OnigEncoding enc = env->enc;
04043 UChar *p = *src;
04044 PFETCH_READY;
04045
04046 if (PPEEK_IS('^')) {
04047 PINC;
04048 not = 1;
04049 }
04050 else
04051 not = 0;
04052
04053 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
04054 goto not_posix_bracket;
04055
04056 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
04057 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
04058 p = (UChar* )onigenc_step(enc, p, end, pb->len);
04059 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
04060 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04061
04062 r = add_ctype_to_cc(cc, pb->ctype, not, env);
04063 if (r != 0) return r;
04064
04065 PINC; PINC;
04066 *src = p;
04067 return 0;
04068 }
04069 }
04070
04071 not_posix_bracket:
04072 c = 0;
04073 i = 0;
04074 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
04075 PINC;
04076 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
04077 }
04078 if (c == ':' && ! PEND) {
04079 PINC;
04080 if (! PEND) {
04081 PFETCH(c);
04082 if (c == ']')
04083 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
04084 }
04085 }
04086
04087 return 1;
04088 }
04089
04090 static int
04091 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
04092 {
04093 int r;
04094 OnigCodePoint c;
04095 OnigEncoding enc = env->enc;
04096 UChar *prev, *start, *p = *src;
04097 PFETCH_READY;
04098
04099 r = 0;
04100 start = prev = p;
04101
04102 while (!PEND) {
04103 prev = p;
04104 PFETCH(c);
04105 if (c == '}') {
04106 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
04107 if (r < 0) break;
04108
04109 *src = p;
04110 return r;
04111 }
04112 else if (c == '(' || c == ')' || c == '{' || c == '|') {
04113 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
04114 break;
04115 }
04116 }
04117
04118 onig_scan_env_set_error_string(env, r, *src, prev);
04119 return r;
04120 }
04121
04122 static int
04123 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
04124 ScanEnv* env)
04125 {
04126 int r, ctype;
04127 CClassNode* cc;
04128
04129 ctype = fetch_char_property_to_ctype(src, end, env);
04130 if (ctype < 0) return ctype;
04131
04132 *np = node_new_cclass();
04133 CHECK_NULL_RETURN_MEMERR(*np);
04134 cc = NCCLASS(*np);
04135 r = add_ctype_to_cc(cc, ctype, 0, env);
04136 if (r != 0) return r;
04137 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
04138
04139 return 0;
04140 }
04141
04142
/* Parser state used while reading the items of a character class
 * (see next_state_val(), next_state_class() and parse_char_class()). */
04143 enum CCSTATE {
04144 CCS_VALUE, /* a single value is pending and not yet stored in the class */
04145 CCS_RANGE, /* a range operator was read after a value; the end value is awaited */
04146 CCS_COMPLETE, /* a range has just been completed */
04147 CCS_START /* initial state, also re-entered after a TK_CC_AND */
04148 };
04149
/* Kind of the value currently pending in a character class. */
04150 enum CCVALTYPE {
04151 CCV_SB, /* value is recorded in the class bit set */
04152 CCV_CODE_POINT, /* value is recorded in the class mbuf via add_code_range() */
04153 CCV_CLASS /* last item was a whole class/ctype (set by next_state_class()) */
04154 };
04155
04156 static int
04157 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
04158 enum CCSTATE* state, ScanEnv* env)
04159 {
04160 int r;
04161
04162 if (*state == CCS_RANGE)
04163 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
04164
04165 if (*state == CCS_VALUE && *type != CCV_CLASS) {
04166 if (*type == CCV_SB)
04167 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04168 else if (*type == CCV_CODE_POINT) {
04169 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04170 if (r < 0) return r;
04171 }
04172 }
04173
04174 *state = CCS_VALUE;
04175 *type = CCV_CLASS;
04176 return 0;
04177 }
04178
04179 static int
04180 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
04181 int* vs_israw, int v_israw,
04182 enum CCVALTYPE intype, enum CCVALTYPE* type,
04183 enum CCSTATE* state, ScanEnv* env)
04184 {
04185 int r;
04186
04187 switch (*state) {
04188 case CCS_VALUE:
04189 if (*type == CCV_SB)
04190 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
04191 else if (*type == CCV_CODE_POINT) {
04192 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
04193 if (r < 0) return r;
04194 }
04195 break;
04196
04197 case CCS_RANGE:
04198 if (intype == *type) {
04199 if (intype == CCV_SB) {
04200 if (*vs > 0xff || v > 0xff)
04201 return ONIGERR_INVALID_CODE_POINT_VALUE;
04202
04203 if (*vs > v) {
04204 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04205 goto ccs_range_end;
04206 else
04207 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04208 }
04209 bitset_set_range(env, cc->bs, (int )*vs, (int )v);
04210 }
04211 else {
04212 r = add_code_range(&(cc->mbuf), env, *vs, v);
04213 if (r < 0) return r;
04214 }
04215 }
04216 else {
04217 #if 0
04218 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
04219 #endif
04220 if (*vs > v) {
04221 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
04222 goto ccs_range_end;
04223 else
04224 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
04225 }
04226 bitset_set_range(env, cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
04227 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
04228 if (r < 0) return r;
04229 #if 0
04230 }
04231 else
04232 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
04233 #endif
04234 }
04235 ccs_range_end:
04236 *state = CCS_COMPLETE;
04237 break;
04238
04239 case CCS_COMPLETE:
04240 case CCS_START:
04241 *state = CCS_VALUE;
04242 break;
04243
04244 default:
04245 break;
04246 }
04247
04248 *vs_israw = v_israw;
04249 *vs = v;
04250 *type = intype;
04251 return 0;
04252 }
04253
04254 static int
04255 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
04256 ScanEnv* env)
04257 {
04258 int in_esc;
04259 OnigCodePoint code;
04260 OnigEncoding enc = env->enc;
04261 UChar* p = from;
04262 PFETCH_READY;
04263
04264 in_esc = 0;
04265 while (! PEND) {
04266 if (ignore_escaped && in_esc) {
04267 in_esc = 0;
04268 }
04269 else {
04270 PFETCH(code);
04271 if (code == c) return 1;
04272 if (code == MC_ESC(env->syntax)) in_esc = 1;
04273 }
04274 }
04275 return 0;
04276 }
04277
04278 static int
04279 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
04280 ScanEnv* env)
04281 {
04282 int r, neg, len, fetched, and_start;
04283 OnigCodePoint v, vs;
04284 UChar *p;
04285 Node* node;
04286 CClassNode *cc, *prev_cc;
04287 CClassNode work_cc;
04288
04289 enum CCSTATE state;
04290 enum CCVALTYPE val_type, in_type;
04291 int val_israw, in_israw;
04292
04293 prev_cc = (CClassNode* )NULL;
04294 *np = NULL_NODE;
04295 r = fetch_token_in_cc(tok, src, end, env);
04296 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
04297 neg = 1;
04298 r = fetch_token_in_cc(tok, src, end, env);
04299 }
04300 else {
04301 neg = 0;
04302 }
04303
04304 if (r < 0) return r;
04305 if (r == TK_CC_CLOSE) {
04306 if (! code_exist_check((OnigCodePoint )']',
04307 *src, env->pattern_end, 1, env))
04308 return ONIGERR_EMPTY_CHAR_CLASS;
04309
04310 CC_ESC_WARN(env, (UChar* )"]");
04311 r = tok->type = TK_CHAR;
04312 }
04313
04314 *np = node = node_new_cclass();
04315 CHECK_NULL_RETURN_MEMERR(node);
04316 cc = NCCLASS(node);
04317
04318 and_start = 0;
04319 state = CCS_START;
04320 p = *src;
04321 while (r != TK_CC_CLOSE) {
04322 fetched = 0;
04323 switch (r) {
04324 case TK_CHAR:
04325 if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
04326 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
04327 in_type = CCV_CODE_POINT;
04328 }
04329 else if (len < 0) {
04330 r = len;
04331 goto err;
04332 }
04333 else {
04334 sb_char:
04335 in_type = CCV_SB;
04336 }
04337 v = (OnigCodePoint )tok->u.c;
04338 in_israw = 0;
04339 goto val_entry2;
04340 break;
04341
04342 case TK_RAW_BYTE:
04343
04344 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
04345 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
04346 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
04347 UChar* psave = p;
04348 int i, base = tok->base;
04349
04350 buf[0] = tok->u.c;
04351 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
04352 r = fetch_token_in_cc(tok, &p, end, env);
04353 if (r < 0) goto err;
04354 if (r != TK_RAW_BYTE || tok->base != base) {
04355 fetched = 1;
04356 break;
04357 }
04358 buf[i] = tok->u.c;
04359 }
04360
04361 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
04362 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04363 goto err;
04364 }
04365
04366 len = enclen(env->enc, buf, buf+i);
04367 if (i < len) {
04368 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
04369 goto err;
04370 }
04371 else if (i > len) {
04372 p = psave;
04373 for (i = 1; i < len; i++) {
04374 r = fetch_token_in_cc(tok, &p, end, env);
04375 }
04376 fetched = 0;
04377 }
04378
04379 if (i == 1) {
04380 v = (OnigCodePoint )buf[0];
04381 goto raw_single;
04382 }
04383 else {
04384 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
04385 in_type = CCV_CODE_POINT;
04386 }
04387 }
04388 else {
04389 v = (OnigCodePoint )tok->u.c;
04390 raw_single:
04391 in_type = CCV_SB;
04392 }
04393 in_israw = 1;
04394 goto val_entry2;
04395 break;
04396
04397 case TK_CODE_POINT:
04398 v = tok->u.code;
04399 in_israw = 1;
04400 val_entry:
04401 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
04402 if (len < 0) {
04403 r = len;
04404 goto err;
04405 }
04406 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
04407 val_entry2:
04408 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
04409 &state, env);
04410 if (r != 0) goto err;
04411 break;
04412
04413 case TK_POSIX_BRACKET_OPEN:
04414 r = parse_posix_bracket(cc, &p, end, env);
04415 if (r < 0) goto err;
04416 if (r == 1) {
04417 CC_ESC_WARN(env, (UChar* )"[");
04418 p = tok->backp;
04419 v = (OnigCodePoint )tok->u.c;
04420 in_israw = 0;
04421 goto val_entry;
04422 }
04423 goto next_class;
04424 break;
04425
04426 case TK_CHAR_TYPE:
04427 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
04428 if (r != 0) return r;
04429
04430 next_class:
04431 r = next_state_class(cc, &vs, &val_type, &state, env);
04432 if (r != 0) goto err;
04433 break;
04434
04435 case TK_CHAR_PROPERTY:
04436 {
04437 int ctype;
04438
04439 ctype = fetch_char_property_to_ctype(&p, end, env);
04440 if (ctype < 0) return ctype;
04441 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
04442 if (r != 0) return r;
04443 goto next_class;
04444 }
04445 break;
04446
04447 case TK_CC_RANGE:
04448 if (state == CCS_VALUE) {
04449 r = fetch_token_in_cc(tok, &p, end, env);
04450 if (r < 0) goto err;
04451 fetched = 1;
04452 if (r == TK_CC_CLOSE) {
04453 range_end_val:
04454 v = (OnigCodePoint )'-';
04455 in_israw = 0;
04456 goto val_entry;
04457 }
04458 else if (r == TK_CC_AND) {
04459 CC_ESC_WARN(env, (UChar* )"-");
04460 goto range_end_val;
04461 }
04462 state = CCS_RANGE;
04463 }
04464 else if (state == CCS_START) {
04465
04466 v = (OnigCodePoint )tok->u.c;
04467 in_israw = 0;
04468
04469 r = fetch_token_in_cc(tok, &p, end, env);
04470 if (r < 0) goto err;
04471 fetched = 1;
04472
04473 if (r == TK_CC_RANGE || and_start != 0)
04474 CC_ESC_WARN(env, (UChar* )"-");
04475
04476 goto val_entry;
04477 }
04478 else if (state == CCS_RANGE) {
04479 CC_ESC_WARN(env, (UChar* )"-");
04480 goto sb_char;
04481 }
04482 else {
04483 r = fetch_token_in_cc(tok, &p, end, env);
04484 if (r < 0) goto err;
04485 fetched = 1;
04486 if (r == TK_CC_CLOSE) goto range_end_val;
04487 else if (r == TK_CC_AND) {
04488 CC_ESC_WARN(env, (UChar* )"-");
04489 goto range_end_val;
04490 }
04491
04492 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
04493 CC_ESC_WARN(env, (UChar* )"-");
04494 goto range_end_val;
04495 }
04496 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
04497 goto err;
04498 }
04499 break;
04500
04501 case TK_CC_CC_OPEN:
04502 {
04503 Node *anode;
04504 CClassNode* acc;
04505
04506 r = parse_char_class(&anode, tok, &p, end, env);
04507 if (r == 0) {
04508 acc = NCCLASS(anode);
04509 r = or_cclass(cc, acc, env);
04510 }
04511 onig_node_free(anode);
04512 if (r != 0) goto err;
04513 }
04514 break;
04515
04516 case TK_CC_AND:
04517 {
04518 if (state == CCS_VALUE) {
04519 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04520 &val_type, &state, env);
04521 if (r != 0) goto err;
04522 }
04523
04524 and_start = 1;
04525 state = CCS_START;
04526
04527 if (IS_NOT_NULL(prev_cc)) {
04528 r = and_cclass(prev_cc, cc, env);
04529 if (r != 0) goto err;
04530 bbuf_free(cc->mbuf);
04531 }
04532 else {
04533 prev_cc = cc;
04534 cc = &work_cc;
04535 }
04536 initialize_cclass(cc);
04537 }
04538 break;
04539
04540 case TK_EOT:
04541 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
04542 goto err;
04543 break;
04544 default:
04545 r = ONIGERR_PARSER_BUG;
04546 goto err;
04547 break;
04548 }
04549
04550 if (fetched)
04551 r = tok->type;
04552 else {
04553 r = fetch_token_in_cc(tok, &p, end, env);
04554 if (r < 0) goto err;
04555 }
04556 }
04557
04558 if (state == CCS_VALUE) {
04559 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
04560 &val_type, &state, env);
04561 if (r != 0) goto err;
04562 }
04563
04564 if (IS_NOT_NULL(prev_cc)) {
04565 r = and_cclass(prev_cc, cc, env);
04566 if (r != 0) goto err;
04567 bbuf_free(cc->mbuf);
04568 cc = prev_cc;
04569 }
04570
04571 if (neg != 0)
04572 NCCLASS_SET_NOT(cc);
04573 else
04574 NCCLASS_CLEAR_NOT(cc);
04575 if (IS_NCCLASS_NOT(cc) &&
04576 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
04577 int is_empty;
04578
04579 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
04580 if (is_empty != 0)
04581 BITSET_IS_EMPTY(cc->bs, is_empty);
04582
04583 if (is_empty == 0) {
04584 #define NEWLINE_CODE 0x0a
04585
04586 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
04587 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
04588 BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
04589 else
04590 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
04591 }
04592 }
04593 }
04594 *src = p;
04595 return 0;
04596
04597 err:
04598 if (cc != NCCLASS(*np))
04599 bbuf_free(cc->mbuf);
04600 return r;
04601 }
04602
04603 static int parse_subexp(Node** top, OnigToken* tok, int term,
04604 UChar** src, UChar* end, ScanEnv* env);
04605
04606 static int
04607 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
04608 ScanEnv* env)
04609 {
04610 int r, num;
04611 Node *target;
04612 OnigOptionType option;
04613 OnigCodePoint c;
04614 OnigEncoding enc = env->enc;
04615
04616 #ifdef USE_NAMED_GROUP
04617 int list_capture;
04618 #endif
04619
04620 UChar* p = *src;
04621 PFETCH_READY;
04622
04623 *np = NULL;
04624 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
04625
04626 option = env->option;
04627 if (PPEEK_IS('?') &&
04628 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
04629 PINC;
04630 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
04631
04632 PFETCH(c);
04633 switch (c) {
04634 case ':':
04635 group:
04636 r = fetch_token(tok, &p, end, env);
04637 if (r < 0) return r;
04638 r = parse_subexp(np, tok, term, &p, end, env);
04639 if (r < 0) return r;
04640 *src = p;
04641 return 1;
04642 break;
04643
04644 case '=':
04645 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
04646 break;
04647 case '!':
04648 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
04649 break;
04650 case '>':
04651 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
04652 break;
04653
04654 #ifdef USE_NAMED_GROUP
04655 case '\'':
04656 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04657 goto named_group1;
04658 }
04659 else
04660 return ONIGERR_UNDEFINED_GROUP_OPTION;
04661 break;
04662 #endif
04663
04664 case '<':
04665 PFETCH(c);
04666 if (c == '=')
04667 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
04668 else if (c == '!')
04669 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
04670 #ifdef USE_NAMED_GROUP
04671 else {
04672 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04673 UChar *name;
04674 UChar *name_end;
04675
04676 PUNFETCH;
04677 c = '<';
04678
04679 named_group1:
04680 list_capture = 0;
04681
04682 named_group2:
04683 name = p;
04684 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
04685 if (r < 0) return r;
04686
04687 num = scan_env_add_mem_entry(env);
04688 if (num < 0) return num;
04689 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
04690 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04691
04692 r = name_add(env->reg, name, name_end, num, env);
04693 if (r != 0) return r;
04694 *np = node_new_enclose_memory(env->option, 1);
04695 CHECK_NULL_RETURN_MEMERR(*np);
04696 NENCLOSE(*np)->regnum = num;
04697 if (list_capture != 0)
04698 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04699 env->num_named++;
04700 }
04701 else {
04702 return ONIGERR_UNDEFINED_GROUP_OPTION;
04703 }
04704 }
04705 #else
04706 else {
04707 return ONIGERR_UNDEFINED_GROUP_OPTION;
04708 }
04709 #endif
04710 break;
04711
04712 case '@':
04713 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
04714 #ifdef USE_NAMED_GROUP
04715 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
04716 PFETCH(c);
04717 if (c == '<' || c == '\'') {
04718 list_capture = 1;
04719 goto named_group2;
04720 }
04721 PUNFETCH;
04722 }
04723 #endif
04724 *np = node_new_enclose_memory(env->option, 0);
04725 CHECK_NULL_RETURN_MEMERR(*np);
04726 num = scan_env_add_mem_entry(env);
04727 if (num < 0) {
04728 onig_node_free(*np);
04729 return num;
04730 }
04731 else if (num >= (int )BIT_STATUS_BITS_NUM) {
04732 onig_node_free(*np);
04733 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
04734 }
04735 NENCLOSE(*np)->regnum = num;
04736 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
04737 }
04738 else {
04739 return ONIGERR_UNDEFINED_GROUP_OPTION;
04740 }
04741 break;
04742
04743 #ifdef USE_POSIXLINE_OPTION
04744 case 'p':
04745 #endif
04746 case '-': case 'i': case 'm': case 's': case 'x':
04747 {
04748 int neg = 0;
04749
04750 while (1) {
04751 switch (c) {
04752 case ':':
04753 case ')':
04754 break;
04755
04756 case '-': neg = 1; break;
04757 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
04758 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
04759 case 's':
04760 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
04761 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
04762 }
04763 else
04764 return ONIGERR_UNDEFINED_GROUP_OPTION;
04765 break;
04766
04767 case 'm':
04768 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
04769 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
04770 }
04771 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
04772 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
04773 }
04774 else
04775 return ONIGERR_UNDEFINED_GROUP_OPTION;
04776 break;
04777 #ifdef USE_POSIXLINE_OPTION
04778 case 'p':
04779 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
04780 break;
04781 #endif
04782 default:
04783 return ONIGERR_UNDEFINED_GROUP_OPTION;
04784 }
04785
04786 if (c == ')') {
04787 *np = node_new_option(option);
04788 CHECK_NULL_RETURN_MEMERR(*np);
04789 *src = p;
04790 return 2;
04791 }
04792 else if (c == ':') {
04793 OnigOptionType prev = env->option;
04794
04795 env->option = option;
04796 r = fetch_token(tok, &p, end, env);
04797 if (r < 0) return r;
04798 r = parse_subexp(&target, tok, term, &p, end, env);
04799 env->option = prev;
04800 if (r < 0) return r;
04801 *np = node_new_option(option);
04802 CHECK_NULL_RETURN_MEMERR(*np);
04803 NENCLOSE(*np)->target = target;
04804 *src = p;
04805 return 0;
04806 }
04807
04808 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
04809 PFETCH(c);
04810 }
04811 }
04812 break;
04813
04814 default:
04815 return ONIGERR_UNDEFINED_GROUP_OPTION;
04816 }
04817 }
04818 else {
04819 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
04820 goto group;
04821
04822 *np = node_new_enclose_memory(env->option, 0);
04823 CHECK_NULL_RETURN_MEMERR(*np);
04824 num = scan_env_add_mem_entry(env);
04825 if (num < 0) return num;
04826 NENCLOSE(*np)->regnum = num;
04827 }
04828
04829 CHECK_NULL_RETURN_MEMERR(*np);
04830 r = fetch_token(tok, &p, end, env);
04831 if (r < 0) return r;
04832 r = parse_subexp(&target, tok, term, &p, end, env);
04833 if (r < 0) {
04834 onig_node_free(target);
04835 return r;
04836 }
04837
04838 if (NTYPE(*np) == NT_ANCHOR)
04839 NANCHOR(*np)->target = target;
04840 else {
04841 NENCLOSE(*np)->target = target;
04842 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
04843
04844 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
04845 if (r != 0) return r;
04846 }
04847 }
04848
04849 *src = p;
04850 return 0;
04851 }
04852
/* Quantifier spellings used in the nested-repeat warning of
 * set_quantifier(); indexed by the value of popular_quantifier_num(). */
04853 static const char* const PopularQStr[] = {
04854 "?", "*", "+", "??", "*?", "+?"
04855 };
04856
/* Replacement spellings used in the same warning; indexed by the
 * ReduceTypeTable entry selected for the two nested quantifiers. */
04857 static const char* const ReduceQStr[] = {
04858 "", "", "*", "*?", "??", "+ and ??", "+? and ?"
04859 };
04860
04861 static int
04862 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
04863 {
04864 QtfrNode* qn;
04865
04866 qn = NQTFR(qnode);
04867 if (qn->lower == 1 && qn->upper == 1) {
04868 return 1;
04869 }
04870
04871 switch (NTYPE(target)) {
04872 case NT_STR:
04873 if (! group) {
04874 StrNode* sn = NSTR(target);
04875 if (str_node_can_be_split(sn, env->enc)) {
04876 Node* n = str_node_split_last_char(sn, env->enc);
04877 if (IS_NOT_NULL(n)) {
04878 qn->target = n;
04879 return 2;
04880 }
04881 }
04882 }
04883 break;
04884
04885 case NT_QTFR:
04886 {
04887
04888 QtfrNode* qnt = NQTFR(target);
04889 int nestq_num = popular_quantifier_num(qn);
04890 int targetq_num = popular_quantifier_num(qnt);
04891
04892 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
04893 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
04894 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
04895 UChar buf[WARN_BUFSIZE];
04896
04897 switch(ReduceTypeTable[targetq_num][nestq_num]) {
04898 case RQ_ASIS:
04899 break;
04900
04901 case RQ_DEL:
04902 if (onig_verb_warn != onig_null_warn) {
04903 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
04904 env->pattern, env->pattern_end,
04905 (UChar* )"redundant nested repeat operator");
04906 (*onig_verb_warn)((char* )buf);
04907 }
04908 goto warn_exit;
04909 break;
04910
04911 default:
04912 if (onig_verb_warn != onig_null_warn) {
04913 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
04914 env->pattern, env->pattern_end,
04915 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
04916 PopularQStr[targetq_num], PopularQStr[nestq_num],
04917 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
04918 (*onig_verb_warn)((char* )buf);
04919 }
04920 goto warn_exit;
04921 break;
04922 }
04923 }
04924
04925 warn_exit:
04926 #endif
04927 if (targetq_num >= 0) {
04928 if (nestq_num >= 0) {
04929 onig_reduce_nested_quantifier(qnode, target);
04930 goto q_exit;
04931 }
04932 else if (targetq_num == 1 || targetq_num == 2) {
04933
04934 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
04935 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
04936 }
04937 }
04938 }
04939 }
04940 break;
04941
04942 default:
04943 break;
04944 }
04945
04946 qn->target = target;
04947 q_exit:
04948 return 0;
04949 }
04950
04951
04952 #ifdef USE_SHARED_CCLASS_TABLE
04953
04954 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
04955
04956
04957
/* Key of the shared character class table: compared field by field in
 * type_cclass_cmp() and hashed in type_cclass_hash(). */
04958 typedef struct {
04959 OnigEncoding enc; /* encoding part of the key */
04960 int not; /* negation flag part of the key */
04961 int type; /* type part of the key */
04962 } type_cclass_key;
04963
04964 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
04965 {
04966 if (x->type != y->type) return 1;
04967 if (x->enc != y->enc) return 1;
04968 if (x->not != y->not) return 1;
04969 return 0;
04970 }
04971
04972 static st_index_t type_cclass_hash(type_cclass_key* key)
04973 {
04974 int i, val;
04975 UChar *p;
04976
04977 val = 0;
04978
04979 p = (UChar* )&(key->enc);
04980 for (i = 0; i < (int )sizeof(key->enc); i++) {
04981 val = val * 997 + (int )*p++;
04982 }
04983
04984 p = (UChar* )(&key->type);
04985 for (i = 0; i < (int )sizeof(key->type); i++) {
04986 val = val * 997 + (int )*p++;
04987 }
04988
04989 val += key->not;
04990 return val + (val >> 5);
04991 }
04992
/* Comparison and hash callbacks for tables keyed by type_cclass_key. */
04993 static const struct st_hash_type type_type_cclass_hash = {
04994 type_cclass_cmp,
04995 type_cclass_hash,
04996 };
04997
/* The shared class table; emptied, freed and reset to NULL by
 * onig_free_shared_cclass_table(). */
04998 static st_table* OnigTypeCClassTable;
04999
05000
05001 static int
05002 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
05003 {
05004 if (IS_NOT_NULL(node)) {
05005 CClassNode* cc = NCCLASS(node);
05006 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
05007 xfree(node);
05008 }
05009
05010 if (IS_NOT_NULL(key)) xfree(key);
05011 return ST_DELETE;
05012 }
05013
05014 extern int
05015 onig_free_shared_cclass_table(void)
05016 {
05017 THREAD_ATOMIC_START;
05018 if (IS_NOT_NULL(OnigTypeCClassTable)) {
05019 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
05020 onig_st_free_table(OnigTypeCClassTable);
05021 OnigTypeCClassTable = NULL;
05022 }
05023 THREAD_ATOMIC_END;
05024
05025 return 0;
05026 }
05027
05028 #endif
05029
05030
05031 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05032 static int
05033 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
05034 {
05035 BBuf *tbuf;
05036 int r;
05037
05038 if (IS_NCCLASS_NOT(cc)) {
05039 bitset_invert(cc->bs);
05040
05041 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
05042 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
05043 if (r != 0) return r;
05044
05045 bbuf_free(cc->mbuf);
05046 cc->mbuf = tbuf;
05047 }
05048
05049 NCCLASS_CLEAR_NOT(cc);
05050 }
05051
05052 return 0;
05053 }
05054 #endif
05055
/* Argument block handed to i_apply_case_fold() through its void* parameter. */
05056 typedef struct {
05057 ScanEnv* env; /* scan environment (supplies the encoding) */
05058 CClassNode* cc; /* class that is tested and extended */
05059 Node* alt_root; /* not touched by i_apply_case_fold() itself */
05060 Node** ptail; /* slot that receives the next alternative node */
05061 } IApplyCaseFoldArg;
05062
05063 static int
05064 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
05065 int to_len, void* arg)
05066 {
05067 IApplyCaseFoldArg* iarg;
05068 ScanEnv* env;
05069 CClassNode* cc;
05070 BitSetRef bs;
05071
05072 iarg = (IApplyCaseFoldArg* )arg;
05073 env = iarg->env;
05074 cc = iarg->cc;
05075 bs = cc->bs;
05076
05077 if (to_len == 1) {
05078 int is_in = onig_is_code_in_cc(env->enc, from, cc);
05079 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05080 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
05081 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
05082 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05083 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05084 }
05085 else {
05086 BITSET_SET_BIT(bs, *to);
05087 }
05088 }
05089 #else
05090 if (is_in != 0) {
05091 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
05092 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
05093 add_code_range0(&(cc->mbuf), env, *to, *to, 0);
05094 }
05095 else {
05096 if (IS_NCCLASS_NOT(cc)) {
05097 BITSET_CLEAR_BIT(bs, *to);
05098 }
05099 else
05100 BITSET_SET_BIT(bs, *to);
05101 }
05102 }
05103 #endif
05104 }
05105 else {
05106 int r, i, len;
05107 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
05108 Node *snode = NULL_NODE;
05109
05110 if (onig_is_code_in_cc(env->enc, from, cc)
05111 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
05112 && !IS_NCCLASS_NOT(cc)
05113 #endif
05114 ) {
05115 for (i = 0; i < to_len; i++) {
05116 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
05117 if (i == 0) {
05118 snode = onig_node_new_str(buf, buf + len);
05119 CHECK_NULL_RETURN_MEMERR(snode);
05120
05121
05122
05123 NSTRING_SET_AMBIG(snode);
05124 }
05125 else {
05126 r = onig_node_str_cat(snode, buf, buf + len);
05127 if (r < 0) {
05128 onig_node_free(snode);
05129 return r;
05130 }
05131 }
05132 }
05133
05134 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
05135 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
05136 iarg->ptail = &(NCDR((*(iarg->ptail))));
05137 }
05138 }
05139
05140 return 0;
05141 }
05142
05143 static int
05144 parse_exp(Node** np, OnigToken* tok, int term,
05145 UChar** src, UChar* end, ScanEnv* env)
05146 {
05147 int r, len, group = 0;
05148 Node* qn;
05149 Node** targetp;
05150
05151 *np = NULL;
05152 if (tok->type == (enum TokenSyms )term)
05153 goto end_of_token;
05154
05155 switch (tok->type) {
05156 case TK_ALT:
05157 case TK_EOT:
05158 end_of_token:
05159 *np = node_new_empty();
05160 return tok->type;
05161
05162 case TK_SUBEXP_OPEN:
05163 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
05164 if (r < 0) return r;
05165 if (r == 1) group = 1;
05166 else if (r == 2) {
05167 Node* target;
05168 OnigOptionType prev = env->option;
05169
05170 env->option = NENCLOSE(*np)->option;
05171 r = fetch_token(tok, src, end, env);
05172 if (r < 0) return r;
05173 r = parse_subexp(&target, tok, term, src, end, env);
05174 env->option = prev;
05175 if (r < 0) {
05176 onig_node_free(target);
05177 return r;
05178 }
05179 NENCLOSE(*np)->target = target;
05180 return tok->type;
05181 }
05182 break;
05183
05184 case TK_SUBEXP_CLOSE:
05185 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
05186 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
05187
05188 if (tok->escaped) goto tk_raw_byte;
05189 else goto tk_byte;
05190 break;
05191
05192 case TK_STRING:
05193 tk_byte:
05194 {
05195 *np = node_new_str(tok->backp, *src);
05196 CHECK_NULL_RETURN_MEMERR(*np);
05197
05198 while (1) {
05199 r = fetch_token(tok, src, end, env);
05200 if (r < 0) return r;
05201 if (r != TK_STRING) break;
05202
05203 r = onig_node_str_cat(*np, tok->backp, *src);
05204 if (r < 0) return r;
05205 }
05206
05207 string_end:
05208 targetp = np;
05209 goto repeat;
05210 }
05211 break;
05212
05213 case TK_RAW_BYTE:
05214 tk_raw_byte:
05215 {
05216 *np = node_new_str_raw_char((UChar )tok->u.c);
05217 CHECK_NULL_RETURN_MEMERR(*np);
05218 len = 1;
05219 while (1) {
05220 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
05221 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
05222 r = fetch_token(tok, src, end, env);
05223 NSTRING_CLEAR_RAW(*np);
05224 goto string_end;
05225 }
05226 }
05227
05228 r = fetch_token(tok, src, end, env);
05229 if (r < 0) return r;
05230 if (r != TK_RAW_BYTE) {
05231
05232 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
05233 int rem;
05234 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
05235 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
05236 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
05237 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
05238 NSTRING_CLEAR_RAW(*np);
05239 goto string_end;
05240 }
05241 }
05242 #endif
05243 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
05244 }
05245
05246 r = node_str_cat_char(*np, (UChar )tok->u.c);
05247 if (r < 0) return r;
05248
05249 len++;
05250 }
05251 }
05252 break;
05253
05254 case TK_CODE_POINT:
05255 {
05256 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
05257 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
05258 if (num < 0) return num;
05259 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
05260 *np = node_new_str_raw(buf, buf + num);
05261 #else
05262 *np = node_new_str(buf, buf + num);
05263 #endif
05264 CHECK_NULL_RETURN_MEMERR(*np);
05265 }
05266 break;
05267
05268 case TK_QUOTE_OPEN:
05269 {
05270 OnigCodePoint end_op[2];
05271 UChar *qstart, *qend, *nextp;
05272
05273 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
05274 end_op[1] = (OnigCodePoint )'E';
05275 qstart = *src;
05276 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
05277 if (IS_NULL(qend)) {
05278 nextp = qend = end;
05279 }
05280 *np = node_new_str(qstart, qend);
05281 CHECK_NULL_RETURN_MEMERR(*np);
05282 *src = nextp;
05283 }
05284 break;
05285
05286 case TK_CHAR_TYPE:
05287 {
05288 switch (tok->u.prop.ctype) {
05289 case ONIGENC_CTYPE_D:
05290 case ONIGENC_CTYPE_S:
05291 case ONIGENC_CTYPE_W:
05292 {
05293 CClassNode* cc;
05294 *np = node_new_cclass();
05295 CHECK_NULL_RETURN_MEMERR(*np);
05296 cc = NCCLASS(*np);
05297 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
05298 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
05299 }
05300 break;
05301
05302 case ONIGENC_CTYPE_WORD:
05303 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
05304 CHECK_NULL_RETURN_MEMERR(*np);
05305 break;
05306
05307 case ONIGENC_CTYPE_SPACE:
05308 case ONIGENC_CTYPE_DIGIT:
05309 case ONIGENC_CTYPE_XDIGIT:
05310 {
05311 CClassNode* cc;
05312
05313 #ifdef USE_SHARED_CCLASS_TABLE
05314 const OnigCodePoint *mbr;
05315 OnigCodePoint sb_out;
05316
05317 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
05318 &sb_out, &mbr);
05319 if (r == 0 &&
05320 ONIGENC_CODE_RANGE_NUM(mbr)
05321 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
05322 type_cclass_key key;
05323 type_cclass_key* new_key;
05324
05325 key.enc = env->enc;
05326 key.not = tok->u.prop.not;
05327 key.type = tok->u.prop.ctype;
05328
05329 THREAD_ATOMIC_START;
05330
05331 if (IS_NULL(OnigTypeCClassTable)) {
05332 OnigTypeCClassTable
05333 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
05334 if (IS_NULL(OnigTypeCClassTable)) {
05335 THREAD_ATOMIC_END;
05336 return ONIGERR_MEMORY;
05337 }
05338 }
05339 else {
05340 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
05341 (st_data_t* )np)) {
05342 THREAD_ATOMIC_END;
05343 break;
05344 }
05345 }
05346
05347 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
05348 sb_out, mbr);
05349 if (IS_NULL(*np)) {
05350 THREAD_ATOMIC_END;
05351 return ONIGERR_MEMORY;
05352 }
05353
05354 cc = NCCLASS(*np);
05355 NCCLASS_SET_SHARE(cc);
05356 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
05357 xmemcpy(new_key, &key, sizeof(type_cclass_key));
05358 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
05359 (st_data_t )*np);
05360
05361 THREAD_ATOMIC_END;
05362 }
05363 else {
05364 #endif
05365 *np = node_new_cclass();
05366 CHECK_NULL_RETURN_MEMERR(*np);
05367 cc = NCCLASS(*np);
05368 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
05369 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
05370 #ifdef USE_SHARED_CCLASS_TABLE
05371 }
05372 #endif
05373 }
05374 break;
05375
05376 default:
05377 return ONIGERR_PARSER_BUG;
05378 break;
05379 }
05380 }
05381 break;
05382
05383 case TK_CHAR_PROPERTY:
05384 r = parse_char_property(np, tok, src, end, env);
05385 if (r != 0) return r;
05386 break;
05387
05388 case TK_CC_OPEN:
05389 {
05390 CClassNode* cc;
05391
05392 r = parse_char_class(np, tok, src, end, env);
05393 if (r != 0) return r;
05394
05395 cc = NCCLASS(*np);
05396 if (IS_IGNORECASE(env->option)) {
05397 IApplyCaseFoldArg iarg;
05398
05399 iarg.env = env;
05400 iarg.cc = cc;
05401 iarg.alt_root = NULL_NODE;
05402 iarg.ptail = &(iarg.alt_root);
05403
05404 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
05405 i_apply_case_fold, &iarg);
05406 if (r != 0) {
05407 onig_node_free(iarg.alt_root);
05408 return r;
05409 }
05410 if (IS_NOT_NULL(iarg.alt_root)) {
05411 Node* work = onig_node_new_alt(*np, iarg.alt_root);
05412 if (IS_NULL(work)) {
05413 onig_node_free(iarg.alt_root);
05414 return ONIGERR_MEMORY;
05415 }
05416 *np = work;
05417 }
05418 }
05419 }
05420 break;
05421
05422 case TK_ANYCHAR:
05423 *np = node_new_anychar();
05424 CHECK_NULL_RETURN_MEMERR(*np);
05425 break;
05426
05427 case TK_ANYCHAR_ANYTIME:
05428 *np = node_new_anychar();
05429 CHECK_NULL_RETURN_MEMERR(*np);
05430 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
05431 CHECK_NULL_RETURN_MEMERR(qn);
05432 NQTFR(qn)->target = *np;
05433 *np = qn;
05434 break;
05435
05436 case TK_BACKREF:
05437 len = tok->u.backref.num;
05438 *np = node_new_backref(len,
05439 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
05440 tok->u.backref.by_name,
05441 #ifdef USE_BACKREF_WITH_LEVEL
05442 tok->u.backref.exist_level,
05443 tok->u.backref.level,
05444 #endif
05445 env);
05446 CHECK_NULL_RETURN_MEMERR(*np);
05447 break;
05448
05449 #ifdef USE_SUBEXP_CALL
05450 case TK_CALL:
05451 {
05452 int gnum = tok->u.call.gnum;
05453
05454 if (gnum < 0) {
05455 gnum = BACKREF_REL_TO_ABS(gnum, env);
05456 if (gnum <= 0)
05457 return ONIGERR_INVALID_BACKREF;
05458 }
05459 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
05460 CHECK_NULL_RETURN_MEMERR(*np);
05461 env->num_call++;
05462 }
05463 break;
05464 #endif
05465
05466 case TK_ANCHOR:
05467 *np = onig_node_new_anchor(tok->u.anchor);
05468 break;
05469
05470 case TK_OP_REPEAT:
05471 case TK_INTERVAL:
05472 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
05473 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
05474 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
05475 else
05476 *np = node_new_empty();
05477 }
05478 else {
05479 goto tk_byte;
05480 }
05481 break;
05482
05483 default:
05484 return ONIGERR_PARSER_BUG;
05485 break;
05486 }
05487
05488 {
05489 targetp = np;
05490
05491 re_entry:
05492 r = fetch_token(tok, src, end, env);
05493 if (r < 0) return r;
05494
05495 repeat:
05496 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
05497 if (is_invalid_quantifier_target(*targetp))
05498 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
05499
05500 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
05501 (r == TK_INTERVAL ? 1 : 0));
05502 CHECK_NULL_RETURN_MEMERR(qn);
05503 NQTFR(qn)->greedy = tok->u.repeat.greedy;
05504 r = set_quantifier(qn, *targetp, group, env);
05505 if (r < 0) {
05506 onig_node_free(qn);
05507 return r;
05508 }
05509
05510 if (tok->u.repeat.possessive != 0) {
05511 Node* en;
05512 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
05513 if (IS_NULL(en)) {
05514 onig_node_free(qn);
05515 return ONIGERR_MEMORY;
05516 }
05517 NENCLOSE(en)->target = qn;
05518 qn = en;
05519 }
05520
05521 if (r == 0) {
05522 *targetp = qn;
05523 }
05524 else if (r == 1) {
05525 onig_node_free(qn);
05526 }
05527 else if (r == 2) {
05528 Node *tmp;
05529
05530 *targetp = node_new_list(*targetp, NULL);
05531 if (IS_NULL(*targetp)) {
05532 onig_node_free(qn);
05533 return ONIGERR_MEMORY;
05534 }
05535 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
05536 if (IS_NULL(tmp)) {
05537 onig_node_free(qn);
05538 return ONIGERR_MEMORY;
05539 }
05540 targetp = &(NCAR(tmp));
05541 }
05542 goto re_entry;
05543 }
05544 }
05545
05546 return r;
05547 }
05548
05549 static int
05550 parse_branch(Node** top, OnigToken* tok, int term,
05551 UChar** src, UChar* end, ScanEnv* env)
05552 {
05553 int r;
05554 Node *node, **headp;
05555
05556 *top = NULL;
05557 r = parse_exp(&node, tok, term, src, end, env);
05558 if (r < 0) {
05559 onig_node_free(node);
05560 return r;
05561 }
05562
05563 if (r == TK_EOT || r == term || r == TK_ALT) {
05564 *top = node;
05565 }
05566 else {
05567 *top = node_new_list(node, NULL);
05568 headp = &(NCDR(*top));
05569 while (r != TK_EOT && r != term && r != TK_ALT) {
05570 r = parse_exp(&node, tok, term, src, end, env);
05571 if (r < 0) {
05572 onig_node_free(node);
05573 return r;
05574 }
05575
05576 if (NTYPE(node) == NT_LIST) {
05577 *headp = node;
05578 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
05579 headp = &(NCDR(node));
05580 }
05581 else {
05582 *headp = node_new_list(node, NULL);
05583 headp = &(NCDR(*headp));
05584 }
05585 }
05586 }
05587
05588 return r;
05589 }
05590
05591
05592 static int
05593 parse_subexp(Node** top, OnigToken* tok, int term,
05594 UChar** src, UChar* end, ScanEnv* env)
05595 {
05596 int r;
05597 Node *node, **headp;
05598
05599 *top = NULL;
05600 r = parse_branch(&node, tok, term, src, end, env);
05601 if (r < 0) {
05602 onig_node_free(node);
05603 return r;
05604 }
05605
05606 if (r == term) {
05607 *top = node;
05608 }
05609 else if (r == TK_ALT) {
05610 *top = onig_node_new_alt(node, NULL);
05611 headp = &(NCDR(*top));
05612 while (r == TK_ALT) {
05613 r = fetch_token(tok, src, end, env);
05614 if (r < 0) return r;
05615 r = parse_branch(&node, tok, term, src, end, env);
05616 if (r < 0) {
05617 onig_node_free(node);
05618 return r;
05619 }
05620
05621 *headp = onig_node_new_alt(node, NULL);
05622 headp = &(NCDR(*headp));
05623 }
05624
05625 if (tok->type != (enum TokenSyms )term)
05626 goto err;
05627 }
05628 else {
05629 onig_node_free(node);
05630 err:
05631 if (term == TK_SUBEXP_CLOSE)
05632 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
05633 else
05634 return ONIGERR_PARSER_BUG;
05635 }
05636
05637 return r;
05638 }
05639
05640 static int
05641 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
05642 {
05643 int r;
05644 OnigToken tok;
05645
05646 r = fetch_token(&tok, src, end, env);
05647 if (r < 0) return r;
05648 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
05649 if (r < 0) return r;
05650 return 0;
05651 }
05652
05653 extern int
05654 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
05655 regex_t* reg, ScanEnv* env)
05656 {
05657 int r;
05658 UChar* p;
05659
05660 #ifdef USE_NAMED_GROUP
05661 names_clear(reg);
05662 #endif
05663
05664 scan_env_clear(env);
05665 env->option = reg->options;
05666 env->case_fold_flag = reg->case_fold_flag;
05667 env->enc = reg->enc;
05668 env->syntax = reg->syntax;
05669 env->pattern = (UChar* )pattern;
05670 env->pattern_end = (UChar* )end;
05671 env->reg = reg;
05672
05673 *root = NULL;
05674 p = (UChar* )pattern;
05675 r = parse_regexp(root, &p, (UChar* )end, env);
05676 reg->num_mem = env->num_mem;
05677 return r;
05678 }
05679
05680 extern void
05681 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
05682 UChar* arg, UChar* arg_end)
05683 {
05684 env->error = arg;
05685 env->error_end = arg_end;
05686 }
05687