00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "internal.h"
00017 #include "regint.h"
00018 #include <ctype.h>
00019
/* Exception class used by the rb_raise()/rb_exc_new3() calls in this file
 * that report regexp errors. */
00020 VALUE rb_eRegexpError;
00021
/* Fixed-size buffer type for regex-engine error messages; errcpy() copies a
 * message into such a buffer, bounded by ONIG_MAX_ERROR_MESSAGE_LEN. */
00022 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
00023 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00024
/* Byte offset where capture group `no` begins/ends. Both macros expand to an
 * access through a variable named `regs`, which must be in scope at the use
 * site. */
00025 #define BEG(no) (regs->beg[(no)])
00026 #define END(no) (regs->end[(no)])
00027
/*
 * Byte-to-byte folding table used by rb_memcicmp(): every entry maps to
 * itself except indices 0101..0132 ('A'..'Z'), which map to 0141..0172
 * ('a'..'z'). Only compiled when the character set is ASCII ('a' == 97);
 * any other character set is a hard build error.
 */
00028 #if 'a' == 97
00029 static const char casetable[] = {
00030 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
00031 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
00032 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
00033 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
00034
00035 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
00036
00037 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
00038
00039 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
00040
00041 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
00042
/* '@' followed by the upper-case letters, folded to lower case. */
00043 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00044
00045 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00046
00047 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00048
00049 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
00050
/* '`' followed by the lower-case letters, unchanged. */
00051 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00052
00053 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00054
00055 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00056
00057 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
/* Bytes 0200..0377 map to themselves. */
00058 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
00059 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
00060 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
00061 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
00062 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
00063 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
00064 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
00065 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
00066 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
00067 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
00068 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
00069 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
00070 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
00071 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
00072 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
00073 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
00074 };
00075 #else
00076 # error >>> "You lose. You will need a translation table for your character set." <<<
00077 #endif
00078
00079 int
00080 rb_memcicmp(const void *x, const void *y, long len)
00081 {
00082 const unsigned char *p1 = x, *p2 = y;
00083 int tmp;
00084
00085 while (len--) {
00086 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00087 return tmp;
00088 }
00089 return 0;
00090 }
00091
#undef rb_memcmp

/*
 * Function form of rb_memcmp (any macro of the same name is removed just
 * above): compares `len` bytes of `p1` and `p2` with memcmp() and returns its
 * result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00099
00100 static inline long
00101 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00102 {
00103 const unsigned char *x = xs, *xe = xs + m;
00104 const unsigned char *y = ys, *ye = ys + n;
00105 #ifndef VALUE_MAX
00106 # if SIZEOF_VALUE == 8
00107 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00108 # elif SIZEOF_VALUE == 4
00109 # define VALUE_MAX 0xFFFFFFFFUL
00110 # endif
00111 #endif
00112 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00113
00114 if (m > SIZEOF_VALUE)
00115 rb_bug("!!too long pattern string!!");
00116
00117
00118 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00119 hx <<= CHAR_BIT;
00120 hy <<= CHAR_BIT;
00121 hx |= *x;
00122 hy |= *y;
00123 }
00124
00125 while (hx != hy) {
00126 if (y == ye)
00127 return -1;
00128 hy <<= CHAR_BIT;
00129 hy |= *y;
00130 hy &= mask;
00131 y++;
00132 }
00133 return y - ys - m;
00134 }
00135
/*
 * Find the first occurrence of the m-byte pattern `xs` in the n-byte buffer
 * `ys`, using a 256-entry shift table indexed by the byte that immediately
 * follows the current window.
 *
 * Returns the byte offset of the match, or -1 when there is none. An empty
 * pattern yields 0.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long i, qstable[256];

    if (m < 1)
        return 0;       /* the search loop reads xs[0] and y[0] */

    /* Preprocessing: default shift is m + 1; a byte occurring in the pattern
     * shifts by its distance from the pattern end (last occurrence wins). */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    for (; x < xe; ++x)
        qstable[*x] = xe - x;

    /* Searching */
    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        /* The window already ends at the end of the buffer, so y[m] would
         * lie outside it; there is no further position to try. */
        if (ye - y == m)
            break;
        y += qstable[y[m]];
    }
    return -1;
}
00155
/*
 * Map the byte sequence starting at `x` to an index for the 512-entry shift
 * table of rb_memsearch_qs_utf8().
 *
 * A first byte below 0xC0 or at/above 0xF5 yields `byte + 256`. Otherwise the
 * first byte selects how many following bytes (1, 2 or 3) are folded in with
 * a multiplicative mix, and the low 8 bits of the result are returned.
 * The function reads up to x[3] without any bounds check.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = *x;
    int trail;  /* number of bytes after the first that take part in the hash */
    int k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    trail = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (k = 1; k <= trail; ++k)
        h = h * mix + x[k];

    return (unsigned char)h;
}
00187
/*
 * Variant of rb_memsearch_qs() whose shift table is indexed by
 * rb_memsearch_qs_utf8_hash() of the bytes that follow the current window,
 * rather than by a single byte.
 *
 * Returns the byte offset of the first match of the m-byte pattern `xs` in
 * the n-byte buffer `ys`, or -1 when there is none. An empty pattern yields 0.
 *
 * NOTE(review): the table-building loop still hashes every pattern position,
 * so a pattern ending in a truncated multi-byte sequence would be read past
 * its end; presumably callers only pass well-formed UTF-8 -- confirm.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys, *ye = ys + n;
    long i, qstable[512];

    if (m < 1)
        return 0;       /* the search loop reads xs[0] and y[0] */

    /* Preprocessing */
    for (i = 0; i < 512; ++i) {
        qstable[i] = m + 1;
    }
    for (; x < xe; ++x) {
        qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
    }

    /* Searching */
    while (ye - y >= m) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        /* The hash may read up to four bytes starting at y + m. When fewer
         * than four remain inside the buffer, advance by a single byte
         * instead: a shift of one can never step over a match and keeps
         * every read in bounds. */
        if (ye - y - m < 4) {
            ++y;
            continue;
        }
        y += qstable[rb_memsearch_qs_utf8_hash(y + m)];
    }
    return -1;
}
00209
00210 long
00211 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00212 {
00213 const unsigned char *x = x0, *y = y0;
00214
00215 if (m > n) return -1;
00216 else if (m == n) {
00217 return memcmp(x0, y0, m) == 0 ? 0 : -1;
00218 }
00219 else if (m < 1) {
00220 return 0;
00221 }
00222 else if (m == 1) {
00223 const unsigned char *ys = y, *ye = ys + n;
00224 for (; y < ye; ++y) {
00225 if (*x == *y)
00226 return y - ys;
00227 }
00228 return -1;
00229 }
00230 else if (m <= SIZEOF_VALUE) {
00231 return rb_memsearch_ss(x0, m, y0, n);
00232 }
00233 else if (enc == rb_utf8_encoding()){
00234 return rb_memsearch_qs_utf8(x0, m, y0, n);
00235 }
00236 else {
00237 return rb_memsearch_qs(x0, m, y0, n);
00238 }
00239 }
00240
/* Per-object flag bits. REG_ENCODING_NONE is tested on RBASIC(re)->flags and
 * KCODE_FIXED via FL_TEST(re, ...) elsewhere in this file.
 * NOTE(review): REG_LITERAL is not referenced in the visible part of the
 * file; presumably it marks regexps created from literals -- confirm. */
00241 #define REG_LITERAL FL_USER5
00242 #define REG_ENCODING_NONE FL_USER6
00243
00244 #define KCODE_FIXED FL_USER4
00245
/* Option bits: the three engine options grouped as one mask, plus the two
 * encoding markers that rb_char_to_option_kcode() stores in *option. */
00246 #define ARG_REG_OPTION_MASK \
00247 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
00248 #define ARG_ENCODING_FIXED 16
00249 #define ARG_ENCODING_NONE 32
00250
00251 static int
00252 char_to_option(int c)
00253 {
00254 int val;
00255
00256 switch (c) {
00257 case 'i':
00258 val = ONIG_OPTION_IGNORECASE;
00259 break;
00260 case 'x':
00261 val = ONIG_OPTION_EXTEND;
00262 break;
00263 case 'm':
00264 val = ONIG_OPTION_MULTILINE;
00265 break;
00266 default:
00267 val = 0;
00268 break;
00269 }
00270 return val;
00271 }
00272
00273 static char *
00274 option_to_str(char str[4], int options)
00275 {
00276 char *p = str;
00277 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00278 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00279 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00280 *p = 0;
00281 return str;
00282 }
00283
00284 extern int
00285 rb_char_to_option_kcode(int c, int *option, int *kcode)
00286 {
00287 *option = 0;
00288
00289 switch (c) {
00290 case 'n':
00291 *kcode = rb_ascii8bit_encindex();
00292 return (*option = ARG_ENCODING_NONE);
00293 case 'e':
00294 *kcode = rb_enc_find_index("EUC-JP");
00295 break;
00296 case 's':
00297 *kcode = rb_enc_find_index("Windows-31J");
00298 break;
00299 case 'u':
00300 *kcode = rb_utf8_encindex();
00301 break;
00302 default:
00303 *kcode = -1;
00304 return (*option = char_to_option(c));
00305 }
00306 *option = ARG_ENCODING_FIXED;
00307 return 1;
00308 }
00309
00310 static void
00311 rb_reg_check(VALUE re)
00312 {
00313 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00314 rb_raise(rb_eTypeError, "uninitialized Regexp");
00315 }
00316 }
00317
/*
 * Append a printable rendering of the regexp source s[0, len) to `str`.
 *
 * str    - destination string buffer, appended to in place
 * s, len - source bytes of the pattern
 * enc    - encoding the source bytes are interpreted in
 * resenc - encoding the result is meant for; may be NULL, in which case
 *          valid non-ASCII characters are copied through unescaped
 *
 * A first pass decides whether any escaping is needed at all; if not, the
 * source is appended verbatim. Otherwise a second pass copies the source
 * piece by piece, escaping '/' and unprintable/invalid bytes.
 */
00318 static void
00319 rb_reg_expr_str(VALUE str, const char *s, long len,
00320 rb_encoding *enc, rb_encoding *resenc)
00321 {
00322 const char *p, *pend;
00323 int cr = ENC_CODERANGE_UNKNOWN;
00324 int need_escape = 0;
00325 int c, clen;
00326
00327 p = s; pend = p + len;
00328 rb_str_coderange_scan_restartable(p, pend, enc, &cr);
/* Pass 1: only an ASCII-compatible, validly encoded source can possibly be
 * emitted unchanged. */
00329 if (rb_enc_asciicompat(enc) &&
00330 (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT)) {
00331 while (p < pend) {
00332 c = rb_enc_ascget(p, pend, &clen, enc);
00333 if (c == -1) {
/* Non-ASCII character: acceptable as-is only when source and result
 * encodings are the same. */
00334 if (enc == resenc) {
00335 p += mbclen(p, pend, enc);
00336 }
00337 else {
00338 need_escape = 1;
00339 break;
00340 }
00341 }
00342 else if (c != '/' && rb_enc_isprint(c, enc)) {
00343 p += clen;
00344 }
00345 else {
/* A '/' or an unprintable ASCII character forces the escaping pass. */
00346 need_escape = 1;
00347 break;
00348 }
00349 }
00350 }
00351 else {
00352 need_escape = 1;
00353 }
00354
00355 if (!need_escape) {
00356 rb_str_buf_cat(str, s, len);
00357 }
00358 else {
/* Pass 2: copy with escaping. */
00359 int unicode_p = rb_enc_unicode_p(enc);
00360 p = s;
00361 while (p<pend) {
00362 c = rb_enc_ascget(p, pend, &clen, enc);
/* A backslash and the character after it are copied together, untouched. */
00363 if (c == '\\' && p+clen < pend) {
00364 int n = clen + mbclen(p+clen, pend, enc);
00365 rb_str_buf_cat(str, p, n);
00366 p += n;
00367 continue;
00368 }
00369 else if (c == '/') {
/* Bare slash: emit a backslash in front of it. */
00370 char c = '\\';
00371 rb_str_buf_cat(str, &c, 1);
00372 rb_str_buf_cat(str, p, clen);
00373 }
00374 else if (c == -1) {
00375 clen = rb_enc_precise_mbclen(p, pend, enc);
00376 if (!MBCLEN_CHARFOUND_P(clen)) {
/* Not a complete character: emit this single byte as \xHH. */
00377 c = (unsigned char)*p;
00378 clen = 1;
00379 goto hex;
00380 }
00381 if (resenc) {
00382 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00383 rb_str_buf_cat_escaped_char(str, c, unicode_p);
00384 }
00385 else {
00386 clen = MBCLEN_CHARFOUND_LEN(clen);
00387 rb_str_buf_cat(str, p, clen);
00388 }
00389 }
00390 else if (rb_enc_isprint(c, enc)) {
00391 rb_str_buf_cat(str, p, clen);
00392 }
00393 else if (!rb_enc_isspace(c, enc)) {
00394 char b[8];
00395
/* Also entered by `goto hex` from the incomplete-character case above. */
00396 hex:
00397 snprintf(b, sizeof(b), "\\x%02X", c);
00398 rb_str_buf_cat(str, b, 4);
00399 }
00400 else {
/* Unprintable whitespace is copied through unchanged. */
00401 rb_str_buf_cat(str, p, clen);
00402 }
00403 p += clen;
00404 }
00405 }
00406 }
00407
00408 static VALUE
00409 rb_reg_desc(const char *s, long len, VALUE re)
00410 {
00411 rb_encoding *enc = rb_enc_get(re);
00412 VALUE str = rb_str_buf_new2("/");
00413 rb_encoding *resenc = rb_default_internal_encoding();
00414 if (resenc == NULL) resenc = rb_default_external_encoding();
00415
00416 if (re && rb_enc_asciicompat(enc)) {
00417 rb_enc_copy(str, re);
00418 }
00419 else {
00420 rb_enc_associate(str, rb_usascii_encoding());
00421 }
00422 rb_reg_expr_str(str, s, len, enc, resenc);
00423 rb_str_buf_cat2(str, "/");
00424 if (re) {
00425 char opts[4];
00426 rb_reg_check(re);
00427 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00428 rb_str_buf_cat2(str, opts);
00429 if (RBASIC(re)->flags & REG_ENCODING_NONE)
00430 rb_str_buf_cat2(str, "n");
00431 }
00432 OBJ_INFECT(str, re);
00433 return str;
00434 }
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451 static VALUE
00452 rb_reg_source(VALUE re)
00453 {
00454 VALUE str;
00455
00456 rb_reg_check(re);
00457 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00458 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00459 return str;
00460 }
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474 static VALUE
00475 rb_reg_inspect(VALUE re)
00476 {
00477 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00478 return rb_any_to_s(re);
00479 }
00480 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00481 }
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
/*
 * Render `re` as "(?on-off:source)": "(?" followed by the letters of the
 * options that are set, then "-" and the letters of those that are not
 * (omitted when all three are set), then ":" and the escaped source, then
 * ")". Raises TypeError for an uninitialized Regexp.
 *
 * Before rendering, leading option groups already present in the source are
 * folded into the option set (see the `again` loop below).
 */
00504 static VALUE
00505 rb_reg_to_s(VALUE re)
00506 {
00507 int options, opt;
00508 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00509 long len;
00510 const UChar* ptr;
00511 VALUE str = rb_str_buf_new2("(?");
/* '-' plus up to three option letters plus the terminating NUL. */
00512 char optbuf[5];
00513 rb_encoding *enc = rb_enc_get(re);
00514
00515 rb_reg_check(re);
00516
00517 rb_enc_copy(str, re);
00518 options = RREGEXP(re)->ptr->options;
00519 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00520 len = RREGEXP_SRC_LEN(re);
00521 again:
/* Source starts with "(?": try to absorb its option letters. */
00522 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00523 int err = 1;
00524 ptr += 2;
00525 if ((len -= 2) > 0) {
/* Letters before '-' switch options on. */
00526 do {
00527 opt = char_to_option((int )*ptr);
00528 if (opt != 0) {
00529 options |= opt;
00530 }
00531 else {
00532 break;
00533 }
00534 ++ptr;
00535 } while (--len > 0);
00536 }
00537 if (len > 1 && *ptr == '-') {
00538 ++ptr;
00539 --len;
/* Letters after '-' switch options off. */
00540 do {
00541 opt = char_to_option((int )*ptr);
00542 if (opt != 0) {
00543 options &= ~opt;
00544 }
00545 else {
00546 break;
00547 }
00548 ++ptr;
00549 } while (--len > 0);
00550 }
/* "(?opts)" with nothing else inside: skip it and look for another group. */
00551 if (*ptr == ')') {
00552 --len;
00553 ++ptr;
00554 goto again;
00555 }
/* "(?opts:" ... ")": unwrap only if what lies between compiles on its own. */
00556 if (*ptr == ':' && ptr[len-1] == ')') {
00557 Regexp *rp;
00558
00559 ++ptr;
00560 len -= 2;
00561 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00562 enc, OnigDefaultSyntax, NULL);
00563 onig_free(rp);
00564 }
/* Anything else, or a failed compile: discard what was absorbed and fall
 * back to the untouched source and options. */
00565 if (err) {
00566 options = RREGEXP(re)->ptr->options;
00567 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00568 len = RREGEXP_SRC_LEN(re);
00569 }
00570 }
00571
00572 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00573
/* At least one of m/i/x is off: list the off options after a '-'. */
00574 if ((options & embeddable) != embeddable) {
00575 optbuf[0] = '-';
00576 option_to_str(optbuf + 1, ~options);
00577 rb_str_buf_cat2(str, optbuf);
00578 }
00579
00580 rb_str_buf_cat2(str, ":");
00581 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00582 rb_str_buf_cat2(str, ")");
00583 rb_enc_copy(str, re);
00584
00585 OBJ_INFECT(str, re);
00586 return str;
00587 }
00588
00589 static void
00590 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00591 {
00592 volatile VALUE desc = rb_reg_desc(s, len, re);
00593
00594 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00595 }
00596
00597 static VALUE
00598 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00599 {
00600 char opts[6];
00601 VALUE desc = rb_str_buf_new2(err);
00602 rb_encoding *resenc = rb_default_internal_encoding();
00603 if (resenc == NULL) resenc = rb_default_external_encoding();
00604
00605 rb_enc_associate(desc, enc);
00606 rb_str_buf_cat2(desc, ": /");
00607 rb_reg_expr_str(desc, s, len, enc, resenc);
00608 opts[0] = '/';
00609 option_to_str(opts + 1, options);
00610 rb_str_buf_cat2(desc, opts);
00611 return rb_exc_new3(rb_eRegexpError, desc);
00612 }
00613
00614 static void
00615 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00616 {
00617 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00618 }
00619
00620 static VALUE
00621 rb_reg_error_desc(VALUE str, int options, const char *err)
00622 {
00623 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00624 rb_enc_get(str), options, err);
00625 }
00626
00627 static void
00628 rb_reg_raise_str(VALUE str, int options, const char *err)
00629 {
00630 rb_exc_raise(rb_reg_error_desc(str, options, err));
00631 }
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645 static VALUE
00646 rb_reg_casefold_p(VALUE re)
00647 {
00648 rb_reg_check(re);
00649 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00650 return Qfalse;
00651 }
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677 static VALUE
00678 rb_reg_options_m(VALUE re)
00679 {
00680 int options = rb_reg_options(re);
00681 return INT2NUM(options);
00682 }
00683
00684 static int
00685 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00686 int back_num, int *back_refs, OnigRegex regex, void *arg)
00687 {
00688 VALUE ary = (VALUE)arg;
00689 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00690 return 0;
00691 }
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709 static VALUE
00710 rb_reg_names(VALUE re)
00711 {
00712 VALUE ary = rb_ary_new();
00713 rb_reg_check(re);
00714 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00715 return ary;
00716 }
00717
00718 static int
00719 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00720 int back_num, int *back_refs, OnigRegex regex, void *arg)
00721 {
00722 VALUE hash = (VALUE)arg;
00723 VALUE ary = rb_ary_new2(back_num);
00724 int i;
00725
00726 for(i = 0; i < back_num; i++)
00727 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00728
00729 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00730
00731 return 0;
00732 }
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754
00755
00756 static VALUE
00757 rb_reg_named_captures(VALUE re)
00758 {
00759 VALUE hash = rb_hash_new();
00760 rb_reg_check(re);
00761 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00762 return hash;
00763 }
00764
00765 static int
00766 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00767 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00768 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00769 {
00770 int r;
00771
00772 *reg = (regex_t* )xmalloc(sizeof(regex_t));
00773 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00774
00775 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00776 if (r) goto err;
00777
00778 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00779 if (r) {
00780 err:
00781 onig_free(*reg);
00782 *reg = NULL;
00783 }
00784 return r;
00785 }
00786
00787 static Regexp*
00788 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00789 const char *sourcefile, int sourceline)
00790 {
00791 Regexp *rp;
00792 int r;
00793 OnigErrorInfo einfo;
00794
00795
00796
00797
00798
00799
00800
00801
00802 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00803 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00804 if (r) {
00805 onig_error_code_to_str((UChar*)err, r, &einfo);
00806 return 0;
00807 }
00808 return rp;
00809 }
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821
00822
00823
00824 VALUE rb_cMatch;
00825
00826 static VALUE
00827 match_alloc(VALUE klass)
00828 {
00829 NEWOBJ(match, struct RMatch);
00830 OBJSETUP(match, klass, T_MATCH);
00831
00832 match->str = 0;
00833 match->rmatch = 0;
00834 match->regexp = 0;
00835 match->rmatch = ALLOC(struct rmatch);
00836 MEMZERO(match->rmatch, struct rmatch, 1);
00837
00838 return (VALUE)match;
00839 }
00840
/* A byte position in a string together with the matching character position. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t values by byte_pos.
 * Returns a negative, zero or positive int. Where long is wider than int the
 * difference is reduced to -1/0/1 so that it cannot be truncated.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const pair_t *lhs = pair1;
    const pair_t *rhs = pair2;
    const long diff = lhs->byte_pos - rhs->byte_pos;

#if SIZEOF_LONG > SIZEOF_INT
    if (diff == 0)
        return 0;
    return diff > 0 ? 1 : -1;
#else
    return (int)diff;
#endif
}
00856
/*
 * Fill rm->char_offset[] with the character (not byte) offsets of every
 * capture group of `match`, unless already done (rm->char_offset_updated).
 *
 * For encodings whose maximum character length is 1 the byte offsets are
 * copied directly. Otherwise all group boundaries are collected, sorted by
 * byte position, converted to character positions in one left-to-right walk
 * over the string, and then looked up again per group.
 */
00857 static void
00858 update_char_offset(VALUE match)
00859 {
00860 struct rmatch *rm = RMATCH(match)->rmatch;
00861 struct re_registers *regs;
00862 int i, num_regs, num_pos;
00863 long c;
00864 char *s, *p, *q;
00865 rb_encoding *enc;
00866 pair_t *pairs;
00867
00868 if (rm->char_offset_updated)
00869 return;
00870
00871 regs = &rm->regs;
00872 num_regs = rm->regs.num_regs;
00873
/* Grow the cache to one entry per register if it is too small. */
00874 if (rm->char_offset_num_allocated < num_regs) {
00875 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00876 rm->char_offset_num_allocated = num_regs;
00877 }
00878
00879 enc = rb_enc_get(RMATCH(match)->str);
/* Single-byte encoding: character offsets equal byte offsets. */
00880 if (rb_enc_mbmaxlen(enc) == 1) {
00881 for (i = 0; i < num_regs; i++) {
00882 rm->char_offset[i].beg = BEG(i);
00883 rm->char_offset[i].end = END(i);
00884 }
00885 rm->char_offset_updated = 1;
00886 return;
00887 }
00888
/* Two boundaries (begin and end) per group that took part in the match. */
00889 pairs = ALLOCA_N(pair_t, num_regs*2);
00890 num_pos = 0;
00891 for (i = 0; i < num_regs; i++) {
00892 if (BEG(i) < 0)
00893 continue;
00894 pairs[num_pos++].byte_pos = BEG(i);
00895 pairs[num_pos++].byte_pos = END(i);
00896 }
00897 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00898
/* Walk the sorted boundaries, counting characters only between neighbours
 * so that each stretch of the string is measured once. */
00899 s = p = RSTRING_PTR(RMATCH(match)->str);
00900 c = 0;
00901 for (i = 0; i < num_pos; i++) {
00902 q = s + pairs[i].byte_pos;
00903 c += rb_enc_strlen(p, q, enc);
00904 pairs[i].char_pos = c;
00905 p = q;
00906 }
00907
/* Map each group's byte offsets back to character offsets; groups that did
 * not take part get -1/-1. */
00908 for (i = 0; i < num_regs; i++) {
00909 pair_t key, *found;
00910 if (BEG(i) < 0) {
00911 rm->char_offset[i].beg = -1;
00912 rm->char_offset[i].end = -1;
00913 continue;
00914 }
00915
00916 key.byte_pos = BEG(i);
00917 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00918 rm->char_offset[i].beg = found->char_pos;
00919
00920 key.byte_pos = END(i);
00921 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00922 rm->char_offset[i].end = found->char_pos;
00923 }
00924
00925 rm->char_offset_updated = 1;
00926 }
00927
00928 static void
00929 match_check(VALUE match)
00930 {
00931 if (!RMATCH(match)->regexp) {
00932 rb_raise(rb_eTypeError, "uninitialized Match");
00933 }
00934 }
00935
00936
00937 static VALUE
00938 match_init_copy(VALUE obj, VALUE orig)
00939 {
00940 struct rmatch *rm;
00941
00942 if (obj == orig) return obj;
00943
00944 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00945 rb_raise(rb_eTypeError, "wrong argument class");
00946 }
00947 RMATCH(obj)->str = RMATCH(orig)->str;
00948 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00949
00950 rm = RMATCH(obj)->rmatch;
00951 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00952
00953 if (!RMATCH(orig)->rmatch->char_offset_updated) {
00954 rm->char_offset_updated = 0;
00955 }
00956 else {
00957 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00958 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00959 rm->char_offset_num_allocated = rm->regs.num_regs;
00960 }
00961 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00962 struct rmatch_offset, rm->regs.num_regs);
00963 rm->char_offset_updated = 1;
00964 }
00965
00966 return obj;
00967 }
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979
00980 static VALUE
00981 match_regexp(VALUE match)
00982 {
00983 match_check(match);
00984 return RMATCH(match)->regexp;
00985 }
00986
00987
00988
00989
00990
00991
00992
00993
00994
00995
00996
00997
00998
00999
01000
01001 static VALUE
01002 match_names(VALUE match)
01003 {
01004 match_check(match);
01005 return rb_reg_names(RMATCH(match)->regexp);
01006 }
01007
01008
01009
01010
01011
01012
01013
01014
01015
01016
01017
01018
01019
01020 static VALUE
01021 match_size(VALUE match)
01022 {
01023 match_check(match);
01024 return INT2FIX(RMATCH_REGS(match)->num_regs);
01025 }
01026
01027 static int
01028 match_backref_number(VALUE match, VALUE backref)
01029 {
01030 const char *name;
01031 int num;
01032
01033 struct re_registers *regs = RMATCH_REGS(match);
01034 VALUE regexp = RMATCH(match)->regexp;
01035
01036 match_check(match);
01037 switch(TYPE(backref)) {
01038 default:
01039 return NUM2INT(backref);
01040
01041 case T_SYMBOL:
01042 name = rb_id2name(SYM2ID(backref));
01043 break;
01044
01045 case T_STRING:
01046 name = StringValueCStr(backref);
01047 break;
01048 }
01049
01050 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01051 (const unsigned char*)name,
01052 (const unsigned char*)name + strlen(name),
01053 regs);
01054
01055 if (num < 1) {
01056 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01057 }
01058
01059 return num;
01060 }
01061
01062 int
01063 rb_reg_backref_number(VALUE match, VALUE backref)
01064 {
01065 return match_backref_number(match, backref);
01066 }
01067
01068
01069
01070
01071
01072
01073
01074
01075
01076
01077
01078
01079
01080
01081
01082
01083
01084
01085
01086 static VALUE
01087 match_offset(VALUE match, VALUE n)
01088 {
01089 int i = match_backref_number(match, n);
01090 struct re_registers *regs = RMATCH_REGS(match);
01091
01092 match_check(match);
01093 if (i < 0 || regs->num_regs <= i)
01094 rb_raise(rb_eIndexError, "index %d out of matches", i);
01095
01096 if (BEG(i) < 0)
01097 return rb_assoc_new(Qnil, Qnil);
01098
01099 update_char_offset(match);
01100 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01101 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01102 }
01103
01104
01105
01106
01107
01108
01109
01110
01111
01112
01113
01114
01115
01116
01117
01118
01119
01120
01121
01122 static VALUE
01123 match_begin(VALUE match, VALUE n)
01124 {
01125 int i = match_backref_number(match, n);
01126 struct re_registers *regs = RMATCH_REGS(match);
01127
01128 match_check(match);
01129 if (i < 0 || regs->num_regs <= i)
01130 rb_raise(rb_eIndexError, "index %d out of matches", i);
01131
01132 if (BEG(i) < 0)
01133 return Qnil;
01134
01135 update_char_offset(match);
01136 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01137 }
01138
01139
01140
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157 static VALUE
01158 match_end(VALUE match, VALUE n)
01159 {
01160 int i = match_backref_number(match, n);
01161 struct re_registers *regs = RMATCH_REGS(match);
01162
01163 match_check(match);
01164 if (i < 0 || regs->num_regs <= i)
01165 rb_raise(rb_eIndexError, "index %d out of matches", i);
01166
01167 if (BEG(i) < 0)
01168 return Qnil;
01169
01170 update_char_offset(match);
01171 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01172 }
01173
01174 #define MATCH_BUSY FL_USER2
01175
01176 void
01177 rb_match_busy(VALUE match)
01178 {
01179 FL_SET(match, MATCH_BUSY);
01180 }
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210
01211 static VALUE
01212 rb_reg_fixed_encoding_p(VALUE re)
01213 {
01214 if (FL_TEST(re, KCODE_FIXED))
01215 return Qtrue;
01216 else
01217 return Qfalse;
01218 }
01219
/* Forward declaration; the definition lies outside this part of the file.
 * rb_reg_prepare_re() below treats a Qnil result as failure and reports the
 * text left in `err`. */
01220 static VALUE
01221 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01222 rb_encoding **fixed_enc, onig_errmsg_buffer err);
01223
01224
01225 static void
01226 reg_enc_error(VALUE re, VALUE str)
01227 {
01228 rb_raise(rb_eEncCompatError,
01229 "incompatible encoding regexp match (%s regexp with %s string)",
01230 rb_enc_name(rb_enc_get(re)),
01231 rb_enc_name(rb_enc_get(str)));
01232 }
01233
01234 static rb_encoding*
01235 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01236 {
01237 rb_encoding *enc = 0;
01238
01239 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01240 rb_raise(rb_eArgError,
01241 "invalid byte sequence in %s",
01242 rb_enc_name(rb_enc_get(str)));
01243 }
01244
01245 rb_reg_check(re);
01246 enc = rb_enc_get(str);
01247 if (!rb_enc_str_asciicompat_p(str)) {
01248 if (RREGEXP(re)->ptr->enc != enc) {
01249 reg_enc_error(re, str);
01250 }
01251 }
01252 else if (rb_reg_fixed_encoding_p(re)) {
01253 if (RREGEXP(re)->ptr->enc != enc &&
01254 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01255 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01256 reg_enc_error(re, str);
01257 }
01258 enc = RREGEXP(re)->ptr->enc;
01259 }
01260 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01261 enc != rb_ascii8bit_encoding() &&
01262 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01263 rb_warn("regexp match /.../n against to %s string",
01264 rb_enc_name(enc));
01265 }
01266 return enc;
01267 }
01268
01269 regex_t *
01270 rb_reg_prepare_re(VALUE re, VALUE str)
01271 {
01272 regex_t *reg = RREGEXP(re)->ptr;
01273 onig_errmsg_buffer err = "";
01274 int r;
01275 OnigErrorInfo einfo;
01276 const char *pattern;
01277 VALUE unescaped;
01278 rb_encoding *fixed_enc = 0;
01279 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01280
01281 if (reg->enc == enc) return reg;
01282
01283 rb_reg_check(re);
01284 reg = RREGEXP(re)->ptr;
01285 pattern = RREGEXP_SRC_PTR(re);
01286
01287 unescaped = rb_reg_preprocess(
01288 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01289 &fixed_enc, err);
01290
01291 if (unescaped == Qnil) {
01292 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01293 }
01294
01295 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped),
01296 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01297 reg->options, enc,
01298 OnigDefaultSyntax, &einfo);
01299 if (r) {
01300 onig_error_code_to_str((UChar*)err, r, &einfo);
01301 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01302 }
01303
01304 RB_GC_GUARD(unescaped);
01305 return reg;
01306 }
01307
01308 long
01309 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01310 {
01311 long range;
01312 rb_encoding *enc;
01313 UChar *p, *string;
01314
01315 enc = rb_reg_prepare_enc(re, str, 0);
01316
01317 if (reverse) {
01318 range = -pos;
01319 }
01320 else {
01321 range = RSTRING_LEN(str) - pos;
01322 }
01323
01324 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01325 string = (UChar*)RSTRING_PTR(str);
01326
01327 if (range > 0) {
01328 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01329 }
01330 else {
01331 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01332 }
01333 return p - string;
01334 }
01335
01336 return pos;
01337 }
01338
01339 long
01340 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01341 {
01342 long result;
01343 VALUE match;
01344 struct re_registers regi, *regs = ®i;
01345 char *range = RSTRING_PTR(str);
01346 regex_t *reg;
01347 int tmpreg;
01348
01349 if (pos > RSTRING_LEN(str) || pos < 0) {
01350 rb_backref_set(Qnil);
01351 return -1;
01352 }
01353
01354 reg = rb_reg_prepare_re(re, str);
01355 tmpreg = reg != RREGEXP(re)->ptr;
01356 if (!tmpreg) RREGEXP(re)->usecnt++;
01357
01358 match = rb_backref_get();
01359 if (!NIL_P(match)) {
01360 if (FL_TEST(match, MATCH_BUSY)) {
01361 match = Qnil;
01362 }
01363 else {
01364 regs = RMATCH_REGS(match);
01365 }
01366 }
01367 if (NIL_P(match)) {
01368 MEMZERO(regs, struct re_registers, 1);
01369 }
01370 if (!reverse) {
01371 range += RSTRING_LEN(str);
01372 }
01373 result = onig_search(reg,
01374 (UChar*)(RSTRING_PTR(str)),
01375 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01376 ((UChar*)(RSTRING_PTR(str)) + pos),
01377 ((UChar*)range),
01378 regs, ONIG_OPTION_NONE);
01379 if (!tmpreg) RREGEXP(re)->usecnt--;
01380 if (tmpreg) {
01381 if (RREGEXP(re)->usecnt) {
01382 onig_free(reg);
01383 }
01384 else {
01385 onig_free(RREGEXP(re)->ptr);
01386 RREGEXP(re)->ptr = reg;
01387 }
01388 }
01389 if (result < 0) {
01390 if (regs == ®i)
01391 onig_region_free(regs, 0);
01392 if (result == ONIG_MISMATCH) {
01393 rb_backref_set(Qnil);
01394 return result;
01395 }
01396 else {
01397 onig_errmsg_buffer err = "";
01398 onig_error_code_to_str((UChar*)err, (int)result);
01399 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01400 }
01401 }
01402
01403 if (NIL_P(match)) {
01404 match = match_alloc(rb_cMatch);
01405 onig_region_copy(RMATCH_REGS(match), regs);
01406 onig_region_free(regs, 0);
01407 }
01408 else {
01409 if (rb_safe_level() >= 3)
01410 OBJ_TAINT(match);
01411 else
01412 FL_UNSET(match, FL_TAINT);
01413 }
01414
01415 RMATCH(match)->str = rb_str_new4(str);
01416 RMATCH(match)->regexp = re;
01417 RMATCH(match)->rmatch->char_offset_updated = 0;
01418 rb_backref_set(match);
01419
01420 OBJ_INFECT(match, re);
01421 OBJ_INFECT(match, str);
01422
01423 return result;
01424 }
01425
01426 VALUE
01427 rb_reg_nth_defined(int nth, VALUE match)
01428 {
01429 struct re_registers *regs;
01430 if (NIL_P(match)) return Qnil;
01431 match_check(match);
01432 regs = RMATCH_REGS(match);
01433 if (nth >= regs->num_regs) {
01434 return Qnil;
01435 }
01436 if (nth < 0) {
01437 nth += regs->num_regs;
01438 if (nth <= 0) return Qnil;
01439 }
01440 if (BEG(nth) == -1) return Qfalse;
01441 return Qtrue;
01442 }
01443
01444 VALUE
01445 rb_reg_nth_match(int nth, VALUE match)
01446 {
01447 VALUE str;
01448 long start, end, len;
01449 struct re_registers *regs;
01450
01451 if (NIL_P(match)) return Qnil;
01452 match_check(match);
01453 regs = RMATCH_REGS(match);
01454 if (nth >= regs->num_regs) {
01455 return Qnil;
01456 }
01457 if (nth < 0) {
01458 nth += regs->num_regs;
01459 if (nth <= 0) return Qnil;
01460 }
01461 start = BEG(nth);
01462 if (start == -1) return Qnil;
01463 end = END(nth);
01464 len = end - start;
01465 str = rb_str_subseq(RMATCH(match)->str, start, len);
01466 OBJ_INFECT(str, match);
01467 return str;
01468 }
01469
01470 VALUE
01471 rb_reg_last_match(VALUE match)
01472 {
01473 return rb_reg_nth_match(0, match);
01474 }
01475
01476
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488 VALUE
01489 rb_reg_match_pre(VALUE match)
01490 {
01491 VALUE str;
01492 struct re_registers *regs;
01493
01494 if (NIL_P(match)) return Qnil;
01495 match_check(match);
01496 regs = RMATCH_REGS(match);
01497 if (BEG(0) == -1) return Qnil;
01498 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01499 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01500 return str;
01501 }
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513
01514
01515 VALUE
01516 rb_reg_match_post(VALUE match)
01517 {
01518 VALUE str;
01519 long pos;
01520 struct re_registers *regs;
01521
01522 if (NIL_P(match)) return Qnil;
01523 match_check(match);
01524 regs = RMATCH_REGS(match);
01525 if (BEG(0) == -1) return Qnil;
01526 str = RMATCH(match)->str;
01527 pos = END(0);
01528 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01529 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01530 return str;
01531 }
01532
01533 VALUE
01534 rb_reg_match_last(VALUE match)
01535 {
01536 int i;
01537 struct re_registers *regs;
01538
01539 if (NIL_P(match)) return Qnil;
01540 match_check(match);
01541 regs = RMATCH_REGS(match);
01542 if (BEG(0) == -1) return Qnil;
01543
01544 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01545 ;
01546 if (i == 0) return Qnil;
01547 return rb_reg_nth_match(i, match);
01548 }
01549
01550 static VALUE
01551 last_match_getter(void)
01552 {
01553 return rb_reg_last_match(rb_backref_get());
01554 }
01555
01556 static VALUE
01557 prematch_getter(void)
01558 {
01559 return rb_reg_match_pre(rb_backref_get());
01560 }
01561
01562 static VALUE
01563 postmatch_getter(void)
01564 {
01565 return rb_reg_match_post(rb_backref_get());
01566 }
01567
01568 static VALUE
01569 last_paren_match_getter(void)
01570 {
01571 return rb_reg_match_last(rb_backref_get());
01572 }
01573
01574 static VALUE
01575 match_array(VALUE match, int start)
01576 {
01577 struct re_registers *regs;
01578 VALUE ary;
01579 VALUE target;
01580 int i;
01581 int taint = OBJ_TAINTED(match);
01582
01583 match_check(match);
01584 regs = RMATCH_REGS(match);
01585 ary = rb_ary_new2(regs->num_regs);
01586 target = RMATCH(match)->str;
01587
01588 for (i=start; i<regs->num_regs; i++) {
01589 if (regs->beg[i] == -1) {
01590 rb_ary_push(ary, Qnil);
01591 }
01592 else {
01593 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01594 if (taint) OBJ_TAINT(str);
01595 rb_ary_push(ary, str);
01596 }
01597 }
01598 return ary;
01599 }
01600
01601
01602
01603
01604
01605
01606
01607
01608
01609
01610
01611
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628 static VALUE
01629 match_to_a(VALUE match)
01630 {
01631 return match_array(match, 0);
01632 }
01633
01634
01635
01636
01637
01638
01639
01640
01641
01642
01643
01644
01645
01646
01647 static VALUE
01648 match_captures(VALUE match)
01649 {
01650 return match_array(match, 1);
01651 }
01652
01653 static int
01654 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01655 {
01656 int num;
01657
01658 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01659 (const unsigned char* )name, (const unsigned char* )name_end, regs);
01660 if (num >= 1) {
01661 return num;
01662 }
01663 else {
01664 VALUE s = rb_str_new(name, (long )(name_end - name));
01665 rb_raise(rb_eIndexError, "undefined group name reference: %s",
01666 StringValuePtr(s));
01667 }
01668 }
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696 static VALUE
01697 match_aref(int argc, VALUE *argv, VALUE match)
01698 {
01699 VALUE idx, rest;
01700
01701 match_check(match);
01702 rb_scan_args(argc, argv, "11", &idx, &rest);
01703
01704 if (NIL_P(rest)) {
01705 if (FIXNUM_P(idx)) {
01706 if (FIX2INT(idx) >= 0) {
01707 return rb_reg_nth_match(FIX2INT(idx), match);
01708 }
01709 }
01710 else {
01711 const char *p;
01712 int num;
01713
01714 switch (TYPE(idx)) {
01715 case T_SYMBOL:
01716 p = rb_id2name(SYM2ID(idx));
01717 goto name_to_backref;
01718 break;
01719 case T_STRING:
01720 p = StringValuePtr(idx);
01721
01722 name_to_backref:
01723 num = name_to_backref_number(RMATCH_REGS(match),
01724 RMATCH(match)->regexp, p, p + strlen(p));
01725 return rb_reg_nth_match(num, match);
01726 break;
01727
01728 default:
01729 break;
01730 }
01731 }
01732 }
01733
01734 return rb_ary_aref(argc, argv, match_to_a(match));
01735 }
01736
01737 static VALUE
01738 match_entry(VALUE match, long n)
01739 {
01740
01741 return rb_reg_nth_match((int)n, match);
01742 }
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758 static VALUE
01759 match_values_at(int argc, VALUE *argv, VALUE match)
01760 {
01761 struct re_registers *regs;
01762
01763 match_check(match);
01764 regs = RMATCH_REGS(match);
01765 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01766 }
01767
01768
01769
01770
01771
01772
01773
01774
01775
01776
01777
01778
01779 static VALUE
01780 match_to_s(VALUE match)
01781 {
01782 VALUE str = rb_reg_last_match(match);
01783
01784 match_check(match);
01785 if (NIL_P(str)) str = rb_str_new(0,0);
01786 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01787 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01788 return str;
01789 }
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801
01802 static VALUE
01803 match_string(VALUE match)
01804 {
01805 match_check(match);
01806 return RMATCH(match)->str;
01807 }
01808
01809 struct backref_name_tag {
01810 const UChar *name;
01811 long len;
01812 };
01813
01814 static int
01815 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01816 int back_num, int *back_refs, OnigRegex regex, void *arg0)
01817 {
01818 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01819 int i;
01820
01821 for (i = 0; i < back_num; i++) {
01822 arg[back_refs[i]].name = name;
01823 arg[back_refs[i]].len = name_end - name;
01824 }
01825 return 0;
01826 }
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847
01848 static VALUE
01849 match_inspect(VALUE match)
01850 {
01851 const char *cname = rb_obj_classname(match);
01852 VALUE str;
01853 int i;
01854 struct re_registers *regs = RMATCH_REGS(match);
01855 int num_regs = regs->num_regs;
01856 struct backref_name_tag *names;
01857 VALUE regexp = RMATCH(match)->regexp;
01858
01859 if (regexp == 0) {
01860 return rb_sprintf("#<%s:%p>", cname, (void*)match);
01861 }
01862
01863 names = ALLOCA_N(struct backref_name_tag, num_regs);
01864 MEMZERO(names, struct backref_name_tag, num_regs);
01865
01866 onig_foreach_name(RREGEXP(regexp)->ptr,
01867 match_inspect_name_iter, names);
01868
01869 str = rb_str_buf_new2("#<");
01870 rb_str_buf_cat2(str, cname);
01871
01872 for (i = 0; i < num_regs; i++) {
01873 VALUE v;
01874 rb_str_buf_cat2(str, " ");
01875 if (0 < i) {
01876 if (names[i].name)
01877 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01878 else {
01879 rb_str_catf(str, "%d", i);
01880 }
01881 rb_str_buf_cat2(str, ":");
01882 }
01883 v = rb_reg_nth_match(i, match);
01884 if (v == Qnil)
01885 rb_str_buf_cat2(str, "nil");
01886 else
01887 rb_str_buf_append(str, rb_str_inspect(v));
01888 }
01889 rb_str_buf_cat2(str, ">");
01890
01891 return str;
01892 }
01893
/*
 * Class handle that rb_reg_alloc() passes to rb_reg_s_alloc() as the class
 * of newly allocated regexp objects.
 * NOTE(review): where this is assigned is not visible in this part of the
 * file - confirm against the initialisation code.
 */
VALUE rb_cRegexp;
01895
01896 static int
01897 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01898 {
01899 const char *p = *pp;
01900 int code;
01901 int meta_prefix = 0, ctrl_prefix = 0;
01902 size_t len;
01903
01904 if (p == end || *p++ != '\\') {
01905 errcpy(err, "too short escaped multibyte character");
01906 return -1;
01907 }
01908
01909 again:
01910 if (p == end) {
01911 errcpy(err, "too short escape sequence");
01912 return -1;
01913 }
01914 switch (*p++) {
01915 case '\\': code = '\\'; break;
01916 case 'n': code = '\n'; break;
01917 case 't': code = '\t'; break;
01918 case 'r': code = '\r'; break;
01919 case 'f': code = '\f'; break;
01920 case 'v': code = '\013'; break;
01921 case 'a': code = '\007'; break;
01922 case 'e': code = '\033'; break;
01923
01924
01925 case '0': case '1': case '2': case '3':
01926 case '4': case '5': case '6': case '7':
01927 p--;
01928 code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01929 p += len;
01930 break;
01931
01932 case 'x':
01933 code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01934 if (len < 1) {
01935 errcpy(err, "invalid hex escape");
01936 return -1;
01937 }
01938 p += len;
01939 break;
01940
01941 case 'M':
01942 if (meta_prefix) {
01943 errcpy(err, "duplicate meta escape");
01944 return -1;
01945 }
01946 meta_prefix = 1;
01947 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01948 if (*p == '\\') {
01949 p++;
01950 goto again;
01951 }
01952 else {
01953 code = *p++;
01954 break;
01955 }
01956 }
01957 errcpy(err, "too short meta escape");
01958 return -1;
01959
01960 case 'C':
01961 if (p == end || *p++ != '-') {
01962 errcpy(err, "too short control escape");
01963 return -1;
01964 }
01965 case 'c':
01966 if (ctrl_prefix) {
01967 errcpy(err, "duplicate control escape");
01968 return -1;
01969 }
01970 ctrl_prefix = 1;
01971 if (p < end && (*p & 0x80) == 0) {
01972 if (*p == '\\') {
01973 p++;
01974 goto again;
01975 }
01976 else {
01977 code = *p++;
01978 break;
01979 }
01980 }
01981 errcpy(err, "too short control escape");
01982 return -1;
01983
01984 default:
01985 errcpy(err, "unexpected escape sequence");
01986 return -1;
01987 }
01988 if (code < 0 || 0xff < code) {
01989 errcpy(err, "invalid escape code");
01990 return -1;
01991 }
01992
01993 if (ctrl_prefix)
01994 code &= 0x1f;
01995 if (meta_prefix)
01996 code |= 0x80;
01997
01998 *pp = p;
01999 return code;
02000 }
02001
02002 static int
02003 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02004 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02005 {
02006 const char *p = *pp;
02007 int chmaxlen = rb_enc_mbmaxlen(enc);
02008 char *chbuf = ALLOCA_N(char, chmaxlen);
02009 int chlen = 0;
02010 int byte;
02011 int l;
02012
02013 memset(chbuf, 0, chmaxlen);
02014
02015 byte = read_escaped_byte(&p, end, err);
02016 if (byte == -1) {
02017 return -1;
02018 }
02019
02020 chbuf[chlen++] = byte;
02021 while (chlen < chmaxlen &&
02022 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02023 byte = read_escaped_byte(&p, end, err);
02024 if (byte == -1) {
02025 return -1;
02026 }
02027 chbuf[chlen++] = byte;
02028 }
02029
02030 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02031 if (MBCLEN_INVALID_P(l)) {
02032 errcpy(err, "invalid multibyte escape");
02033 return -1;
02034 }
02035 if (1 < chlen || (chbuf[0] & 0x80)) {
02036 rb_str_buf_cat(buf, chbuf, chlen);
02037
02038 if (*encp == 0)
02039 *encp = enc;
02040 else if (*encp != enc) {
02041 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02042 return -1;
02043 }
02044 }
02045 else {
02046 char escbuf[5];
02047 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02048 rb_str_buf_cat(buf, escbuf, 4);
02049 }
02050 *pp = p;
02051 return 0;
02052 }
02053
02054 static int
02055 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02056 {
02057 if ((0xd800 <= code && code <= 0xdfff) ||
02058 0x10ffff < code) {
02059 errcpy(err, "invalid Unicode range");
02060 return -1;
02061 }
02062 return 0;
02063 }
02064
02065 static int
02066 append_utf8(unsigned long uv,
02067 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02068 {
02069 if (check_unicode_range(uv, err) != 0)
02070 return -1;
02071 if (uv < 0x80) {
02072 char escbuf[5];
02073 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02074 rb_str_buf_cat(buf, escbuf, 4);
02075 }
02076 else {
02077 int len;
02078 char utf8buf[6];
02079 len = rb_uv_to_utf8(utf8buf, uv);
02080 rb_str_buf_cat(buf, utf8buf, len);
02081
02082 if (*encp == 0)
02083 *encp = rb_utf8_encoding();
02084 else if (*encp != rb_utf8_encoding()) {
02085 errcpy(err, "UTF-8 character in non UTF-8 regexp");
02086 return -1;
02087 }
02088 }
02089 return 0;
02090 }
02091
02092 static int
02093 unescape_unicode_list(const char **pp, const char *end,
02094 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02095 {
02096 const char *p = *pp;
02097 int has_unicode = 0;
02098 unsigned long code;
02099 size_t len;
02100
02101 while (p < end && ISSPACE(*p)) p++;
02102
02103 while (1) {
02104 code = ruby_scan_hex(p, end-p, &len);
02105 if (len == 0)
02106 break;
02107 if (6 < len) {
02108 errcpy(err, "invalid Unicode range");
02109 return -1;
02110 }
02111 p += len;
02112 if (append_utf8(code, buf, encp, err) != 0)
02113 return -1;
02114 has_unicode = 1;
02115
02116 while (p < end && ISSPACE(*p)) p++;
02117 }
02118
02119 if (has_unicode == 0) {
02120 errcpy(err, "invalid Unicode list");
02121 return -1;
02122 }
02123
02124 *pp = p;
02125
02126 return 0;
02127 }
02128
02129 static int
02130 unescape_unicode_bmp(const char **pp, const char *end,
02131 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02132 {
02133 const char *p = *pp;
02134 size_t len;
02135 unsigned long code;
02136
02137 if (end < p+4) {
02138 errcpy(err, "invalid Unicode escape");
02139 return -1;
02140 }
02141 code = ruby_scan_hex(p, 4, &len);
02142 if (len != 4) {
02143 errcpy(err, "invalid Unicode escape");
02144 return -1;
02145 }
02146 if (append_utf8(code, buf, encp, err) != 0)
02147 return -1;
02148 *pp = p + 4;
02149 return 0;
02150 }
02151
02152 static int
02153 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02154 VALUE buf, rb_encoding **encp, int *has_property,
02155 onig_errmsg_buffer err)
02156 {
02157 char c;
02158 char smallbuf[2];
02159
02160 while (p < end) {
02161 int chlen = rb_enc_precise_mbclen(p, end, enc);
02162 if (!MBCLEN_CHARFOUND_P(chlen)) {
02163 errcpy(err, "invalid multibyte character");
02164 return -1;
02165 }
02166 chlen = MBCLEN_CHARFOUND_LEN(chlen);
02167 if (1 < chlen || (*p & 0x80)) {
02168 rb_str_buf_cat(buf, p, chlen);
02169 p += chlen;
02170 if (*encp == 0)
02171 *encp = enc;
02172 else if (*encp != enc) {
02173 errcpy(err, "non ASCII character in UTF-8 regexp");
02174 return -1;
02175 }
02176 continue;
02177 }
02178
02179 switch (c = *p++) {
02180 case '\\':
02181 if (p == end) {
02182 errcpy(err, "too short escape sequence");
02183 return -1;
02184 }
02185 switch (c = *p++) {
02186 case '1': case '2': case '3':
02187 case '4': case '5': case '6': case '7':
02188 {
02189 size_t octlen;
02190 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02191
02192
02193
02194 goto escape_asis;
02195 }
02196 }
02197
02198
02199 case '0':
02200
02201 case 'x':
02202 case 'c':
02203 case 'C':
02204 case 'M':
02205 p = p-2;
02206 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02207 return -1;
02208 break;
02209
02210 case 'u':
02211 if (p == end) {
02212 errcpy(err, "too short escape sequence");
02213 return -1;
02214 }
02215 if (*p == '{') {
02216
02217 p++;
02218 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02219 return -1;
02220 if (p == end || *p++ != '}') {
02221 errcpy(err, "invalid Unicode list");
02222 return -1;
02223 }
02224 break;
02225 }
02226 else {
02227
02228 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02229 return -1;
02230 break;
02231 }
02232
02233 case 'p':
02234 case 'P':
02235 if (!*encp) {
02236 *has_property = 1;
02237 }
02238 goto escape_asis;
02239
02240 default:
02241 escape_asis:
02242 smallbuf[0] = '\\';
02243 smallbuf[1] = c;
02244 rb_str_buf_cat(buf, smallbuf, 2);
02245 break;
02246 }
02247 break;
02248
02249 default:
02250 rb_str_buf_cat(buf, &c, 1);
02251 break;
02252 }
02253 }
02254
02255 return 0;
02256 }
02257
02258 static VALUE
02259 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02260 rb_encoding **fixed_enc, onig_errmsg_buffer err)
02261 {
02262 VALUE buf;
02263 int has_property = 0;
02264
02265 buf = rb_str_buf_new(0);
02266
02267 if (rb_enc_asciicompat(enc))
02268 *fixed_enc = 0;
02269 else {
02270 *fixed_enc = enc;
02271 rb_enc_associate(buf, enc);
02272 }
02273
02274 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02275 return Qnil;
02276
02277 if (has_property && !*fixed_enc) {
02278 *fixed_enc = enc;
02279 }
02280
02281 if (*fixed_enc) {
02282 rb_enc_associate(buf, *fixed_enc);
02283 }
02284
02285 return buf;
02286 }
02287
02288 VALUE
02289 rb_reg_check_preprocess(VALUE str)
02290 {
02291 rb_encoding *fixed_enc = 0;
02292 onig_errmsg_buffer err = "";
02293 VALUE buf;
02294 char *p, *end;
02295 rb_encoding *enc;
02296
02297 StringValue(str);
02298 p = RSTRING_PTR(str);
02299 end = p + RSTRING_LEN(str);
02300 enc = rb_enc_get(str);
02301
02302 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02303 RB_GC_GUARD(str);
02304
02305 if (buf == Qnil) {
02306 return rb_reg_error_desc(str, 0, err);
02307 }
02308 return Qnil;
02309 }
02310
02311 static VALUE
02312 rb_reg_preprocess_dregexp(VALUE ary, int options)
02313 {
02314 rb_encoding *fixed_enc = 0;
02315 rb_encoding *regexp_enc = 0;
02316 onig_errmsg_buffer err = "";
02317 int i;
02318 VALUE result = 0;
02319 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02320
02321 if (RARRAY_LEN(ary) == 0) {
02322 rb_raise(rb_eArgError, "no arguments given");
02323 }
02324
02325 for (i = 0; i < RARRAY_LEN(ary); i++) {
02326 VALUE str = RARRAY_PTR(ary)[i];
02327 VALUE buf;
02328 char *p, *end;
02329 rb_encoding *src_enc;
02330
02331 src_enc = rb_enc_get(str);
02332 if (options & ARG_ENCODING_NONE &&
02333 src_enc != ascii8bit) {
02334 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02335 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02336 else
02337 src_enc = ascii8bit;
02338 }
02339
02340 StringValue(str);
02341 p = RSTRING_PTR(str);
02342 end = p + RSTRING_LEN(str);
02343
02344 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02345
02346 if (buf == Qnil)
02347 rb_raise(rb_eArgError, "%s", err);
02348
02349 if (fixed_enc != 0) {
02350 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02351 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02352 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02353 }
02354 regexp_enc = fixed_enc;
02355 }
02356
02357 if (!result)
02358 result = rb_str_new3(str);
02359 else
02360 rb_str_buf_append(result, str);
02361 }
02362 if (regexp_enc) {
02363 rb_enc_associate(result, regexp_enc);
02364 }
02365
02366 return result;
02367 }
02368
02369 static int
02370 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02371 int options, onig_errmsg_buffer err,
02372 const char *sourcefile, int sourceline)
02373 {
02374 struct RRegexp *re = RREGEXP(obj);
02375 VALUE unescaped;
02376 rb_encoding *fixed_enc = 0;
02377 rb_encoding *a_enc = rb_ascii8bit_encoding();
02378
02379 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02380 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02381 rb_check_frozen(obj);
02382 if (FL_TEST(obj, REG_LITERAL))
02383 rb_raise(rb_eSecurityError, "can't modify literal regexp");
02384 if (re->ptr)
02385 rb_raise(rb_eTypeError, "already initialized regexp");
02386 re->ptr = 0;
02387
02388 if (rb_enc_dummy_p(enc)) {
02389 errcpy(err, "can't make regexp with dummy encoding");
02390 return -1;
02391 }
02392
02393 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02394 if (unescaped == Qnil)
02395 return -1;
02396
02397 if (fixed_enc) {
02398 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02399 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02400 errcpy(err, "incompatible character encoding");
02401 return -1;
02402 }
02403 if (fixed_enc != a_enc) {
02404 options |= ARG_ENCODING_FIXED;
02405 enc = fixed_enc;
02406 }
02407 }
02408 else if (!(options & ARG_ENCODING_FIXED)) {
02409 enc = rb_usascii_encoding();
02410 }
02411
02412 rb_enc_associate((VALUE)re, enc);
02413 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02414 re->basic.flags |= KCODE_FIXED;
02415 }
02416 if (options & ARG_ENCODING_NONE) {
02417 re->basic.flags |= REG_ENCODING_NONE;
02418 }
02419
02420 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02421 options & ARG_REG_OPTION_MASK, err,
02422 sourcefile, sourceline);
02423 if (!re->ptr) return -1;
02424 re->src = rb_enc_str_new(s, len, enc);
02425 OBJ_FREEZE(re->src);
02426 RB_GC_GUARD(unescaped);
02427 return 0;
02428 }
02429
02430 static int
02431 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02432 const char *sourcefile, int sourceline)
02433 {
02434 int ret;
02435 rb_encoding *enc = rb_enc_get(str);
02436 if (options & ARG_ENCODING_NONE) {
02437 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02438 if (enc != ascii8bit) {
02439 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02440 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02441 return -1;
02442 }
02443 enc = ascii8bit;
02444 }
02445 }
02446 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02447 options, err, sourcefile, sourceline);
02448 OBJ_INFECT(obj, str);
02449 RB_GC_GUARD(str);
02450 return ret;
02451 }
02452
02453 static VALUE
02454 rb_reg_s_alloc(VALUE klass)
02455 {
02456 NEWOBJ(re, struct RRegexp);
02457 OBJSETUP(re, klass, T_REGEXP);
02458
02459 re->ptr = 0;
02460 re->src = 0;
02461 re->usecnt = 0;
02462
02463 return (VALUE)re;
02464 }
02465
02466 VALUE
02467 rb_reg_alloc(void)
02468 {
02469 return rb_reg_s_alloc(rb_cRegexp);
02470 }
02471
02472 VALUE
02473 rb_reg_new_str(VALUE s, int options)
02474 {
02475 return rb_reg_init_str(rb_reg_alloc(), s, options);
02476 }
02477
02478 VALUE
02479 rb_reg_init_str(VALUE re, VALUE s, int options)
02480 {
02481 onig_errmsg_buffer err = "";
02482
02483 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02484 rb_reg_raise_str(s, options, err);
02485 }
02486
02487 return re;
02488 }
02489
02490 VALUE
02491 rb_reg_new_ary(VALUE ary, int opt)
02492 {
02493 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02494 }
02495
02496 VALUE
02497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02498 {
02499 VALUE re = rb_reg_alloc();
02500 onig_errmsg_buffer err = "";
02501
02502 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02503 rb_enc_reg_raise(s, len, enc, options, err);
02504 }
02505
02506 return re;
02507 }
02508
02509 VALUE
02510 rb_reg_new(const char *s, long len, int options)
02511 {
02512 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02513 }
02514
02515 VALUE
02516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02517 {
02518 VALUE re = rb_reg_alloc();
02519 onig_errmsg_buffer err = "";
02520
02521 if (!str) str = rb_str_new(0,0);
02522 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02523 rb_set_errinfo(rb_reg_error_desc(str, options, err));
02524 return Qnil;
02525 }
02526 FL_SET(re, REG_LITERAL);
02527 return re;
02528 }
02529
02530 static VALUE reg_cache;
02531
02532 VALUE
02533 rb_reg_regcomp(VALUE str)
02534 {
02535 volatile VALUE save_str = str;
02536 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02537 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02538 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02539 return reg_cache;
02540
02541 return reg_cache = rb_reg_new_str(save_str, 0);
02542 }
02543
02544 static st_index_t reg_hash(VALUE re);
02545
02546
02547
02548
02549
02550
02551
02552 static VALUE
02553 rb_reg_hash(VALUE re)
02554 {
02555 st_index_t hashval = reg_hash(re);
02556 return LONG2FIX(hashval);
02557 }
02558
02559 static st_index_t
02560 reg_hash(VALUE re)
02561 {
02562 st_index_t hashval;
02563
02564 rb_reg_check(re);
02565 hashval = RREGEXP(re)->ptr->options;
02566 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02567 return rb_hash_end(hashval);
02568 }
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586 static VALUE
02587 rb_reg_equal(VALUE re1, VALUE re2)
02588 {
02589 if (re1 == re2) return Qtrue;
02590 if (TYPE(re2) != T_REGEXP) return Qfalse;
02591 rb_reg_check(re1); rb_reg_check(re2);
02592 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02593 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02594 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02595 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02596 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02597 return Qtrue;
02598 }
02599 return Qfalse;
02600 }
02601
02602
02603
02604
02605
02606
02607
02608
02609
02610 static VALUE
02611 match_hash(VALUE match)
02612 {
02613 const struct re_registers *regs;
02614 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02615
02616 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02617 regs = RMATCH_REGS(match);
02618 hashval = rb_hash_uint(hashval, regs->num_regs);
02619 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02620 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02621 hashval = rb_hash_end(hashval);
02622 return LONG2FIX(hashval);
02623 }
02624
02625
02626
02627
02628
02629
02630
02631
02632
02633 static VALUE
02634 match_equal(VALUE match1, VALUE match2)
02635 {
02636 const struct re_registers *regs1, *regs2;
02637 if (match1 == match2) return Qtrue;
02638 if (TYPE(match2) != T_MATCH) return Qfalse;
02639 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02640 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02641 regs1 = RMATCH_REGS(match1);
02642 regs2 = RMATCH_REGS(match2);
02643 if (regs1->num_regs != regs2->num_regs) return Qfalse;
02644 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02645 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02646 return Qtrue;
02647 }
02648
02649 static VALUE
02650 reg_operand(VALUE s, int check)
02651 {
02652 if (SYMBOL_P(s)) {
02653 return rb_sym_to_s(s);
02654 }
02655 else {
02656 VALUE tmp = rb_check_string_type(s);
02657 if (check && NIL_P(tmp)) {
02658 rb_raise(rb_eTypeError, "can't convert %s to String",
02659 rb_obj_classname(s));
02660 }
02661 return tmp;
02662 }
02663 }
02664
02665 static long
02666 reg_match_pos(VALUE re, VALUE *strp, long pos)
02667 {
02668 VALUE str = *strp;
02669
02670 if (NIL_P(str)) {
02671 rb_backref_set(Qnil);
02672 return -1;
02673 }
02674 *strp = str = reg_operand(str, TRUE);
02675 if (pos != 0) {
02676 if (pos < 0) {
02677 VALUE l = rb_str_length(str);
02678 pos += NUM2INT(l);
02679 if (pos < 0) {
02680 return pos;
02681 }
02682 }
02683 pos = rb_str_offset(str, pos);
02684 }
02685 return rb_reg_search(re, str, pos, 0);
02686 }
02687
02688
02689
02690
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724
02725
02726
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736 VALUE
02737 rb_reg_match(VALUE re, VALUE str)
02738 {
02739 long pos = reg_match_pos(re, &str, 0);
02740 if (pos < 0) return Qnil;
02741 pos = rb_str_sublen(str, pos);
02742 return LONG2FIX(pos);
02743 }
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763 VALUE
02764 rb_reg_eqq(VALUE re, VALUE str)
02765 {
02766 long start;
02767
02768 str = reg_operand(str, FALSE);
02769 if (NIL_P(str)) {
02770 rb_backref_set(Qnil);
02771 return Qfalse;
02772 }
02773 start = rb_reg_search(re, str, 0, 0);
02774 if (start < 0) {
02775 return Qfalse;
02776 }
02777 return Qtrue;
02778 }
02779
02780
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792 VALUE
02793 rb_reg_match2(VALUE re)
02794 {
02795 long start;
02796 VALUE line = rb_lastline_get();
02797
02798 if (TYPE(line) != T_STRING) {
02799 rb_backref_set(Qnil);
02800 return Qnil;
02801 }
02802
02803 start = rb_reg_search(re, line, 0, 0);
02804 if (start < 0) {
02805 return Qnil;
02806 }
02807 start = rb_str_sublen(line, start);
02808 return LONG2FIX(start);
02809 }
02810
02811
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840 static VALUE
02841 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02842 {
02843 VALUE result, str, initpos;
02844 long pos;
02845
02846 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02847 pos = NUM2LONG(initpos);
02848 }
02849 else {
02850 pos = 0;
02851 }
02852
02853 pos = reg_match_pos(re, &str, pos);
02854 if (pos < 0) {
02855 rb_backref_set(Qnil);
02856 return Qnil;
02857 }
02858 result = rb_backref_get();
02859 rb_match_busy(result);
02860 if (!NIL_P(result) && rb_block_given_p()) {
02861 return rb_yield(result);
02862 }
02863 return result;
02864 }
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896 static VALUE
02897 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02898 {
02899 onig_errmsg_buffer err = "";
02900 int flags = 0;
02901 VALUE str;
02902 rb_encoding *enc;
02903 const char *ptr;
02904 long len;
02905
02906 if (argc == 0 || argc > 3) {
02907 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02908 }
02909 if (TYPE(argv[0]) == T_REGEXP) {
02910 VALUE re = argv[0];
02911
02912 if (argc > 1) {
02913 rb_warn("flags ignored");
02914 }
02915 rb_reg_check(re);
02916 flags = rb_reg_options(re);
02917 ptr = RREGEXP_SRC_PTR(re);
02918 len = RREGEXP_SRC_LEN(re);
02919 enc = rb_enc_get(re);
02920 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02921 str = rb_enc_str_new(ptr, len, enc);
02922 rb_reg_raise_str(str, flags, err);
02923 }
02924 }
02925 else {
02926 if (argc >= 2) {
02927 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02928 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02929 }
02930 enc = 0;
02931 if (argc == 3 && !NIL_P(argv[2])) {
02932 char *kcode = StringValuePtr(argv[2]);
02933 if (kcode[0] == 'n' || kcode[0] == 'N') {
02934 enc = rb_ascii8bit_encoding();
02935 flags |= ARG_ENCODING_NONE;
02936 }
02937 else {
02938 rb_warn("encoding option is ignored - %s", kcode);
02939 }
02940 }
02941 str = argv[0];
02942 ptr = StringValuePtr(str);
02943 if (enc
02944 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02945 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02946 rb_reg_raise_str(str, flags, err);
02947 }
02948 }
02949 return self;
02950 }
02951
02952 VALUE
02953 rb_reg_quote(VALUE str)
02954 {
02955 rb_encoding *enc = rb_enc_get(str);
02956 char *s, *send, *t;
02957 VALUE tmp;
02958 int c, clen;
02959 int ascii_only = rb_enc_str_asciionly_p(str);
02960
02961 s = RSTRING_PTR(str);
02962 send = s + RSTRING_LEN(str);
02963 while (s < send) {
02964 c = rb_enc_ascget(s, send, &clen, enc);
02965 if (c == -1) {
02966 s += mbclen(s, send, enc);
02967 continue;
02968 }
02969 switch (c) {
02970 case '[': case ']': case '{': case '}':
02971 case '(': case ')': case '|': case '-':
02972 case '*': case '.': case '\\':
02973 case '?': case '+': case '^': case '$':
02974 case ' ': case '#':
02975 case '\t': case '\f': case '\v': case '\n': case '\r':
02976 goto meta_found;
02977 }
02978 s += clen;
02979 }
02980 tmp = rb_str_new3(str);
02981 if (ascii_only) {
02982 rb_enc_associate(tmp, rb_usascii_encoding());
02983 }
02984 return tmp;
02985
02986 meta_found:
02987 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02988 if (ascii_only) {
02989 rb_enc_associate(tmp, rb_usascii_encoding());
02990 }
02991 else {
02992 rb_enc_copy(tmp, str);
02993 }
02994 t = RSTRING_PTR(tmp);
02995
02996 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02997 t += s - RSTRING_PTR(str);
02998
02999 while (s < send) {
03000 c = rb_enc_ascget(s, send, &clen, enc);
03001 if (c == -1) {
03002 int n = mbclen(s, send, enc);
03003
03004 while (n--)
03005 *t++ = *s++;
03006 continue;
03007 }
03008 s += clen;
03009 switch (c) {
03010 case '[': case ']': case '{': case '}':
03011 case '(': case ')': case '|': case '-':
03012 case '*': case '.': case '\\':
03013 case '?': case '+': case '^': case '$':
03014 case '#':
03015 t += rb_enc_mbcput('\\', t, enc);
03016 break;
03017 case ' ':
03018 t += rb_enc_mbcput('\\', t, enc);
03019 t += rb_enc_mbcput(' ', t, enc);
03020 continue;
03021 case '\t':
03022 t += rb_enc_mbcput('\\', t, enc);
03023 t += rb_enc_mbcput('t', t, enc);
03024 continue;
03025 case '\n':
03026 t += rb_enc_mbcput('\\', t, enc);
03027 t += rb_enc_mbcput('n', t, enc);
03028 continue;
03029 case '\r':
03030 t += rb_enc_mbcput('\\', t, enc);
03031 t += rb_enc_mbcput('r', t, enc);
03032 continue;
03033 case '\f':
03034 t += rb_enc_mbcput('\\', t, enc);
03035 t += rb_enc_mbcput('f', t, enc);
03036 continue;
03037 case '\v':
03038 t += rb_enc_mbcput('\\', t, enc);
03039 t += rb_enc_mbcput('v', t, enc);
03040 continue;
03041 }
03042 t += rb_enc_mbcput(c, t, enc);
03043 }
03044 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03045 OBJ_INFECT(tmp, str);
03046 return tmp;
03047 }
03048
03049
03050
03051
03052
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064 static VALUE
03065 rb_reg_s_quote(VALUE c, VALUE str)
03066 {
03067 return rb_reg_quote(reg_operand(str, TRUE));
03068 }
03069
03070 int
03071 rb_reg_options(VALUE re)
03072 {
03073 int options;
03074
03075 rb_reg_check(re);
03076 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03077 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03078 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03079 return options;
03080 }
03081
03082 VALUE
03083 rb_check_regexp_type(VALUE re)
03084 {
03085 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03086 }
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105 static VALUE
03106 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03107 {
03108 return rb_check_regexp_type(re);
03109 }
03110
03111 static VALUE
03112 rb_reg_s_union(VALUE self, VALUE args0)
03113 {
03114 long argc = RARRAY_LEN(args0);
03115
03116 if (argc == 0) {
03117 VALUE args[1];
03118 args[0] = rb_str_new2("(?!)");
03119 return rb_class_new_instance(1, args, rb_cRegexp);
03120 }
03121 else if (argc == 1) {
03122 VALUE arg = rb_ary_entry(args0, 0);
03123 VALUE re = rb_check_regexp_type(arg);
03124 if (!NIL_P(re))
03125 return re;
03126 else {
03127 VALUE quoted;
03128 quoted = rb_reg_s_quote(Qnil, arg);
03129 return rb_reg_new_str(quoted, 0);
03130 }
03131 }
03132 else {
03133 int i;
03134 VALUE source = rb_str_buf_new(0);
03135 rb_encoding *result_enc;
03136
03137 int has_asciionly = 0;
03138 rb_encoding *has_ascii_compat_fixed = 0;
03139 rb_encoding *has_ascii_incompat = 0;
03140
03141 for (i = 0; i < argc; i++) {
03142 volatile VALUE v;
03143 VALUE e = rb_ary_entry(args0, i);
03144
03145 if (0 < i)
03146 rb_str_buf_cat_ascii(source, "|");
03147
03148 v = rb_check_regexp_type(e);
03149 if (!NIL_P(v)) {
03150 rb_encoding *enc = rb_enc_get(v);
03151 if (!rb_enc_asciicompat(enc)) {
03152 if (!has_ascii_incompat)
03153 has_ascii_incompat = enc;
03154 else if (has_ascii_incompat != enc)
03155 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03156 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03157 }
03158 else if (rb_reg_fixed_encoding_p(v)) {
03159 if (!has_ascii_compat_fixed)
03160 has_ascii_compat_fixed = enc;
03161 else if (has_ascii_compat_fixed != enc)
03162 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03163 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03164 }
03165 else {
03166 has_asciionly = 1;
03167 }
03168 v = rb_reg_to_s(v);
03169 }
03170 else {
03171 rb_encoding *enc;
03172 StringValue(e);
03173 enc = rb_enc_get(e);
03174 if (!rb_enc_str_asciicompat_p(e)) {
03175 if (!has_ascii_incompat)
03176 has_ascii_incompat = enc;
03177 else if (has_ascii_incompat != enc)
03178 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03179 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03180 }
03181 else if (rb_enc_str_asciionly_p(e)) {
03182 has_asciionly = 1;
03183 }
03184 else {
03185 if (!has_ascii_compat_fixed)
03186 has_ascii_compat_fixed = enc;
03187 else if (has_ascii_compat_fixed != enc)
03188 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03189 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03190 }
03191 v = rb_reg_s_quote(Qnil, e);
03192 }
03193 if (has_ascii_incompat) {
03194 if (has_asciionly) {
03195 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03196 rb_enc_name(has_ascii_incompat));
03197 }
03198 if (has_ascii_compat_fixed) {
03199 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03200 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03201 }
03202 }
03203
03204 if (i == 0) {
03205 rb_enc_copy(source, v);
03206 }
03207 rb_str_append(source, v);
03208 }
03209
03210 if (has_ascii_incompat) {
03211 result_enc = has_ascii_incompat;
03212 }
03213 else if (has_ascii_compat_fixed) {
03214 result_enc = has_ascii_compat_fixed;
03215 }
03216 else {
03217 result_enc = rb_ascii8bit_encoding();
03218 }
03219
03220 rb_enc_associate(source, result_enc);
03221 return rb_class_new_instance(1, &source, rb_cRegexp);
03222 }
03223 }
03224
03225
03226
03227
03228
03229
03230
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243 static VALUE
03244 rb_reg_s_union_m(VALUE self, VALUE args)
03245 {
03246 VALUE v;
03247 if (RARRAY_LEN(args) == 1 &&
03248 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03249 return rb_reg_s_union(self, v);
03250 }
03251 return rb_reg_s_union(self, args);
03252 }
03253
03254
03255 static VALUE
03256 rb_reg_init_copy(VALUE copy, VALUE re)
03257 {
03258 onig_errmsg_buffer err = "";
03259 const char *s;
03260 long len;
03261
03262 if (copy == re) return copy;
03263 rb_check_frozen(copy);
03264
03265 if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03266 rb_raise(rb_eTypeError, "wrong argument type");
03267 }
03268 rb_reg_check(re);
03269 s = RREGEXP_SRC_PTR(re);
03270 len = RREGEXP_SRC_LEN(re);
03271 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03272 err, NULL, 0) != 0) {
03273 rb_reg_raise(s, len, err, re);
03274 }
03275 return copy;
03276 }
03277
03278 VALUE
03279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03280 {
03281 VALUE val = 0;
03282 char *p, *s, *e;
03283 int no, clen;
03284 rb_encoding *str_enc = rb_enc_get(str);
03285 rb_encoding *src_enc = rb_enc_get(src);
03286 int acompat = rb_enc_asciicompat(str_enc);
03287 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
03288
03289 p = s = RSTRING_PTR(str);
03290 e = s + RSTRING_LEN(str);
03291
03292 while (s < e) {
03293 int c = ASCGET(s, e, &clen);
03294 char *ss;
03295
03296 if (c == -1) {
03297 s += mbclen(s, e, str_enc);
03298 continue;
03299 }
03300 ss = s;
03301 s += clen;
03302
03303 if (c != '\\' || s == e) continue;
03304
03305 if (!val) {
03306 val = rb_str_buf_new(ss-p);
03307 }
03308 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03309
03310 c = ASCGET(s, e, &clen);
03311 if (c == -1) {
03312 s += mbclen(s, e, str_enc);
03313 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03314 p = s;
03315 continue;
03316 }
03317 s += clen;
03318
03319 p = s;
03320 switch (c) {
03321 case '1': case '2': case '3': case '4':
03322 case '5': case '6': case '7': case '8': case '9':
03323 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03324 no = c - '0';
03325 }
03326 else {
03327 continue;
03328 }
03329 break;
03330
03331 case 'k':
03332 if (s < e && ASCGET(s, e, &clen) == '<') {
03333 char *name, *name_end;
03334
03335 name_end = name = s + clen;
03336 while (name_end < e) {
03337 c = ASCGET(name_end, e, &clen);
03338 if (c == '>') break;
03339 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03340 }
03341 if (name_end < e) {
03342 no = name_to_backref_number(regs, regexp, name, name_end);
03343 p = s = name_end + clen;
03344 break;
03345 }
03346 else {
03347 rb_raise(rb_eRuntimeError, "invalid group name reference format");
03348 }
03349 }
03350
03351 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03352 continue;
03353
03354 case '0':
03355 case '&':
03356 no = 0;
03357 break;
03358
03359 case '`':
03360 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03361 continue;
03362
03363 case '\'':
03364 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03365 continue;
03366
03367 case '+':
03368 no = regs->num_regs-1;
03369 while (BEG(no) == -1 && no > 0) no--;
03370 if (no == 0) continue;
03371 break;
03372
03373 case '\\':
03374 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03375 continue;
03376
03377 default:
03378 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03379 continue;
03380 }
03381
03382 if (no >= 0) {
03383 if (no >= regs->num_regs) continue;
03384 if (BEG(no) == -1) continue;
03385 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03386 }
03387 }
03388
03389 if (!val) return str;
03390 if (p < e) {
03391 rb_enc_str_buf_cat(val, p, e-p, str_enc);
03392 }
03393
03394 return val;
03395 }
03396
03397 static VALUE
03398 kcode_getter(void)
03399 {
03400 rb_warn("variable $KCODE is no longer effective");
03401 return Qnil;
03402 }
03403
03404 static void
03405 kcode_setter(VALUE val, ID id)
03406 {
03407 rb_warn("variable $KCODE is no longer effective; ignored");
03408 }
03409
03410 static VALUE
03411 ignorecase_getter(void)
03412 {
03413 rb_warn("variable $= is no longer effective");
03414 return Qfalse;
03415 }
03416
03417 static void
03418 ignorecase_setter(VALUE val, ID id)
03419 {
03420 rb_warn("variable $= is no longer effective; ignored");
03421 }
03422
03423 static VALUE
03424 match_getter(void)
03425 {
03426 VALUE match = rb_backref_get();
03427
03428 if (NIL_P(match)) return Qnil;
03429 rb_match_busy(match);
03430 return match;
03431 }
03432
03433 static void
03434 match_setter(VALUE val)
03435 {
03436 if (!NIL_P(val)) {
03437 Check_Type(val, T_MATCH);
03438 }
03439 rb_backref_set(val);
03440 }
03441
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460
03461
03462
03463
03464
03465
03466
03467
03468 static VALUE
03469 rb_reg_s_last_match(int argc, VALUE *argv)
03470 {
03471 VALUE nth;
03472
03473 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03474 VALUE match = rb_backref_get();
03475 int n;
03476 if (NIL_P(match)) return Qnil;
03477 n = match_backref_number(match, nth);
03478 return rb_reg_nth_match(n, match);
03479 }
03480 return match_getter();
03481 }
03482
/*
 * Warning callback handed to onig_set_warn_func() and
 * onig_set_verb_warn_func() in Init_Regexp.  The message is forwarded to
 * rb_warn() through a "%s" format so that it is never interpreted as a
 * format string itself.
 */
static void
re_warn(const char *s)
{
    const char *message = s;

    rb_warn("%s", message);
}
03488
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512 void
03513 Init_Regexp(void)
03514 {
03515 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03516
03517 onigenc_set_default_caseconv_table((UChar*)casetable);
03518 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03519 onig_set_warn_func(re_warn);
03520 onig_set_verb_warn_func(re_warn);
03521
03522 rb_define_virtual_variable("$~", match_getter, match_setter);
03523 rb_define_virtual_variable("$&", last_match_getter, 0);
03524 rb_define_virtual_variable("$`", prematch_getter, 0);
03525 rb_define_virtual_variable("$'", postmatch_getter, 0);
03526 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03527
03528 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03529 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03530 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03531
03532 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03533 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03534 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03535 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03536 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03537 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03538 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03539 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03540
03541 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03542 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03543 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03544 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03545 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03546 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03547 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03548 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03549 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03550 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03551 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03552 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03553 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03554 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03555 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0);
03556 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03557 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03558 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03559
03560
03561 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03562
03563 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03564
03565 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03566
03567 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03568
03569 rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
03570
03571 rb_global_variable(®_cache);
03572
03573 rb_cMatch = rb_define_class("MatchData", rb_cObject);
03574 rb_define_alloc_func(rb_cMatch, match_alloc);
03575 rb_undef_method(CLASS_OF(rb_cMatch), "new");
03576
03577 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03578 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03579 rb_define_method(rb_cMatch, "names", match_names, 0);
03580 rb_define_method(rb_cMatch, "size", match_size, 0);
03581 rb_define_method(rb_cMatch, "length", match_size, 0);
03582 rb_define_method(rb_cMatch, "offset", match_offset, 1);
03583 rb_define_method(rb_cMatch, "begin", match_begin, 1);
03584 rb_define_method(rb_cMatch, "end", match_end, 1);
03585 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03586 rb_define_method(rb_cMatch, "[]", match_aref, -1);
03587 rb_define_method(rb_cMatch, "captures", match_captures, 0);
03588 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03589 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03590 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03591 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03592 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03593 rb_define_method(rb_cMatch, "string", match_string, 0);
03594 rb_define_method(rb_cMatch, "hash", match_hash, 0);
03595 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03596 rb_define_method(rb_cMatch, "==", match_equal, 1);
03597 }
03598