00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019
/* Shorthand accessors for entry `no` of the beg[] / end[] arrays of a
 * variable named `regs`, which must be in scope wherever these are used. */
00020 #define BEG(no) (regs->beg[(no)])
00021 #define END(no) (regs->end[(no)])
00022
00023 #include <math.h>
00024 #include <ctype.h>
00025
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029
/* Number of elements in `array`, cast to int.  Only meaningful for a real
 * array object; a pointer would yield sizeof(pointer)/sizeof(element). */
00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048
/* Forward declaration: rb_free_tmp_buffer() calls this before its
 * definition appears. */
00049 static VALUE rb_str_clear(VALUE str);
00050
/* rb_cString is the class handed to str_alloc() by rb_str_new() and
 * rb_str_buf_new().  rb_cSymbol is not used in the visible part of the
 * file; presumably the Symbol class object -- TODO confirm. */
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053
/* NOTE(review): RUBY_MAX_CHAR_LEN is not referenced in the visible part of
 * the file; presumably an upper bound on bytes per character -- confirm. */
00054 #define RUBY_MAX_CHAR_LEN 16
/* Flag tested by str_modifiable() to reject modification of a locked string. */
00055 #define STR_TMPLOCK FL_USER7
/* Set when the string uses the as.heap representation (see STR_EMBED_P). */
00056 #define STR_NOEMBED FL_USER1
/* NOTE(review): the predicates below test ELTS_SHARED rather than
 * STR_SHARED; presumably both name the same flag bit -- confirm. */
00057 #define STR_SHARED FL_USER2
/* Set (together with STR_NOEMBED) when aux.shared holds an association. */
00058 #define STR_ASSOC FL_USER3
/* True when the string is a heap string whose buffer is shared. */
00059 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True when the string is a heap string carrying an association. */
00060 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* Flag combination under which aux does not hold a capacity. */
00061 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True for heap strings that are shared or associated; for these
 * rb_str_capacity() reports heap.len instead of aux.capa. */
00062 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* For heap strings, clears both ELTS_SHARED and STR_ASSOC; embedded strings
 * are left untouched. */
00063 #define STR_UNSET_NOCAPA(s) do {\
00064 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
00065 } while (0)
00066
00067
/* Switches `str` to the heap representation: sets STR_NOEMBED and zeroes the
 * embedded-length bits kept in the flags word. */
00068 #define STR_SET_NOEMBED(str) do {\
00069 FL_SET((str), STR_NOEMBED);\
00070 STR_SET_EMBED_LEN((str), 0);\
00071 } while (0)
/* Switches `str` to the embedded representation (clears STR_NOEMBED only;
 * the embedded length must be set separately). */
00072 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when `str` uses the embedded representation. */
00073 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores `n` as the embedded length inside the flags word.  `n` is
 * evaluated once (into tmp_n); `str` is evaluated twice. */
00074 #define STR_SET_EMBED_LEN(str, n) do { \
00075 long tmp_n = (n);\
00076 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00077 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00078 } while (0)
00079
/* Sets the byte length of `str` to `n`, writing either the embedded-length
 * flag bits or as.heap.len depending on the current representation.
 * `str` is evaluated more than once. */
00080 #define STR_SET_LEN(str, n) do { \
00081 if (STR_EMBED_P(str)) {\
00082 STR_SET_EMBED_LEN((str), (n));\
00083 }\
00084 else {\
00085 RSTRING(str)->as.heap.len = (n);\
00086 }\
00087 } while (0)
00088
/* Decrements the byte length of `str` by one, for either representation.
 * No check is made that the length is non-zero.  `str` is evaluated more
 * than once. */
00089 #define STR_DEC_LEN(str) do {\
00090 if (STR_EMBED_P(str)) {\
00091 long n = RSTRING_LEN(str);\
00092 n--;\
00093 STR_SET_EMBED_LEN((str), n);\
00094 }\
00095 else {\
00096 RSTRING(str)->as.heap.len--;\
00097 }\
00098 } while (0)
00099
/* Makes room for `capacity` bytes plus one extra byte in `str`.
 *
 * Embedded string: nothing happens unless `capacity` exceeds
 * RSTRING_EMBED_LEN_MAX; in that case a heap buffer is allocated, the
 * current bytes are copied into it and the string is switched to the heap
 * representation with aux.capa = capacity.  heap.len is assigned before
 * STR_SET_NOEMBED so that RSTRING_LEN() is still read while the string is
 * flagged as embedded.
 *
 * Heap string: the buffer is reallocated; aux.capa is updated only when the
 * string is neither shared nor associated (STR_NOCAPA_P false).
 *
 * Both arguments are evaluated several times; the macro declares a local
 * named `tmp`. */
00100 #define RESIZE_CAPA(str,capacity) do {\
00101 if (STR_EMBED_P(str)) {\
00102 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
00103 char *tmp = ALLOC_N(char, (capacity)+1);\
00104 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
00105 RSTRING(str)->as.heap.ptr = tmp;\
00106 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
00107 STR_SET_NOEMBED(str);\
00108 RSTRING(str)->as.heap.aux.capa = (capacity);\
00109 }\
00110 }\
00111 else {\
00112 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
00113 if (!STR_NOCAPA_P(str))\
00114 RSTRING(str)->as.heap.aux.capa = (capacity);\
00115 }\
00116 } while (0)
00117
/* Coderange predicates.  Both go through rb_enc_str_coderange(), which
 * scans the string and stores the result when the coderange is unknown. */
00118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00120
/* Encoding object of `str`, looked up from its encoding index. */
00121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126 rb_encoding *enc;
00127
00128
00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130 return 1;
00131
00132 enc = STR_ENC_GET(str);
00133 if (rb_enc_mbmaxlen(enc) == 1)
00134 return 1;
00135
00136
00137
00138 return 0;
00139 }
00140
/* NOTE(review): not referenced in the visible part of the file; presumably
 * the default field separator used when splitting strings -- confirm. */
00141 VALUE rb_fs;
00142
/*
 * Scans the byte range [p, e) and returns a pointer to the first byte for
 * which ISASCII() is false, or NULL when every byte is ASCII.
 *
 * When NONASCII_MASK is available and the range is longer than two VALUE
 * words, the aligned middle of the range is tested one VALUE at a time.
 * The unaligned head, and everything from the first suspicious word on,
 * is tested byte by byte.
 *
 * NONASCII_MASK is intentionally left defined after this function: later
 * code in this file tests it with #ifdef.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end;

        /* unaligned head: byte by byte up to the first aligned word */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        word_end = (const VALUE *)(~align_mask & (VALUE)e);
        /* aligned body: stop at the first word with any high bit set */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        p = (const char *)word;
    }
#endif
    /* tail (or the whole range when the fast path was not taken) */
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183 const char *e = p + len;
00184
00185 if (rb_enc_to_index(enc) == 0) {
00186
00187 p = search_nonascii(p, e);
00188 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189 }
00190
00191 if (rb_enc_asciicompat(enc)) {
00192 p = search_nonascii(p, e);
00193 if (!p) {
00194 return ENC_CODERANGE_7BIT;
00195 }
00196 while (p < e) {
00197 int ret = rb_enc_precise_mbclen(p, e, enc);
00198 if (!MBCLEN_CHARFOUND_P(ret)) {
00199 return ENC_CODERANGE_BROKEN;
00200 }
00201 p += MBCLEN_CHARFOUND_LEN(ret);
00202 if (p < e) {
00203 p = search_nonascii(p, e);
00204 if (!p) {
00205 return ENC_CODERANGE_VALID;
00206 }
00207 }
00208 }
00209 if (e < p) {
00210 return ENC_CODERANGE_BROKEN;
00211 }
00212 return ENC_CODERANGE_VALID;
00213 }
00214
00215 while (p < e) {
00216 int ret = rb_enc_precise_mbclen(p, e, enc);
00217
00218 if (!MBCLEN_CHARFOUND_P(ret)) {
00219 return ENC_CODERANGE_BROKEN;
00220 }
00221 p += MBCLEN_CHARFOUND_LEN(ret);
00222 }
00223 if (e < p) {
00224 return ENC_CODERANGE_BROKEN;
00225 }
00226 return ENC_CODERANGE_VALID;
00227 }
00228
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232 const char *p = s;
00233
00234 if (*cr == ENC_CODERANGE_BROKEN)
00235 return e - s;
00236
00237 if (rb_enc_to_index(enc) == 0) {
00238
00239 p = search_nonascii(p, e);
00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241 return e - s;
00242 }
00243 else if (rb_enc_asciicompat(enc)) {
00244 p = search_nonascii(p, e);
00245 if (!p) {
00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247 return e - s;
00248 }
00249 while (p < e) {
00250 int ret = rb_enc_precise_mbclen(p, e, enc);
00251 if (!MBCLEN_CHARFOUND_P(ret)) {
00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253 return p - s;
00254 }
00255 p += MBCLEN_CHARFOUND_LEN(ret);
00256 if (p < e) {
00257 p = search_nonascii(p, e);
00258 if (!p) {
00259 *cr = ENC_CODERANGE_VALID;
00260 return e - s;
00261 }
00262 }
00263 }
00264 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265 return p - s;
00266 }
00267 else {
00268 while (p < e) {
00269 int ret = rb_enc_precise_mbclen(p, e, enc);
00270 if (!MBCLEN_CHARFOUND_P(ret)) {
00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272 return p - s;
00273 }
00274 p += MBCLEN_CHARFOUND_LEN(ret);
00275 }
00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277 return p - s;
00278 }
00279 }
00280
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284 rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290
00291
00292
00293 str_enc_copy(dest, src);
00294 switch (ENC_CODERANGE(src)) {
00295 case ENC_CODERANGE_7BIT:
00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297 break;
00298 case ENC_CODERANGE_VALID:
00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302 else
00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304 break;
00305 default:
00306 if (RSTRING_LEN(dest) == 0) {
00307 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309 else
00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311 }
00312 break;
00313 }
00314 }
00315
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319 str_enc_copy(dest, src);
00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326 int cr = ENC_CODERANGE(str);
00327
00328 if (cr == ENC_CODERANGE_UNKNOWN) {
00329 rb_encoding *enc = STR_ENC_GET(str);
00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331 ENC_CODERANGE_SET(str, cr);
00332 }
00333 return cr;
00334 }
00335
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339 rb_encoding *enc = STR_ENC_GET(str);
00340
00341 if (!rb_enc_asciicompat(enc))
00342 return FALSE;
00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344 return TRUE;
00345 return FALSE;
00346 }
00347
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352 rb_raise(rb_eRuntimeError, "string modified");
00353 }
00354 }
00355
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359 if (STR_EMBED_P(str)) {
00360 return RSTRING_EMBED_LEN_MAX;
00361 }
00362 else if (STR_NOCAPA_P(str)) {
00363 return RSTRING(str)->as.heap.len;
00364 }
00365 else {
00366 return RSTRING(str)->as.heap.aux.capa;
00367 }
00368 }
00369
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373 NEWOBJ(str, struct RString);
00374 OBJSETUP(str, klass, T_STRING);
00375
00376 str->as.heap.ptr = 0;
00377 str->as.heap.len = 0;
00378 str->as.heap.aux.capa = 0;
00379
00380 return (VALUE)str;
00381 }
00382
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386 VALUE str;
00387
00388 if (len < 0) {
00389 rb_raise(rb_eArgError, "negative string size (or size too big)");
00390 }
00391
00392 str = str_alloc(klass);
00393 if (len > RSTRING_EMBED_LEN_MAX) {
00394 RSTRING(str)->as.heap.aux.capa = len;
00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396 STR_SET_NOEMBED(str);
00397 }
00398 else if (len == 0) {
00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400 }
00401 if (ptr) {
00402 memcpy(RSTRING_PTR(str), ptr, len);
00403 }
00404 STR_SET_LEN(str, len);
00405 RSTRING_PTR(str)[len] = '\0';
00406 return str;
00407 }
00408
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412 return str_new(rb_cString, ptr, len);
00413 }
00414
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418 VALUE str = rb_str_new(ptr, len);
00419 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420 return str;
00421 }
00422
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426 VALUE str = rb_str_new(ptr, len);
00427 rb_enc_associate(str, enc);
00428 return str;
00429 }
00430
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434 if (!ptr) {
00435 rb_raise(rb_eArgError, "NULL pointer given");
00436 }
00437 return rb_str_new(ptr, strlen(ptr));
00438 }
00439
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446 VALUE str = rb_str_new2(ptr);
00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448 return str;
00449 }
00450
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457 VALUE str = rb_str_new(ptr, len);
00458
00459 OBJ_TAINT(str);
00460 return str;
00461 }
00462
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466 VALUE str = rb_str_new2(ptr);
00467
00468 OBJ_TAINT(str);
00469 return str;
00470 }
00471
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478 rb_econv_t *ec;
00479 rb_econv_result_t ret;
00480 long len;
00481 VALUE newstr;
00482 const unsigned char *sp;
00483 unsigned char *dp;
00484
00485 if (!to) return str;
00486 if (from == to) return str;
00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488 to == rb_ascii8bit_encoding()) {
00489 if (STR_ENC_GET(str) != to) {
00490 str = rb_str_dup(str);
00491 rb_enc_associate(str, to);
00492 }
00493 return str;
00494 }
00495
00496 len = RSTRING_LEN(str);
00497 newstr = rb_str_new(0, len);
00498
00499 retry:
00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501 if (!ec) return str;
00502
00503 sp = (unsigned char*)RSTRING_PTR(str);
00504 dp = (unsigned char*)RSTRING_PTR(newstr);
00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507 rb_econv_close(ec);
00508 switch (ret) {
00509 case econv_destination_buffer_full:
00510
00511 len = len < 2 ? 2 : len * 2;
00512 rb_str_resize(newstr, len);
00513 goto retry;
00514
00515 case econv_finished:
00516 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517 rb_str_set_len(newstr, len);
00518 rb_enc_associate(newstr, to);
00519 return newstr;
00520
00521 default:
00522
00523 return str;
00524 }
00525 }
00526
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536 VALUE str;
00537
00538 str = rb_tainted_str_new(ptr, len);
00539 if (eenc == rb_usascii_encoding() &&
00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541 rb_enc_associate(str, rb_ascii8bit_encoding());
00542 return str;
00543 }
00544 rb_enc_associate(str, eenc);
00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606 STR_SET_EMBED(str2);
00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609 }
00610 else {
00611 str = rb_str_new_frozen(str);
00612 FL_SET(str2, STR_NOEMBED);
00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615 RSTRING(str2)->as.heap.aux.shared = str;
00616 FL_SET(str2, ELTS_SHARED);
00617 }
00618 rb_enc_cr_str_exact_copy(str2, str);
00619
00620 return str2;
00621 }
00622
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626 return str_replace_shared(str_alloc(klass), str);
00627 }
00628
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632 return str_new_shared(klass, str);
00633 }
00634
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638 VALUE str2 = str_new3(rb_obj_class(str), str);
00639
00640 OBJ_INFECT(str2, str);
00641 return str2;
00642 }
00643
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650 VALUE str2;
00651
00652 str2 = str_alloc(klass);
00653 STR_SET_NOEMBED(str2);
00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656 if (STR_SHARED_P(str)) {
00657 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658 assert(OBJ_FROZEN(shared));
00659 FL_SET(str2, ELTS_SHARED);
00660 RSTRING(str2)->as.heap.aux.shared = shared;
00661 }
00662 else {
00663 FL_SET(str, ELTS_SHARED);
00664 RSTRING(str)->as.heap.aux.shared = str2;
00665 }
00666 rb_enc_cr_str_exact_copy(str2, str);
00667 OBJ_INFECT(str2, str);
00668 return str2;
00669 }
00670
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674 VALUE klass, str;
00675
00676 if (OBJ_FROZEN(orig)) return orig;
00677 klass = rb_obj_class(orig);
00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679 long ofs;
00680 assert(OBJ_FROZEN(str));
00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684 ENCODING_GET(str) != ENCODING_GET(orig)) {
00685 str = str_new3(klass, str);
00686 RSTRING(str)->as.heap.ptr += ofs;
00687 RSTRING(str)->as.heap.len -= ofs;
00688 rb_enc_cr_str_exact_copy(str, orig);
00689 OBJ_INFECT(str, orig);
00690 }
00691 }
00692 else if (STR_EMBED_P(orig)) {
00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694 rb_enc_cr_str_exact_copy(str, orig);
00695 OBJ_INFECT(str, orig);
00696 }
00697 else if (STR_ASSOC_P(orig)) {
00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699 FL_UNSET(orig, STR_ASSOC);
00700 str = str_new4(klass, orig);
00701 FL_SET(str, STR_ASSOC);
00702 RSTRING(str)->as.heap.aux.shared = assoc;
00703 }
00704 else {
00705 str = str_new4(klass, orig);
00706 }
00707 OBJ_FREEZE(str);
00708 return str;
00709 }
00710
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717 return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721 rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727 VALUE v = rb_str_new5(str, 0, 0);
00728 rb_enc_copy(v, str);
00729 OBJ_INFECT(v, str);
00730 return v;
00731 }
00732
00733 #define STR_BUF_MIN_SIZE 128
00734
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738 VALUE str = str_alloc(rb_cString);
00739
00740 if (capa < STR_BUF_MIN_SIZE) {
00741 capa = STR_BUF_MIN_SIZE;
00742 }
00743 FL_SET(str, STR_NOEMBED);
00744 RSTRING(str)->as.heap.aux.capa = capa;
00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746 RSTRING(str)->as.heap.ptr[0] = '\0';
00747
00748 return str;
00749 }
00750
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754 VALUE str;
00755 long len = strlen(ptr);
00756
00757 str = rb_str_buf_new(len);
00758 rb_str_buf_cat(str, ptr, len);
00759
00760 return str;
00761 }
00762
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769 return str_new(0, 0, len);
00770 }
00771
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775 VALUE s = rb_str_tmp_new(len);
00776 *store = s;
00777 return RSTRING_PTR(s);
00778 }
00779
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783 VALUE s = *store;
00784 *store = 0;
00785 if (s) rb_str_clear(s);
00786 }
00787
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792 xfree(RSTRING(str)->as.heap.ptr);
00793 }
00794 }
00795
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800 return RSTRING(str)->as.heap.aux.capa;
00801 }
00802 else {
00803 return 0;
00804 }
00805 }
00806
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810 return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812
00813 static inline void str_discard(VALUE str);
00814
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818 rb_encoding *enc;
00819 int cr;
00820 if (str == str2) return;
00821 enc = STR_ENC_GET(str2);
00822 cr = ENC_CODERANGE(str2);
00823 str_discard(str);
00824 OBJ_INFECT(str, str2);
00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826 STR_SET_EMBED(str);
00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829 rb_enc_associate(str, enc);
00830 ENC_CODERANGE_SET(str, cr);
00831 return;
00832 }
00833 STR_SET_NOEMBED(str);
00834 STR_UNSET_NOCAPA(str);
00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837 if (STR_NOCAPA_P(str2)) {
00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840 }
00841 else {
00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843 }
00844 STR_SET_EMBED(str2);
00845 RSTRING_PTR(str2)[0] = 0;
00846 STR_SET_EMBED_LEN(str2, 0);
00847 rb_enc_associate(str, enc);
00848 ENC_CODERANGE_SET(str, cr);
00849 }
00850
00851 static ID id_to_s;
00852
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856 VALUE str;
00857
00858 if (TYPE(obj) == T_STRING) {
00859 return obj;
00860 }
00861 str = rb_funcall(obj, id_to_s, 0);
00862 if (TYPE(str) != T_STRING)
00863 return rb_any_to_s(obj);
00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865 return str;
00866 }
00867
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871 long len;
00872
00873 len = RSTRING_LEN(str2);
00874 if (STR_ASSOC_P(str2)) {
00875 str2 = rb_str_new4(str2);
00876 }
00877 if (STR_SHARED_P(str2)) {
00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879 assert(OBJ_FROZEN(shared));
00880 STR_SET_NOEMBED(str);
00881 RSTRING(str)->as.heap.len = len;
00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883 FL_SET(str, ELTS_SHARED);
00884 FL_UNSET(str, STR_ASSOC);
00885 RSTRING(str)->as.heap.aux.shared = shared;
00886 }
00887 else {
00888 str_replace_shared(str, str2);
00889 }
00890
00891 OBJ_INFECT(str, str2);
00892 rb_enc_cr_str_exact_copy(str, str2);
00893 return str;
00894 }
00895
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899 VALUE dup = str_alloc(klass);
00900 str_replace(dup, str);
00901 return dup;
00902 }
00903
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907 return str_duplicate(rb_obj_class(str), str);
00908 }
00909
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913 return str_replace(str_alloc(rb_cString), str);
00914 }
00915
00916
00917
00918
00919
00920
00921
00922
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926 VALUE orig;
00927
00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929 rb_str_replace(str, orig);
00930 return str;
00931 }
00932
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936 long c;
00937 const char *q;
00938
00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941 }
00942 else if (rb_enc_asciicompat(enc)) {
00943 c = 0;
00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945 while (p < e) {
00946 if (ISASCII(*p)) {
00947 q = search_nonascii(p, e);
00948 if (!q)
00949 return c + (e - p);
00950 c += q - p;
00951 p = q;
00952 }
00953 p += rb_enc_fast_mbclen(p, e, enc);
00954 c++;
00955 }
00956 }
00957 else {
00958 while (p < e) {
00959 if (ISASCII(*p)) {
00960 q = search_nonascii(p, e);
00961 if (!q)
00962 return c + (e - p);
00963 c += q - p;
00964 p = q;
00965 }
00966 p += rb_enc_mbclen(p, e, enc);
00967 c++;
00968 }
00969 }
00970 return c;
00971 }
00972
00973 for (c=0; p<e; c++) {
00974 p += rb_enc_mbclen(p, e, enc);
00975 }
00976 return c;
00977 }
00978
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
00988 long c;
00989 const char *q;
00990 int ret;
00991
00992 *cr = 0;
00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995 }
00996 else if (rb_enc_asciicompat(enc)) {
00997 c = 0;
00998 while (p < e) {
00999 if (ISASCII(*p)) {
01000 q = search_nonascii(p, e);
01001 if (!q) {
01002 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003 return c + (e - p);
01004 }
01005 c += q - p;
01006 p = q;
01007 }
01008 ret = rb_enc_precise_mbclen(p, e, enc);
01009 if (MBCLEN_CHARFOUND_P(ret)) {
01010 *cr |= ENC_CODERANGE_VALID;
01011 p += MBCLEN_CHARFOUND_LEN(ret);
01012 }
01013 else {
01014 *cr = ENC_CODERANGE_BROKEN;
01015 p++;
01016 }
01017 c++;
01018 }
01019 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020 return c;
01021 }
01022
01023 for (c=0; p<e; c++) {
01024 ret = rb_enc_precise_mbclen(p, e, enc);
01025 if (MBCLEN_CHARFOUND_P(ret)) {
01026 *cr |= ENC_CODERANGE_VALID;
01027 p += MBCLEN_CHARFOUND_LEN(ret);
01028 }
01029 else {
01030 *cr = ENC_CODERANGE_BROKEN;
01031 if (p + rb_enc_mbminlen(enc) <= e)
01032 p += rb_enc_mbminlen(enc);
01033 else
01034 p = e;
01035 }
01036 }
01037 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038 return c;
01039 }
01040
#ifdef NONASCII_MASK
/* True unless the byte has the bit pattern 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Returns how many bytes of the VALUE-sized word at +s+ satisfy
 * is_utf8_lead_byte(), all bytes being examined at once:
 *
 *  1. OR-ing the word with the complement of itself shifted right by one
 *     leaves bit 6 of each byte set unless that byte is 10xxxxxx;
 *  2. shifting right by 6 and masking with 0x01 per byte turns that bit
 *     into a 0/1 flag in every byte;
 *  3. the shifted additions fold the per-byte flags into the low byte,
 *     whose low four bits hold the total (at most sizeof(VALUE)).
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE word = *s;

    /* per-byte 0/1 flag: is this byte something other than 10xxxxxx? */
    word = (word | ~(word >> 1)) >> 6;
    word &= NONASCII_MASK >> 7;

    /* horizontal sum of the flags into the lowest byte */
    word += word >> 8;
    word += word >> 16;
#if SIZEOF_VALUE == 8
    word += word >> 32;
#endif
    return word & 0xF;
}
#endif
01075
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
01079 const char *p, *e;
01080 long n;
01081 int cr;
01082
01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084 if (!enc) enc = STR_ENC_GET(str);
01085 p = RSTRING_PTR(str);
01086 e = RSTRING_END(str);
01087 cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090 enc == rb_utf8_encoding()) {
01091
01092 VALUE len = 0;
01093 if ((int)sizeof(VALUE) * 2 < e - p) {
01094 const VALUE *s, *t;
01095 const VALUE lowbits = sizeof(VALUE) - 1;
01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097 t = (const VALUE*)(~lowbits & (VALUE)e);
01098 while (p < (const char *)s) {
01099 if (is_utf8_lead_byte(*p)) len++;
01100 p++;
01101 }
01102 while (s < t) {
01103 len += count_utf8_lead_bytes_with_word(s);
01104 s++;
01105 }
01106 p = (const char *)s;
01107 }
01108 while (p < e) {
01109 if (is_utf8_lead_byte(*p)) len++;
01110 p++;
01111 }
01112 return (long)len;
01113 }
01114 #endif
01115 n = rb_enc_strlen_cr(p, e, enc, &cr);
01116 if (cr) {
01117 ENC_CODERANGE_SET(str, cr);
01118 }
01119 return n;
01120 }
01121
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125 return str_strlen(str, STR_ENC_GET(str));
01126 }
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139 long len;
01140
01141 len = str_strlen(str, STR_ENC_GET(str));
01142 return LONG2NUM(len);
01143 }
01144
01145
01146
01147
01148
01149
01150
01151
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155 return LONG2NUM(RSTRING_LEN(str));
01156 }
01157
01158
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171 if (RSTRING_LEN(str) == 0)
01172 return Qtrue;
01173 return Qfalse;
01174 }
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189 VALUE str3;
01190 rb_encoding *enc;
01191
01192 StringValue(str2);
01193 enc = rb_enc_check(str1, str2);
01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197 RSTRING_PTR(str2), RSTRING_LEN(str2));
01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199
01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201 OBJ_TAINT(str3);
01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204 return str3;
01205 }
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220 VALUE str2;
01221 long n, len;
01222 char *ptr2;
01223
01224 len = NUM2LONG(times);
01225 if (len < 0) {
01226 rb_raise(rb_eArgError, "negative argument");
01227 }
01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01229 rb_raise(rb_eArgError, "argument too big");
01230 }
01231
01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233 ptr2 = RSTRING_PTR(str2);
01234 if (len) {
01235 n = RSTRING_LEN(str);
01236 memcpy(ptr2, RSTRING_PTR(str), n);
01237 while (n <= len/2) {
01238 memcpy(ptr2 + n, ptr2, n);
01239 n *= 2;
01240 }
01241 memcpy(ptr2 + n, ptr2, len-n);
01242 }
01243 ptr2[RSTRING_LEN(str2)] = '\0';
01244 OBJ_INFECT(str2, str);
01245 rb_enc_cr_str_copy_for_substr(str2, str);
01246
01247 return str2;
01248 }
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268 volatile VALUE tmp = rb_check_array_type(arg);
01269
01270 if (!NIL_P(tmp)) {
01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272 }
01273 return rb_str_format(1, &arg, str);
01274 }
01275
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279 if (FL_TEST(str, STR_TMPLOCK)) {
01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281 }
01282 rb_check_frozen(str);
01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290 str_modifiable(str);
01291 if (!STR_SHARED_P(str)) return 1;
01292 if (STR_EMBED_P(str)) return 1;
01293 return 0;
01294 }
01295
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
01299 char *ptr;
01300 long len = RSTRING_LEN(str);
01301 long capa = len + expand;
01302
01303 if (len > capa) len = capa;
01304 ptr = ALLOC_N(char, capa + 1);
01305 if (RSTRING_PTR(str)) {
01306 memcpy(ptr, RSTRING_PTR(str), len);
01307 }
01308 STR_SET_NOEMBED(str);
01309 STR_UNSET_NOCAPA(str);
01310 ptr[len] = 0;
01311 RSTRING(str)->as.heap.ptr = ptr;
01312 RSTRING(str)->as.heap.len = len;
01313 RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321 if (!str_independent(str))
01322 str_make_independent(str);
01323 ENC_CODERANGE_CLEAR(str);
01324 }
01325
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329 if (expand < 0) {
01330 rb_raise(rb_eArgError, "negative expanding string size");
01331 }
01332 if (!str_independent(str)) {
01333 str_make_independent_expand(str, expand);
01334 }
01335 else if (expand > 0) {
01336 long len = RSTRING_LEN(str);
01337 long capa = len + expand;
01338 if (!STR_EMBED_P(str)) {
01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340 STR_UNSET_NOCAPA(str);
01341 RSTRING(str)->as.heap.aux.capa = capa;
01342 }
01343 else if (capa > RSTRING_EMBED_LEN_MAX) {
01344 str_make_independent_expand(str, expand);
01345 }
01346 }
01347 ENC_CODERANGE_CLEAR(str);
01348 }
01349
01350
01351 static void
01352 str_modify_keep_cr(VALUE str)
01353 {
01354 if (!str_independent(str))
01355 str_make_independent(str);
01356 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01357
01358 ENC_CODERANGE_CLEAR(str);
01359 }
01360
01361 static inline void
01362 str_discard(VALUE str)
01363 {
01364 str_modifiable(str);
01365 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01366 xfree(RSTRING_PTR(str));
01367 RSTRING(str)->as.heap.ptr = 0;
01368 RSTRING(str)->as.heap.len = 0;
01369 }
01370 }
01371
01372 void
01373 rb_str_associate(VALUE str, VALUE add)
01374 {
01375
01376 rb_check_frozen(str);
01377 if (STR_ASSOC_P(str)) {
01378
01379 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01380 }
01381 else {
01382 if (STR_SHARED_P(str)) {
01383 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01384 str_make_independent(str);
01385 if (STR_ASSOC_P(assoc)) {
01386 assoc = RSTRING(assoc)->as.heap.aux.shared;
01387 rb_ary_concat(assoc, add);
01388 add = assoc;
01389 }
01390 }
01391 else if (STR_EMBED_P(str)) {
01392 str_make_independent(str);
01393 }
01394 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01395 RESIZE_CAPA(str, RSTRING_LEN(str));
01396 }
01397 FL_SET(str, STR_ASSOC);
01398 RBASIC(add)->klass = 0;
01399 RSTRING(str)->as.heap.aux.shared = add;
01400 }
01401 }
01402
01403 VALUE
01404 rb_str_associated(VALUE str)
01405 {
01406 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01407 if (STR_ASSOC_P(str)) {
01408 return RSTRING(str)->as.heap.aux.shared;
01409 }
01410 return Qfalse;
01411 }
01412
01413 VALUE
01414 rb_string_value(volatile VALUE *ptr)
01415 {
01416 VALUE s = *ptr;
01417 if (TYPE(s) != T_STRING) {
01418 s = rb_str_to_str(s);
01419 *ptr = s;
01420 }
01421 return s;
01422 }
01423
01424 char *
01425 rb_string_value_ptr(volatile VALUE *ptr)
01426 {
01427 VALUE str = rb_string_value(ptr);
01428 return RSTRING_PTR(str);
01429 }
01430
01431 char *
01432 rb_string_value_cstr(volatile VALUE *ptr)
01433 {
01434 VALUE str = rb_string_value(ptr);
01435 char *s = RSTRING_PTR(str);
01436 long len = RSTRING_LEN(str);
01437
01438 if (!s || memchr(s, 0, len)) {
01439 rb_raise(rb_eArgError, "string contains null byte");
01440 }
01441 if (s[len]) {
01442 rb_str_modify(str);
01443 s = RSTRING_PTR(str);
01444 s[RSTRING_LEN(str)] = 0;
01445 }
01446 return s;
01447 }
01448
01449 VALUE
01450 rb_check_string_type(VALUE str)
01451 {
01452 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01453 return str;
01454 }
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466
01467 static VALUE
01468 rb_str_s_try_convert(VALUE dummy, VALUE str)
01469 {
01470 return rb_check_string_type(str);
01471 }
01472
01473 static char*
01474 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01475 {
01476 long nth = *nthp;
01477 if (rb_enc_mbmaxlen(enc) == 1) {
01478 p += nth;
01479 }
01480 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01481 p += nth * rb_enc_mbmaxlen(enc);
01482 }
01483 else if (rb_enc_asciicompat(enc)) {
01484 const char *p2, *e2;
01485 int n;
01486
01487 while (p < e && 0 < nth) {
01488 e2 = p + nth;
01489 if (e < e2) {
01490 *nthp = nth;
01491 return (char *)e;
01492 }
01493 if (ISASCII(*p)) {
01494 p2 = search_nonascii(p, e2);
01495 if (!p2) {
01496 *nthp = nth;
01497 return (char *)e2;
01498 }
01499 nth -= p2 - p;
01500 p = p2;
01501 }
01502 n = rb_enc_mbclen(p, e, enc);
01503 p += n;
01504 nth--;
01505 }
01506 *nthp = nth;
01507 if (nth != 0) {
01508 return (char *)e;
01509 }
01510 return (char *)p;
01511 }
01512 else {
01513 while (p < e && nth--) {
01514 p += rb_enc_mbclen(p, e, enc);
01515 }
01516 }
01517 if (p > e) p = e;
01518 *nthp = nth;
01519 return (char*)p;
01520 }
01521
01522 char*
01523 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01524 {
01525 return str_nth_len(p, e, &nth, enc);
01526 }
01527
01528 static char*
01529 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01530 {
01531 if (singlebyte)
01532 p += nth;
01533 else {
01534 p = str_nth_len(p, e, &nth, enc);
01535 }
01536 if (!p) return 0;
01537 if (p > e) p = e;
01538 return (char *)p;
01539 }
01540
01541
01542 static long
01543 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01544 {
01545 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01546 if (!pp) return e - p;
01547 return pp - p;
01548 }
01549
01550 long
01551 rb_str_offset(VALUE str, long pos)
01552 {
01553 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01554 STR_ENC_GET(str), single_byte_optimizable(str));
01555 }
01556
01557 #ifdef NONASCII_MASK
01558 static char *
01559 str_utf8_nth(const char *p, const char *e, long *nthp)
01560 {
01561 long nth = *nthp;
01562 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01563 const VALUE *s, *t;
01564 const VALUE lowbits = sizeof(VALUE) - 1;
01565 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01566 t = (const VALUE*)(~lowbits & (VALUE)e);
01567 while (p < (const char *)s) {
01568 if (is_utf8_lead_byte(*p)) nth--;
01569 p++;
01570 }
01571 do {
01572 nth -= count_utf8_lead_bytes_with_word(s);
01573 s++;
01574 } while (s < t && (int)sizeof(VALUE) <= nth);
01575 p = (char *)s;
01576 }
01577 while (p < e) {
01578 if (is_utf8_lead_byte(*p)) {
01579 if (nth == 0) break;
01580 nth--;
01581 }
01582 p++;
01583 }
01584 *nthp = nth;
01585 return (char *)p;
01586 }
01587
/*
 * Returns the byte offset from +p+ of the nth UTF-8 character in [p, e),
 * as located by str_utf8_nth.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *hit = str_utf8_nth(p, e, &remaining);

    return hit - p;
}
01594 #endif
01595
01596
01597 long
01598 rb_str_sublen(VALUE str, long pos)
01599 {
01600 if (single_byte_optimizable(str) || pos < 0)
01601 return pos;
01602 else {
01603 char *p = RSTRING_PTR(str);
01604 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01605 }
01606 }
01607
01608 VALUE
01609 rb_str_subseq(VALUE str, long beg, long len)
01610 {
01611 VALUE str2;
01612
01613 if (RSTRING_LEN(str) == beg + len &&
01614 RSTRING_EMBED_LEN_MAX < len) {
01615 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01616 rb_str_drop_bytes(str2, beg);
01617 }
01618 else {
01619 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01620 }
01621
01622 rb_enc_cr_str_copy_for_substr(str2, str);
01623 OBJ_INFECT(str2, str);
01624
01625 return str2;
01626 }
01627
01628 VALUE
01629 rb_str_substr(VALUE str, long beg, long len)
01630 {
01631 rb_encoding *enc = STR_ENC_GET(str);
01632 VALUE str2;
01633 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01634
01635 if (len < 0) return Qnil;
01636 if (!RSTRING_LEN(str)) {
01637 len = 0;
01638 }
01639 if (single_byte_optimizable(str)) {
01640 if (beg > RSTRING_LEN(str)) return Qnil;
01641 if (beg < 0) {
01642 beg += RSTRING_LEN(str);
01643 if (beg < 0) return Qnil;
01644 }
01645 if (beg + len > RSTRING_LEN(str))
01646 len = RSTRING_LEN(str) - beg;
01647 if (len <= 0) {
01648 len = 0;
01649 p = 0;
01650 }
01651 else
01652 p = s + beg;
01653 goto sub;
01654 }
01655 if (beg < 0) {
01656 if (len > -beg) len = -beg;
01657 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01658 beg = -beg;
01659 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01660 p = e;
01661 if (!p) return Qnil;
01662 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01663 if (!p) return Qnil;
01664 len = e - p;
01665 goto sub;
01666 }
01667 else {
01668 beg += str_strlen(str, enc);
01669 if (beg < 0) return Qnil;
01670 }
01671 }
01672 else if (beg > 0 && beg > RSTRING_LEN(str)) {
01673 return Qnil;
01674 }
01675 if (len == 0) {
01676 if (beg > str_strlen(str, enc)) return Qnil;
01677 p = 0;
01678 }
01679 #ifdef NONASCII_MASK
01680 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01681 enc == rb_utf8_encoding()) {
01682 p = str_utf8_nth(s, e, &beg);
01683 if (beg > 0) return Qnil;
01684 len = str_utf8_offset(p, e, len);
01685 }
01686 #endif
01687 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01688 int char_sz = rb_enc_mbmaxlen(enc);
01689
01690 p = s + beg * char_sz;
01691 if (p > e) {
01692 return Qnil;
01693 }
01694 else if (len * char_sz > e - p)
01695 len = e - p;
01696 else
01697 len *= char_sz;
01698 }
01699 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01700 if (beg > 0) return Qnil;
01701 len = 0;
01702 }
01703 else {
01704 len = str_offset(p, e, len, enc, 0);
01705 }
01706 sub:
01707 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01708 str2 = rb_str_new4(str);
01709 str2 = str_new3(rb_obj_class(str2), str2);
01710 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01711 RSTRING(str2)->as.heap.len = len;
01712 }
01713 else {
01714 str2 = rb_str_new5(str, p, len);
01715 rb_enc_cr_str_copy_for_substr(str2, str);
01716 OBJ_INFECT(str2, str);
01717 }
01718
01719 return str2;
01720 }
01721
01722 VALUE
01723 rb_str_freeze(VALUE str)
01724 {
01725 if (STR_ASSOC_P(str)) {
01726 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01727 OBJ_FREEZE(ary);
01728 }
01729 return rb_obj_freeze(str);
01730 }
01731
/*
 * rb_str_dup_frozen is another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION presumably emits a real
 * rb_str_dup_frozen(VALUE) symbol that forwards to rb_str_new_frozen;
 * confirm against the macro's definition.  The #define below makes
 * later uses of the name in this file refer to rb_str_new_frozen directly.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
01734
01735 VALUE
01736 rb_str_locktmp(VALUE str)
01737 {
01738 if (FL_TEST(str, STR_TMPLOCK)) {
01739 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01740 }
01741 FL_SET(str, STR_TMPLOCK);
01742 return str;
01743 }
01744
01745 VALUE
01746 rb_str_unlocktmp(VALUE str)
01747 {
01748 if (!FL_TEST(str, STR_TMPLOCK)) {
01749 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01750 }
01751 FL_UNSET(str, STR_TMPLOCK);
01752 return str;
01753 }
01754
01755 VALUE
01756 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
01757 {
01758 rb_str_locktmp(str);
01759 return rb_ensure(func, arg, rb_str_unlocktmp, str);
01760 }
01761
01762 void
01763 rb_str_set_len(VALUE str, long len)
01764 {
01765 long capa;
01766
01767 str_modifiable(str);
01768 if (STR_SHARED_P(str)) {
01769 rb_raise(rb_eRuntimeError, "can't set length of shared string");
01770 }
01771 if (len > (capa = (long)rb_str_capacity(str))) {
01772 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01773 }
01774 STR_SET_LEN(str, len);
01775 RSTRING_PTR(str)[len] = '\0';
01776 }
01777
01778 VALUE
01779 rb_str_resize(VALUE str, long len)
01780 {
01781 long slen;
01782 int independent;
01783
01784 if (len < 0) {
01785 rb_raise(rb_eArgError, "negative string size (or size too big)");
01786 }
01787
01788 independent = str_independent(str);
01789 ENC_CODERANGE_CLEAR(str);
01790 slen = RSTRING_LEN(str);
01791 if (len != slen) {
01792 if (STR_EMBED_P(str)) {
01793 if (len <= RSTRING_EMBED_LEN_MAX) {
01794 STR_SET_EMBED_LEN(str, len);
01795 RSTRING(str)->as.ary[len] = '\0';
01796 return str;
01797 }
01798 str_make_independent_expand(str, len - slen);
01799 STR_SET_NOEMBED(str);
01800 }
01801 else if (len <= RSTRING_EMBED_LEN_MAX) {
01802 char *ptr = RSTRING(str)->as.heap.ptr;
01803 STR_SET_EMBED(str);
01804 if (slen > len) slen = len;
01805 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01806 RSTRING(str)->as.ary[len] = '\0';
01807 STR_SET_EMBED_LEN(str, len);
01808 if (independent) xfree(ptr);
01809 return str;
01810 }
01811 else if (!independent) {
01812 str_make_independent_expand(str, len - slen);
01813 }
01814 else if (slen < len || slen - len > 1024) {
01815 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01816 }
01817 if (!STR_NOCAPA_P(str)) {
01818 RSTRING(str)->as.heap.aux.capa = len;
01819 }
01820 RSTRING(str)->as.heap.len = len;
01821 RSTRING(str)->as.heap.ptr[len] = '\0';
01822 }
01823 return str;
01824 }
01825
01826 static VALUE
01827 str_buf_cat(VALUE str, const char *ptr, long len)
01828 {
01829 long capa, total, off = -1;
01830
01831 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01832 off = ptr - RSTRING_PTR(str);
01833 }
01834 rb_str_modify(str);
01835 if (len == 0) return 0;
01836 if (STR_ASSOC_P(str)) {
01837 FL_UNSET(str, STR_ASSOC);
01838 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01839 }
01840 else if (STR_EMBED_P(str)) {
01841 capa = RSTRING_EMBED_LEN_MAX;
01842 }
01843 else {
01844 capa = RSTRING(str)->as.heap.aux.capa;
01845 }
01846 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01847 rb_raise(rb_eArgError, "string sizes too big");
01848 }
01849 total = RSTRING_LEN(str)+len;
01850 if (capa <= total) {
01851 while (total > capa) {
01852 if (capa + 1 >= LONG_MAX / 2) {
01853 capa = (total + 4095) / 4096;
01854 break;
01855 }
01856 capa = (capa + 1) * 2;
01857 }
01858 RESIZE_CAPA(str, capa);
01859 }
01860 if (off != -1) {
01861 ptr = RSTRING_PTR(str) + off;
01862 }
01863 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01864 STR_SET_LEN(str, total);
01865 RSTRING_PTR(str)[total] = '\0';
01866
01867 return str;
01868 }
01869
01870 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01871
01872 VALUE
01873 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01874 {
01875 if (len == 0) return str;
01876 if (len < 0) {
01877 rb_raise(rb_eArgError, "negative string size (or size too big)");
01878 }
01879 return str_buf_cat(str, ptr, len);
01880 }
01881
01882 VALUE
01883 rb_str_buf_cat2(VALUE str, const char *ptr)
01884 {
01885 return rb_str_buf_cat(str, ptr, strlen(ptr));
01886 }
01887
01888 VALUE
01889 rb_str_cat(VALUE str, const char *ptr, long len)
01890 {
01891 if (len < 0) {
01892 rb_raise(rb_eArgError, "negative string size (or size too big)");
01893 }
01894 if (STR_ASSOC_P(str)) {
01895 char *p;
01896 rb_str_modify_expand(str, len);
01897 p = RSTRING(str)->as.heap.ptr;
01898 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01899 len = RSTRING(str)->as.heap.len += len;
01900 p[len] = '\0';
01901 return str;
01902 }
01903
01904 return rb_str_buf_cat(str, ptr, len);
01905 }
01906
01907 VALUE
01908 rb_str_cat2(VALUE str, const char *ptr)
01909 {
01910 return rb_str_cat(str, ptr, strlen(ptr));
01911 }
01912
01913 static VALUE
01914 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01915 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01916 {
01917 int str_encindex = ENCODING_GET(str);
01918 int res_encindex;
01919 int str_cr, res_cr;
01920
01921 str_cr = ENC_CODERANGE(str);
01922
01923 if (str_encindex == ptr_encindex) {
01924 if (str_cr == ENC_CODERANGE_UNKNOWN)
01925 ptr_cr = ENC_CODERANGE_UNKNOWN;
01926 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01927 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01928 }
01929 }
01930 else {
01931 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01932 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01933 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01934 if (len == 0)
01935 return str;
01936 if (RSTRING_LEN(str) == 0) {
01937 rb_str_buf_cat(str, ptr, len);
01938 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01939 return str;
01940 }
01941 goto incompatible;
01942 }
01943 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01944 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01945 }
01946 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01947 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01948 str_cr = rb_enc_str_coderange(str);
01949 }
01950 }
01951 }
01952 if (ptr_cr_ret)
01953 *ptr_cr_ret = ptr_cr;
01954
01955 if (str_encindex != ptr_encindex &&
01956 str_cr != ENC_CODERANGE_7BIT &&
01957 ptr_cr != ENC_CODERANGE_7BIT) {
01958 incompatible:
01959 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01960 rb_enc_name(rb_enc_from_index(str_encindex)),
01961 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01962 }
01963
01964 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01965 res_encindex = str_encindex;
01966 res_cr = ENC_CODERANGE_UNKNOWN;
01967 }
01968 else if (str_cr == ENC_CODERANGE_7BIT) {
01969 if (ptr_cr == ENC_CODERANGE_7BIT) {
01970 res_encindex = str_encindex;
01971 res_cr = ENC_CODERANGE_7BIT;
01972 }
01973 else {
01974 res_encindex = ptr_encindex;
01975 res_cr = ptr_cr;
01976 }
01977 }
01978 else if (str_cr == ENC_CODERANGE_VALID) {
01979 res_encindex = str_encindex;
01980 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01981 res_cr = str_cr;
01982 else
01983 res_cr = ptr_cr;
01984 }
01985 else {
01986 res_encindex = str_encindex;
01987 res_cr = str_cr;
01988 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01989 }
01990
01991 if (len < 0) {
01992 rb_raise(rb_eArgError, "negative string size (or size too big)");
01993 }
01994 str_buf_cat(str, ptr, len);
01995 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01996 return str;
01997 }
01998
01999 VALUE
02000 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02001 {
02002 return rb_enc_cr_str_buf_cat(str, ptr, len,
02003 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02004 }
02005
02006 VALUE
02007 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02008 {
02009
02010 int encindex = ENCODING_GET(str);
02011 rb_encoding *enc = rb_enc_from_index(encindex);
02012 if (rb_enc_asciicompat(enc)) {
02013 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02014 encindex, ENC_CODERANGE_7BIT, 0);
02015 }
02016 else {
02017 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02018 while (*ptr) {
02019 unsigned int c = (unsigned char)*ptr;
02020 int len = rb_enc_codelen(c, enc);
02021 rb_enc_mbcput(c, buf, enc);
02022 rb_enc_cr_str_buf_cat(str, buf, len,
02023 encindex, ENC_CODERANGE_VALID, 0);
02024 ptr++;
02025 }
02026 return str;
02027 }
02028 }
02029
02030 VALUE
02031 rb_str_buf_append(VALUE str, VALUE str2)
02032 {
02033 int str2_cr;
02034
02035 str2_cr = ENC_CODERANGE(str2);
02036
02037 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02038 ENCODING_GET(str2), str2_cr, &str2_cr);
02039
02040 OBJ_INFECT(str, str2);
02041 ENC_CODERANGE_SET(str2, str2_cr);
02042
02043 return str;
02044 }
02045
02046 VALUE
02047 rb_str_append(VALUE str, VALUE str2)
02048 {
02049 rb_encoding *enc;
02050 int cr, cr2;
02051 long len2;
02052
02053 StringValue(str2);
02054 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02055 long len = RSTRING_LEN(str) + len2;
02056 enc = rb_enc_check(str, str2);
02057 cr = ENC_CODERANGE(str);
02058 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02059 rb_str_modify_expand(str, len2);
02060 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02061 RSTRING_PTR(str2), len2+1);
02062 RSTRING(str)->as.heap.len = len;
02063 rb_enc_associate(str, enc);
02064 ENC_CODERANGE_SET(str, cr);
02065 OBJ_INFECT(str, str2);
02066 return str;
02067 }
02068 return rb_str_buf_append(str, str2);
02069 }
02070
02071
02072
02073
02074
02075
02076
02077
02078
02079
02080
02081
02082
02083
02084
02085
02086
02087 VALUE
02088 rb_str_concat(VALUE str1, VALUE str2)
02089 {
02090 unsigned int code;
02091 rb_encoding *enc = STR_ENC_GET(str1);
02092
02093 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02094 if (rb_num_to_uint(str2, &code) == 0) {
02095 }
02096 else if (FIXNUM_P(str2)) {
02097 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02098 }
02099 else {
02100 rb_raise(rb_eRangeError, "bignum out of char range");
02101 }
02102 }
02103 else {
02104 return rb_str_append(str1, str2);
02105 }
02106
02107 if (enc == rb_usascii_encoding()) {
02108
02109 char buf[1];
02110 buf[0] = (char)code;
02111 if (code > 0xFF) {
02112 rb_raise(rb_eRangeError, "%u out of char range", code);
02113 }
02114 rb_str_cat(str1, buf, 1);
02115 if (code > 127) {
02116 rb_enc_associate(str1, rb_ascii8bit_encoding());
02117 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02118 }
02119 }
02120 else {
02121 long pos = RSTRING_LEN(str1);
02122 int cr = ENC_CODERANGE(str1);
02123 int len;
02124 char *buf;
02125
02126 switch (len = rb_enc_codelen(code, enc)) {
02127 case ONIGERR_INVALID_CODE_POINT_VALUE:
02128 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02129 break;
02130 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02131 case 0:
02132 rb_raise(rb_eRangeError, "%u out of char range", code);
02133 break;
02134 }
02135 buf = ALLOCA_N(char, len + 1);
02136 rb_enc_mbcput(code, buf, enc);
02137 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02138 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02139 }
02140 rb_str_resize(str1, pos+len);
02141 strncpy(RSTRING_PTR(str1) + pos, buf, len);
02142 if (cr == ENC_CODERANGE_7BIT && code > 127)
02143 cr = ENC_CODERANGE_VALID;
02144 ENC_CODERANGE_SET(str1, cr);
02145 }
02146 return str1;
02147 }
02148
02149
02150
02151
02152
02153
02154
02155
02156
02157
02158
02159
02160 static VALUE
02161 rb_str_prepend(VALUE str, VALUE str2)
02162 {
02163 StringValue(str2);
02164 StringValue(str);
02165 rb_str_update(str, 0L, 0L, str2);
02166 return str;
02167 }
02168
02169 st_index_t
02170 rb_str_hash(VALUE str)
02171 {
02172 int e = ENCODING_GET(str);
02173 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02174 e = 0;
02175 }
02176 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02177 }
02178
02179 int
02180 rb_str_hash_cmp(VALUE str1, VALUE str2)
02181 {
02182 long len;
02183
02184 if (!rb_str_comparable(str1, str2)) return 1;
02185 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02186 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02187 return 0;
02188 }
02189 return 1;
02190 }
02191
02192
02193
02194
02195
02196
02197
02198
02199 static VALUE
02200 rb_str_hash_m(VALUE str)
02201 {
02202 st_index_t hval = rb_str_hash(str);
02203 return INT2FIX(hval);
02204 }
02205
02206 #define lesser(a,b) (((a)>(b))?(b):(a))
02207
02208 int
02209 rb_str_comparable(VALUE str1, VALUE str2)
02210 {
02211 int idx1, idx2;
02212 int rc1, rc2;
02213
02214 if (RSTRING_LEN(str1) == 0) return TRUE;
02215 if (RSTRING_LEN(str2) == 0) return TRUE;
02216 idx1 = ENCODING_GET(str1);
02217 idx2 = ENCODING_GET(str2);
02218 if (idx1 == idx2) return TRUE;
02219 rc1 = rb_enc_str_coderange(str1);
02220 rc2 = rb_enc_str_coderange(str2);
02221 if (rc1 == ENC_CODERANGE_7BIT) {
02222 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02223 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02224 return TRUE;
02225 }
02226 if (rc2 == ENC_CODERANGE_7BIT) {
02227 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02228 return TRUE;
02229 }
02230 return FALSE;
02231 }
02232
02233 int
02234 rb_str_cmp(VALUE str1, VALUE str2)
02235 {
02236 long len1, len2;
02237 const char *ptr1, *ptr2;
02238 int retval;
02239
02240 if (str1 == str2) return 0;
02241 RSTRING_GETMEM(str1, ptr1, len1);
02242 RSTRING_GETMEM(str2, ptr2, len2);
02243 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02244 if (len1 == len2) {
02245 if (!rb_str_comparable(str1, str2)) {
02246 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02247 return 1;
02248 return -1;
02249 }
02250 return 0;
02251 }
02252 if (len1 > len2) return 1;
02253 return -1;
02254 }
02255 if (retval > 0) return 1;
02256 return -1;
02257 }
02258
02259
02260 static VALUE
02261 str_eql(const VALUE str1, const VALUE str2)
02262 {
02263 const long len = RSTRING_LEN(str1);
02264 const char *ptr1, *ptr2;
02265
02266 if (len != RSTRING_LEN(str2)) return Qfalse;
02267 if (!rb_str_comparable(str1, str2)) return Qfalse;
02268 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02269 return Qtrue;
02270 if (memcmp(ptr1, ptr2, len) == 0)
02271 return Qtrue;
02272 return Qfalse;
02273 }
02274
02275
02276
02277
02278
02279
02280
02281
02282
02283 VALUE
02284 rb_str_equal(VALUE str1, VALUE str2)
02285 {
02286 if (str1 == str2) return Qtrue;
02287 if (TYPE(str2) != T_STRING) {
02288 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02289 return Qfalse;
02290 }
02291 return rb_equal(str2, str1);
02292 }
02293 return str_eql(str1, str2);
02294 }
02295
02296
02297
02298
02299
02300
02301
02302
02303 static VALUE
02304 rb_str_eql(VALUE str1, VALUE str2)
02305 {
02306 if (str1 == str2) return Qtrue;
02307 if (TYPE(str2) != T_STRING) return Qfalse;
02308 return str_eql(str1, str2);
02309 }
02310
02311
02312
02313
02314
02315
02316
02317
02318
02319
02320
02321
02322
02323
02324
02325
02326
02327
02328
02329
02330
02331
02332
02333
02334 static VALUE
02335 rb_str_cmp_m(VALUE str1, VALUE str2)
02336 {
02337 long result;
02338
02339 if (TYPE(str2) != T_STRING) {
02340 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02341 return Qnil;
02342 }
02343 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02344 return Qnil;
02345 }
02346 else {
02347 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02348
02349 if (NIL_P(tmp)) return Qnil;
02350 if (!FIXNUM_P(tmp)) {
02351 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02352 }
02353 result = -FIX2LONG(tmp);
02354 }
02355 }
02356 else {
02357 result = rb_str_cmp(str1, str2);
02358 }
02359 return LONG2NUM(result);
02360 }
02361
02362
02363
02364
02365
02366
02367
02368
02369
02370
02371
02372
02373
02374 static VALUE
02375 rb_str_casecmp(VALUE str1, VALUE str2)
02376 {
02377 long len;
02378 rb_encoding *enc;
02379 char *p1, *p1end, *p2, *p2end;
02380
02381 StringValue(str2);
02382 enc = rb_enc_compatible(str1, str2);
02383 if (!enc) {
02384 return Qnil;
02385 }
02386
02387 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02388 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02389 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02390 while (p1 < p1end && p2 < p2end) {
02391 if (*p1 != *p2) {
02392 unsigned int c1 = TOUPPER(*p1 & 0xff);
02393 unsigned int c2 = TOUPPER(*p2 & 0xff);
02394 if (c1 != c2)
02395 return INT2FIX(c1 < c2 ? -1 : 1);
02396 }
02397 p1++;
02398 p2++;
02399 }
02400 }
02401 else {
02402 while (p1 < p1end && p2 < p2end) {
02403 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02404 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02405
02406 if (0 <= c1 && 0 <= c2) {
02407 c1 = TOUPPER(c1);
02408 c2 = TOUPPER(c2);
02409 if (c1 != c2)
02410 return INT2FIX(c1 < c2 ? -1 : 1);
02411 }
02412 else {
02413 int r;
02414 l1 = rb_enc_mbclen(p1, p1end, enc);
02415 l2 = rb_enc_mbclen(p2, p2end, enc);
02416 len = l1 < l2 ? l1 : l2;
02417 r = memcmp(p1, p2, len);
02418 if (r != 0)
02419 return INT2FIX(r < 0 ? -1 : 1);
02420 if (l1 != l2)
02421 return INT2FIX(l1 < l2 ? -1 : 1);
02422 }
02423 p1 += l1;
02424 p2 += l2;
02425 }
02426 }
02427 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02428 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02429 return INT2FIX(-1);
02430 }
02431
02432 static long
02433 rb_str_index(VALUE str, VALUE sub, long offset)
02434 {
02435 long pos;
02436 char *s, *sptr, *e;
02437 long len, slen;
02438 rb_encoding *enc;
02439
02440 enc = rb_enc_check(str, sub);
02441 if (is_broken_string(sub)) {
02442 return -1;
02443 }
02444 len = str_strlen(str, enc);
02445 slen = str_strlen(sub, enc);
02446 if (offset < 0) {
02447 offset += len;
02448 if (offset < 0) return -1;
02449 }
02450 if (len - offset < slen) return -1;
02451 s = RSTRING_PTR(str);
02452 e = s + RSTRING_LEN(str);
02453 if (offset) {
02454 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02455 s += offset;
02456 }
02457 if (slen == 0) return offset;
02458
02459 sptr = RSTRING_PTR(sub);
02460 slen = RSTRING_LEN(sub);
02461 len = RSTRING_LEN(str) - offset;
02462 for (;;) {
02463 char *t;
02464 pos = rb_memsearch(sptr, slen, s, len, enc);
02465 if (pos < 0) return pos;
02466 t = rb_enc_right_char_head(s, s+pos, e, enc);
02467 if (t == s + pos) break;
02468 if ((len -= t - s) <= 0) return -1;
02469 offset += t - s;
02470 s = t;
02471 }
02472 return pos + offset;
02473 }
02474
02475
02476
02477
02478
02479
02480
02481
02482
02483
02484
02485
02486
02487
02488
02489
02490
02491
02492
02493 static VALUE
02494 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02495 {
02496 VALUE sub;
02497 VALUE initpos;
02498 long pos;
02499
02500 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02501 pos = NUM2LONG(initpos);
02502 }
02503 else {
02504 pos = 0;
02505 }
02506 if (pos < 0) {
02507 pos += str_strlen(str, STR_ENC_GET(str));
02508 if (pos < 0) {
02509 if (TYPE(sub) == T_REGEXP) {
02510 rb_backref_set(Qnil);
02511 }
02512 return Qnil;
02513 }
02514 }
02515
02516 switch (TYPE(sub)) {
02517 case T_REGEXP:
02518 if (pos > str_strlen(str, STR_ENC_GET(str)))
02519 return Qnil;
02520 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02521 rb_enc_check(str, sub), single_byte_optimizable(str));
02522
02523 pos = rb_reg_search(sub, str, pos, 0);
02524 pos = rb_str_sublen(str, pos);
02525 break;
02526
02527 default: {
02528 VALUE tmp;
02529
02530 tmp = rb_check_string_type(sub);
02531 if (NIL_P(tmp)) {
02532 rb_raise(rb_eTypeError, "type mismatch: %s given",
02533 rb_obj_classname(sub));
02534 }
02535 sub = tmp;
02536 }
02537
02538 case T_STRING:
02539 pos = rb_str_index(str, sub, pos);
02540 pos = rb_str_sublen(str, pos);
02541 break;
02542 }
02543
02544 if (pos == -1) return Qnil;
02545 return LONG2NUM(pos);
02546 }
02547
02548 static long
02549 rb_str_rindex(VALUE str, VALUE sub, long pos)
02550 {
02551 long len, slen;
02552 char *s, *sbeg, *e, *t;
02553 rb_encoding *enc;
02554 int singlebyte = single_byte_optimizable(str);
02555
02556 enc = rb_enc_check(str, sub);
02557 if (is_broken_string(sub)) {
02558 return -1;
02559 }
02560 len = str_strlen(str, enc);
02561 slen = str_strlen(sub, enc);
02562
02563 if (len < slen) return -1;
02564 if (len - pos < slen) {
02565 pos = len - slen;
02566 }
02567 if (len == 0) {
02568 return pos;
02569 }
02570 sbeg = RSTRING_PTR(str);
02571 e = RSTRING_END(str);
02572 t = RSTRING_PTR(sub);
02573 slen = RSTRING_LEN(sub);
02574 s = str_nth(sbeg, e, pos, enc, singlebyte);
02575 while (s) {
02576 if (memcmp(s, t, slen) == 0) {
02577 return pos;
02578 }
02579 if (pos == 0) break;
02580 pos--;
02581 s = rb_enc_prev_char(sbeg, s, e, enc);
02582 }
02583 return -1;
02584 }
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595
02596
02597
02598
02599
02600
02601
02602
02603
02604
02605 static VALUE
02606 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02607 {
02608 VALUE sub;
02609 VALUE vpos;
02610 rb_encoding *enc = STR_ENC_GET(str);
02611 long pos, len = str_strlen(str, enc);
02612
02613 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02614 pos = NUM2LONG(vpos);
02615 if (pos < 0) {
02616 pos += len;
02617 if (pos < 0) {
02618 if (TYPE(sub) == T_REGEXP) {
02619 rb_backref_set(Qnil);
02620 }
02621 return Qnil;
02622 }
02623 }
02624 if (pos > len) pos = len;
02625 }
02626 else {
02627 pos = len;
02628 }
02629
02630 switch (TYPE(sub)) {
02631 case T_REGEXP:
02632
02633 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02634 STR_ENC_GET(str), single_byte_optimizable(str));
02635
02636 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02637 pos = rb_reg_search(sub, str, pos, 1);
02638 pos = rb_str_sublen(str, pos);
02639 }
02640 if (pos >= 0) return LONG2NUM(pos);
02641 break;
02642
02643 default: {
02644 VALUE tmp;
02645
02646 tmp = rb_check_string_type(sub);
02647 if (NIL_P(tmp)) {
02648 rb_raise(rb_eTypeError, "type mismatch: %s given",
02649 rb_obj_classname(sub));
02650 }
02651 sub = tmp;
02652 }
02653
02654 case T_STRING:
02655 pos = rb_str_rindex(str, sub, pos);
02656 if (pos >= 0) return LONG2NUM(pos);
02657 break;
02658 }
02659 return Qnil;
02660 }
02661
02662
02663
02664
02665
02666
02667
02668
02669
02670
02671
02672
02673
02674
02675
02676 static VALUE
02677 rb_str_match(VALUE x, VALUE y)
02678 {
02679 switch (TYPE(y)) {
02680 case T_STRING:
02681 rb_raise(rb_eTypeError, "type mismatch: String given");
02682
02683 case T_REGEXP:
02684 return rb_reg_match(y, x);
02685
02686 default:
02687 return rb_funcall(y, rb_intern("=~"), 1, x);
02688 }
02689 }
02690
02691
02692 static VALUE get_pat(VALUE, int);
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724 static VALUE
02725 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02726 {
02727 VALUE re, result;
02728 if (argc < 1)
02729 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02730 re = argv[0];
02731 argv[0] = str;
02732 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02733 if (!NIL_P(result) && rb_block_given_p()) {
02734 return rb_yield(result);
02735 }
02736 return result;
02737 }
02738
/* Result codes of the enc_succ_char / enc_pred_char / enc_succ_alnum_char helpers. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* returned by enc_succ_alnum_char for a non-digit, non-alpha char */
    NEIGHBOR_FOUND,     /* a neighbouring character of the same byte length was produced */
    NEIGHBOR_WRAPPED    /* the byte sequence wrapped around; a carry is required */
};
02744
02745 static enum neighbor_char
02746 enc_succ_char(char *p, long len, rb_encoding *enc)
02747 {
02748 long i;
02749 int l;
02750 while (1) {
02751 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02752 p[i] = '\0';
02753 if (i < 0)
02754 return NEIGHBOR_WRAPPED;
02755 ++((unsigned char*)p)[i];
02756 l = rb_enc_precise_mbclen(p, p+len, enc);
02757 if (MBCLEN_CHARFOUND_P(l)) {
02758 l = MBCLEN_CHARFOUND_LEN(l);
02759 if (l == len) {
02760 return NEIGHBOR_FOUND;
02761 }
02762 else {
02763 memset(p+l, 0xff, len-l);
02764 }
02765 }
02766 if (MBCLEN_INVALID_P(l) && i < len-1) {
02767 long len2;
02768 int l2;
02769 for (len2 = len-1; 0 < len2; len2--) {
02770 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02771 if (!MBCLEN_INVALID_P(l2))
02772 break;
02773 }
02774 memset(p+len2+1, 0xff, len-(len2+1));
02775 }
02776 }
02777 }
02778
02779 static enum neighbor_char
02780 enc_pred_char(char *p, long len, rb_encoding *enc)
02781 {
02782 long i;
02783 int l;
02784 while (1) {
02785 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02786 p[i] = '\xff';
02787 if (i < 0)
02788 return NEIGHBOR_WRAPPED;
02789 --((unsigned char*)p)[i];
02790 l = rb_enc_precise_mbclen(p, p+len, enc);
02791 if (MBCLEN_CHARFOUND_P(l)) {
02792 l = MBCLEN_CHARFOUND_LEN(l);
02793 if (l == len) {
02794 return NEIGHBOR_FOUND;
02795 }
02796 else {
02797 memset(p+l, 0, len-l);
02798 }
02799 }
02800 if (MBCLEN_INVALID_P(l) && i < len-1) {
02801 long len2;
02802 int l2;
02803 for (len2 = len-1; 0 < len2; len2--) {
02804 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02805 if (!MBCLEN_INVALID_P(l2))
02806 break;
02807 }
02808 memset(p+len2+1, 0, len-(len2+1));
02809 }
02810 }
02811 }
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822 static enum neighbor_char
02823 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02824 {
02825 enum neighbor_char ret;
02826 unsigned int c;
02827 int ctype;
02828 int range;
02829 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02830
02831 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02832 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02833 ctype = ONIGENC_CTYPE_DIGIT;
02834 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02835 ctype = ONIGENC_CTYPE_ALPHA;
02836 else
02837 return NEIGHBOR_NOT_CHAR;
02838
02839 MEMCPY(save, p, char, len);
02840 ret = enc_succ_char(p, len, enc);
02841 if (ret == NEIGHBOR_FOUND) {
02842 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02843 if (rb_enc_isctype(c, ctype, enc))
02844 return NEIGHBOR_FOUND;
02845 }
02846 MEMCPY(p, save, char, len);
02847 range = 1;
02848 while (1) {
02849 MEMCPY(save, p, char, len);
02850 ret = enc_pred_char(p, len, enc);
02851 if (ret == NEIGHBOR_FOUND) {
02852 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02853 if (!rb_enc_isctype(c, ctype, enc)) {
02854 MEMCPY(p, save, char, len);
02855 break;
02856 }
02857 }
02858 else {
02859 MEMCPY(p, save, char, len);
02860 break;
02861 }
02862 range++;
02863 }
02864 if (range == 1) {
02865 return NEIGHBOR_NOT_CHAR;
02866 }
02867
02868 if (ctype != ONIGENC_CTYPE_DIGIT) {
02869 MEMCPY(carry, p, char, len);
02870 return NEIGHBOR_WRAPPED;
02871 }
02872
02873 MEMCPY(carry, p, char, len);
02874 enc_succ_char(carry, len, enc);
02875 return NEIGHBOR_WRAPPED;
02876 }
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902
02903
/*
 * Returns a new string that is the successor of orig.
 *
 * The copy inherits encoding/coderange information and taint from orig.
 * An empty string is returned unchanged.  Characters are visited from
 * the right; alphanumeric characters are incremented via
 * enc_succ_alnum_char() and, if the string has none, the right-most
 * character is incremented via enc_succ_char().  When the increment
 * wraps all the way, the carry bytes are inserted at carry_pos and the
 * string grows by carry_len bytes.
 */
VALUE
rb_str_succ(VALUE orig)
{
    rb_encoding *enc;
    VALUE str;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;                 /* stays -1 until an alnum character has wrapped */
    long l;
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
    rb_enc_cr_str_copy_for_substr(str, orig);
    OBJ_INFECT(str, orig);
    if (RSTRING_LEN(str) == 0) return str;

    enc = STR_ENC_GET(orig);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + RSTRING_LEN(str);

    /* first pass: increment alphanumerics, right to left */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /* a non-alnum separates the wrapped alnum from this char:
             * stop if they are of different kinds (alpha vs. digit) */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                s = last_alnum;
                break;
            }
        }
        /* skip positions that do not start a complete character */
        if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            continue;
          case NEIGHBOR_FOUND:
            return str;
          case NEIGHBOR_WRAPPED:
            last_alnum = s;
            break;
        }
        /* remember where a carry would have to be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {
        /* str contains no alnum: increment raw characters instead */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
            neighbor = enc_succ_char(s, l, enc);
            if (neighbor == NEIGHBOR_FOUND)
                return str;
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* NOTE(review): presumably the wrapped bytes are not a
                 * valid character here, so advance once more - confirm */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* for non-ASCII-compatible encodings the carry is a copy
                 * of the wrapped character instead of the default "\1" */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
    }
    /* every candidate wrapped: insert the carry at carry_pos */
    RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
    s = RSTRING_PTR(str) + carry_pos;
    memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
    memmove(s, carry, carry_len);
    STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
    rb_enc_str_coderange(str);
    return str;
}
02976
02977
02978
02979
02980
02981
02982
02983
02984
02985
02986
02987 static VALUE
02988 rb_str_succ_bang(VALUE str)
02989 {
02990 rb_str_shared_replace(str, rb_str_succ(str));
02991
02992 return str;
02993 }
02994
02995
02996
02997
02998
02999
03000
03001
03002
03003
03004
03005
03006
03007
03008
03009
03010
03011
03012
03013
03014
03015
03016
03017
03018
03019
03020
03021
03022
03023
03024
03025
03026
03027
03028 static VALUE
03029 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03030 {
03031 VALUE end, exclusive;
03032 VALUE current, after_end;
03033 ID succ;
03034 int n, excl, ascii;
03035 rb_encoding *enc;
03036
03037 rb_scan_args(argc, argv, "11", &end, &exclusive);
03038 RETURN_ENUMERATOR(beg, argc, argv);
03039 excl = RTEST(exclusive);
03040 CONST_ID(succ, "succ");
03041 StringValue(end);
03042 enc = rb_enc_check(beg, end);
03043 ascii = (is_ascii_string(beg) && is_ascii_string(end));
03044
03045 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03046 char c = RSTRING_PTR(beg)[0];
03047 char e = RSTRING_PTR(end)[0];
03048
03049 if (c > e || (excl && c == e)) return beg;
03050 for (;;) {
03051 rb_yield(rb_enc_str_new(&c, 1, enc));
03052 if (!excl && c == e) break;
03053 c++;
03054 if (excl && c == e) break;
03055 }
03056 return beg;
03057 }
03058
03059 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03060 char *s, *send;
03061 VALUE b, e;
03062 int width;
03063
03064 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03065 width = rb_long2int(send - s);
03066 while (s < send) {
03067 if (!ISDIGIT(*s)) goto no_digits;
03068 s++;
03069 }
03070 s = RSTRING_PTR(end); send = RSTRING_END(end);
03071 while (s < send) {
03072 if (!ISDIGIT(*s)) goto no_digits;
03073 s++;
03074 }
03075 b = rb_str_to_inum(beg, 10, FALSE);
03076 e = rb_str_to_inum(end, 10, FALSE);
03077 if (FIXNUM_P(b) && FIXNUM_P(e)) {
03078 long bi = FIX2LONG(b);
03079 long ei = FIX2LONG(e);
03080 rb_encoding *usascii = rb_usascii_encoding();
03081
03082 while (bi <= ei) {
03083 if (excl && bi == ei) break;
03084 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03085 bi++;
03086 }
03087 }
03088 else {
03089 ID op = excl ? '<' : rb_intern("<=");
03090 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03091
03092 args[0] = INT2FIX(width);
03093 while (rb_funcall(b, op, 1, e)) {
03094 args[1] = b;
03095 rb_yield(rb_str_format(numberof(args), args, fmt));
03096 b = rb_funcall(b, succ, 0, 0);
03097 }
03098 }
03099 return beg;
03100 }
03101
03102 no_digits:
03103 n = rb_str_cmp(beg, end);
03104 if (n > 0 || (excl && n == 0)) return beg;
03105
03106 after_end = rb_funcall(end, succ, 0, 0);
03107 current = rb_str_dup(beg);
03108 while (!rb_str_equal(current, after_end)) {
03109 VALUE next = Qnil;
03110 if (excl || !rb_str_equal(current, end))
03111 next = rb_funcall(current, succ, 0, 0);
03112 rb_yield(current);
03113 if (NIL_P(next)) break;
03114 current = next;
03115 StringValue(current);
03116 if (excl && rb_str_equal(current, end)) break;
03117 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03118 break;
03119 }
03120
03121 return beg;
03122 }
03123
03124 static VALUE
03125 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03126 {
03127 if (rb_reg_search(re, str, 0, 0) >= 0) {
03128 VALUE match = rb_backref_get();
03129 int nth = rb_reg_backref_number(match, backref);
03130 return rb_reg_nth_match(nth, match);
03131 }
03132 return Qnil;
03133 }
03134
03135 static VALUE
03136 rb_str_aref(VALUE str, VALUE indx)
03137 {
03138 long idx;
03139
03140 switch (TYPE(indx)) {
03141 case T_FIXNUM:
03142 idx = FIX2LONG(indx);
03143
03144 num_index:
03145 str = rb_str_substr(str, idx, 1);
03146 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03147 return str;
03148
03149 case T_REGEXP:
03150 return rb_str_subpat(str, indx, INT2FIX(0));
03151
03152 case T_STRING:
03153 if (rb_str_index(str, indx, 0) != -1)
03154 return rb_str_dup(indx);
03155 return Qnil;
03156
03157 default:
03158
03159 {
03160 long beg, len;
03161 VALUE tmp;
03162
03163 len = str_strlen(str, STR_ENC_GET(str));
03164 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03165 case Qfalse:
03166 break;
03167 case Qnil:
03168 return Qnil;
03169 default:
03170 tmp = rb_str_substr(str, beg, len);
03171 return tmp;
03172 }
03173 }
03174 idx = NUM2LONG(indx);
03175 goto num_index;
03176 }
03177 return Qnil;
03178 }
03179
03180
03181
03182
03183
03184
03185
03186
03187
03188
03189
03190
03191
03192
03193
03194
03195
03196
03197
03198
03199
03200
03201
03202
03203
03204
03205
03206
03207
03208
03209
03210
03211
03212
03213
03214
03215
03216
03217
03218
03219
03220
03221
03222
03223
03224
03225
03226
03227
03228
03229
03230 static VALUE
03231 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03232 {
03233 if (argc == 2) {
03234 if (TYPE(argv[0]) == T_REGEXP) {
03235 return rb_str_subpat(str, argv[0], argv[1]);
03236 }
03237 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03238 }
03239 if (argc != 1) {
03240 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03241 }
03242 return rb_str_aref(str, argv[0]);
03243 }
03244
03245 VALUE
03246 rb_str_drop_bytes(VALUE str, long len)
03247 {
03248 char *ptr = RSTRING_PTR(str);
03249 long olen = RSTRING_LEN(str), nlen;
03250
03251 str_modifiable(str);
03252 if (len > olen) len = olen;
03253 nlen = olen - len;
03254 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03255 char *oldptr = ptr;
03256 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03257 STR_SET_EMBED(str);
03258 STR_SET_EMBED_LEN(str, nlen);
03259 ptr = RSTRING(str)->as.ary;
03260 memmove(ptr, oldptr + len, nlen);
03261 if (fl == STR_NOEMBED) xfree(oldptr);
03262 }
03263 else {
03264 if (!STR_SHARED_P(str)) rb_str_new4(str);
03265 ptr = RSTRING(str)->as.heap.ptr += len;
03266 RSTRING(str)->as.heap.len = nlen;
03267 }
03268 ptr[nlen] = 0;
03269 ENC_CODERANGE_CLEAR(str);
03270 return str;
03271 }
03272
03273 static void
03274 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03275 {
03276 if (beg == 0 && RSTRING_LEN(val) == 0) {
03277 rb_str_drop_bytes(str, len);
03278 OBJ_INFECT(str, val);
03279 return;
03280 }
03281
03282 rb_str_modify(str);
03283 if (len < RSTRING_LEN(val)) {
03284
03285 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03286 }
03287
03288 if (RSTRING_LEN(val) != len) {
03289 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03290 RSTRING_PTR(str) + beg + len,
03291 RSTRING_LEN(str) - (beg + len));
03292 }
03293 if (RSTRING_LEN(val) < beg && len < 0) {
03294 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03295 }
03296 if (RSTRING_LEN(val) > 0) {
03297 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03298 }
03299 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03300 if (RSTRING_PTR(str)) {
03301 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03302 }
03303 OBJ_INFECT(str, val);
03304 }
03305
03306 static void
03307 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03308 {
03309 long slen;
03310 char *p, *e;
03311 rb_encoding *enc;
03312 int singlebyte = single_byte_optimizable(str);
03313 int cr;
03314
03315 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03316
03317 StringValue(val);
03318 enc = rb_enc_check(str, val);
03319 slen = str_strlen(str, enc);
03320
03321 if (slen < beg) {
03322 out_of_range:
03323 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03324 }
03325 if (beg < 0) {
03326 if (-beg > slen) {
03327 goto out_of_range;
03328 }
03329 beg += slen;
03330 }
03331 if (slen < len || slen < beg + len) {
03332 len = slen - beg;
03333 }
03334 str_modify_keep_cr(str);
03335 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03336 if (!p) p = RSTRING_END(str);
03337 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03338 if (!e) e = RSTRING_END(str);
03339
03340 beg = p - RSTRING_PTR(str);
03341 len = e - p;
03342 rb_str_splice_0(str, beg, len, val);
03343 rb_enc_associate(str, enc);
03344 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03345 if (cr != ENC_CODERANGE_BROKEN)
03346 ENC_CODERANGE_SET(str, cr);
03347 }
03348
/*
 * Public entry point for character-level replacement: forwards to
 * rb_str_splice(), replacing len characters of str starting at
 * character index beg with val.
 */
void
rb_str_update(VALUE str, long beg, long len, VALUE val)
{
    rb_str_splice(str, beg, len, val);
}
03354
03355 static void
03356 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03357 {
03358 int nth;
03359 VALUE match;
03360 long start, end, len;
03361 rb_encoding *enc;
03362 struct re_registers *regs;
03363
03364 if (rb_reg_search(re, str, 0, 0) < 0) {
03365 rb_raise(rb_eIndexError, "regexp not matched");
03366 }
03367 match = rb_backref_get();
03368 nth = rb_reg_backref_number(match, backref);
03369 regs = RMATCH_REGS(match);
03370 if (nth >= regs->num_regs) {
03371 out_of_range:
03372 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03373 }
03374 if (nth < 0) {
03375 if (-nth >= regs->num_regs) {
03376 goto out_of_range;
03377 }
03378 nth += regs->num_regs;
03379 }
03380
03381 start = BEG(nth);
03382 if (start == -1) {
03383 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03384 }
03385 end = END(nth);
03386 len = end - start;
03387 StringValue(val);
03388 enc = rb_enc_check(str, val);
03389 rb_str_splice_0(str, start, len, val);
03390 rb_enc_associate(str, enc);
03391 }
03392
03393 static VALUE
03394 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03395 {
03396 long idx, beg;
03397
03398 switch (TYPE(indx)) {
03399 case T_FIXNUM:
03400 idx = FIX2LONG(indx);
03401 num_index:
03402 rb_str_splice(str, idx, 1, val);
03403 return val;
03404
03405 case T_REGEXP:
03406 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03407 return val;
03408
03409 case T_STRING:
03410 beg = rb_str_index(str, indx, 0);
03411 if (beg < 0) {
03412 rb_raise(rb_eIndexError, "string not matched");
03413 }
03414 beg = rb_str_sublen(str, beg);
03415 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03416 return val;
03417
03418 default:
03419
03420 {
03421 long beg, len;
03422 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03423 rb_str_splice(str, beg, len, val);
03424 return val;
03425 }
03426 }
03427 idx = NUM2LONG(indx);
03428 goto num_index;
03429 }
03430 }
03431
03432
03433
03434
03435
03436
03437
03438
03439
03440
03441
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457 static VALUE
03458 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03459 {
03460 if (argc == 3) {
03461 if (TYPE(argv[0]) == T_REGEXP) {
03462 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03463 }
03464 else {
03465 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03466 }
03467 return argv[2];
03468 }
03469 if (argc != 2) {
03470 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03471 }
03472 return rb_str_aset(str, argv[0], argv[1]);
03473 }
03474
03475
03476
03477
03478
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491
03492 static VALUE
03493 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03494 {
03495 long pos = NUM2LONG(idx);
03496
03497 if (pos == -1) {
03498 return rb_str_append(str, str2);
03499 }
03500 else if (pos < 0) {
03501 pos++;
03502 }
03503 rb_str_splice(str, pos, 0, str2);
03504 return str;
03505 }
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521
03522
03523
03524
03525
03526
03527 static VALUE
03528 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03529 {
03530 VALUE result;
03531 VALUE buf[3];
03532 int i;
03533
03534 if (argc < 1 || 2 < argc) {
03535 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03536 }
03537 for (i=0; i<argc; i++) {
03538 buf[i] = argv[i];
03539 }
03540 str_modify_keep_cr(str);
03541 result = rb_str_aref_m(argc, buf, str);
03542 if (!NIL_P(result)) {
03543 buf[i] = rb_str_new(0,0);
03544 rb_str_aset_m(argc+1, buf, str);
03545 }
03546 return result;
03547 }
03548
03549 static VALUE
03550 get_pat(VALUE pat, int quote)
03551 {
03552 VALUE val;
03553
03554 switch (TYPE(pat)) {
03555 case T_REGEXP:
03556 return pat;
03557
03558 case T_STRING:
03559 break;
03560
03561 default:
03562 val = rb_check_string_type(pat);
03563 if (NIL_P(val)) {
03564 Check_Type(pat, T_REGEXP);
03565 }
03566 pat = val;
03567 }
03568
03569 if (quote) {
03570 pat = rb_reg_quote(pat);
03571 }
03572
03573 return rb_reg_regcomp(pat);
03574 }
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
/*
 * In-place single substitution.  Accepts (pattern) with a block, or
 * (pattern, replacement) where replacement is a String or a Hash.
 *
 * Replaces the first match of pattern in str and returns str, or
 * returns nil when the pattern does not match.  Raises ArgumentError
 * for other argument combinations and EncodingError-compatible
 * rb_eEncCompatError when the replacement cannot be combined with the
 * non-ASCII parts of str.
 */
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, hash = Qnil;
    int iter = 0;               /* non-zero when the replacement comes from the block */
    int tainted = 0;
    int untrusted = 0;
    long plen;

    if (argc == 1 && rb_block_given_p()) {
        iter = 1;
    }
    else if (argc == 2) {
        repl = argv[1];
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
    }
    else {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }

    pat = get_pat(argv[0], 1);
    str_modifiable(str);
    if (rb_reg_search(pat, str, 0, 0) >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        long beg0 = BEG(0);     /* byte range of the whole match */
        long end0 = END(0);
        char *p, *rp;
        long len, rlen;

        if (iter || !NIL_P(hash)) {
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            /* user code ran above: verify str was not changed or frozen meanwhile */
            str_mod_check(str, p, len);
            rb_check_frozen(str);
        }
        else {
            repl = rb_reg_regsub(repl, str, regs, pat);
        }
        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /* incompatible encodings are tolerated only if everything
             * outside the match is 7-bit; the result then takes the
             * encoding of the replacement */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_name(str_enc),
                         rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        if (OBJ_TAINTED(repl)) tainted = 1;
        if (OBJ_UNTRUSTED(repl)) untrusted = 1;
        /* combine the code range known before the change with that of repl */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        plen = end0 - beg0;
        rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            RESIZE_CAPA(str, len + rlen - plen);
        }
        p = RSTRING_PTR(str);
        /* move the tail, then copy the replacement into the gap */
        if (rlen != plen) {
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        memcpy(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        RSTRING_PTR(str)[len] = '\0';
        ENC_CODERANGE_SET(str, cr);
        if (tainted) OBJ_TAINT(str);
        if (untrusted) OBJ_UNTRUST(str);

        return str;
    }
    return Qnil;
}
03686
03687
03688
03689
03690
03691
03692
03693
03694
03695
03696
03697
03698
03699
03700
03701
03702
03703
03704
03705
03706
03707
03708
03709
03710
03711
03712
03713
03714
03715
03716
03717
03718
03719
03720
03721
03722
03723
03724
03725
03726
03727
03728 static VALUE
03729 rb_str_sub(int argc, VALUE *argv, VALUE str)
03730 {
03731 str = rb_str_dup(str);
03732 rb_str_sub_bang(argc, argv, str);
03733 return str;
03734 }
03735
/*
 * Shared implementation of global substitution.
 *
 * Accepts (pattern) with a block - or returns an enumerator without
 * one - or (pattern, replacement) where replacement is a String or a
 * Hash.  Every match of pattern in str is replaced and the result is
 * accumulated in a new buffer.
 *
 * bang != 0: str is replaced by the result; nil is returned when there
 *            was no match.
 * bang == 0: a new string of the same class is returned (a plain
 *            duplicate of str when there was no match).
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val, repl, match, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, n;
    long beg0, end0;
    long offset, blen, slen, len, last;
    int iter = 0;
    char *sp, *cp;
    int tainted = 0;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        iter = 1;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        if (OBJ_TAINTED(repl)) tainted = 1;
        break;
      default:
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }

    pat = get_pat(argv[0], 1);
    beg = rb_reg_search(pat, str, 0, 0);
    if (beg < 0) {
        if (bang) return Qnil;  /* no match, no substitution */
        return rb_str_dup(str);
    }

    offset = 0;
    n = 0;
    blen = RSTRING_LEN(str) + 30; /* initial capacity guess for the result buffer */
    dest = rb_str_buf_new(blen);
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        n++;
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        beg0 = BEG(0);
        end0 = END(0);
        if (iter || !NIL_P(hash)) {
            if (iter) {
                val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
                val = rb_obj_as_string(val);
            }
            /* user code ran above: verify str was not modified meanwhile */
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else {
            val = rb_reg_regsub(repl, str, regs, pat);
        }

        if (OBJ_TAINTED(val)) tainted = 1;

        /* copy the unmatched text preceding this match */
        len = beg - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_reg_search(pat, str, offset, 0);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* NOTE(review): presumably re-run to leave the backref pointing at
     * the last successful match - confirm */
    rb_reg_search(pat, str, last, 0);
    if (bang) {
        rb_str_shared_replace(str, dest);
    }
    else {
        RBASIC(dest)->klass = rb_obj_class(str);
        OBJ_INFECT(dest, str);
        str = dest;
    }

    if (tainted) OBJ_TAINT(str);
    return str;
}
03848
03849
03850
03851
03852
03853
03854
03855
03856
03857
03858
03859
03860
03861 static VALUE
03862 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03863 {
03864 str_modify_keep_cr(str);
03865 return str_gsub(argc, argv, str, 1);
03866 }
03867
03868
03869
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904
03905
03906
03907
03908
03909
03910
03911
03912 static VALUE
03913 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03914 {
03915 return str_gsub(argc, argv, str, 0);
03916 }
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930 VALUE
03931 rb_str_replace(VALUE str, VALUE str2)
03932 {
03933 str_modifiable(str);
03934 if (str == str2) return str;
03935
03936 StringValue(str2);
03937 str_discard(str);
03938 return str_replace(str, str2);
03939 }
03940
03941
03942
03943
03944
03945
03946
03947
03948
03949
03950
03951 static VALUE
03952 rb_str_clear(VALUE str)
03953 {
03954 str_discard(str);
03955 STR_SET_EMBED(str);
03956 STR_SET_EMBED_LEN(str, 0);
03957 RSTRING_PTR(str)[0] = 0;
03958 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03959 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03960 else
03961 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03962 return str;
03963 }
03964
03965
03966
03967
03968
03969
03970
03971
03972
03973
03974
03975 static VALUE
03976 rb_str_chr(VALUE str)
03977 {
03978 return rb_str_substr(str, 0, 1);
03979 }
03980
03981
03982
03983
03984
03985
03986
03987 static VALUE
03988 rb_str_getbyte(VALUE str, VALUE index)
03989 {
03990 long pos = NUM2LONG(index);
03991
03992 if (pos < 0)
03993 pos += RSTRING_LEN(str);
03994 if (pos < 0 || RSTRING_LEN(str) <= pos)
03995 return Qnil;
03996
03997 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03998 }
03999
04000
04001
04002
04003
04004
04005
04006 static VALUE
04007 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04008 {
04009 long pos = NUM2LONG(index);
04010 int byte = NUM2INT(value);
04011
04012 rb_str_modify(str);
04013
04014 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04015 rb_raise(rb_eIndexError, "index %ld out of string", pos);
04016 if (pos < 0)
04017 pos += RSTRING_LEN(str);
04018
04019 RSTRING_PTR(str)[pos] = byte;
04020
04021 return value;
04022 }
04023
04024 static VALUE
04025 str_byte_substr(VALUE str, long beg, long len)
04026 {
04027 char *p, *s = RSTRING_PTR(str);
04028 long n = RSTRING_LEN(str);
04029 VALUE str2;
04030
04031 if (beg > n || len < 0) return Qnil;
04032 if (beg < 0) {
04033 beg += n;
04034 if (beg < 0) return Qnil;
04035 }
04036 if (beg + len > n)
04037 len = n - beg;
04038 if (len <= 0) {
04039 len = 0;
04040 p = 0;
04041 }
04042 else
04043 p = s + beg;
04044
04045 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04046 str2 = rb_str_new4(str);
04047 str2 = str_new3(rb_obj_class(str2), str2);
04048 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04049 RSTRING(str2)->as.heap.len = len;
04050 }
04051 else {
04052 str2 = rb_str_new5(str, p, len);
04053 }
04054
04055 str_enc_copy(str2, str);
04056
04057 if (RSTRING_LEN(str2) == 0) {
04058 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04059 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04060 else
04061 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04062 }
04063 else {
04064 switch (ENC_CODERANGE(str)) {
04065 case ENC_CODERANGE_7BIT:
04066 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04067 break;
04068 default:
04069 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04070 break;
04071 }
04072 }
04073
04074 OBJ_INFECT(str2, str);
04075
04076 return str2;
04077 }
04078
04079 static VALUE
04080 str_byte_aref(VALUE str, VALUE indx)
04081 {
04082 long idx;
04083 switch (TYPE(indx)) {
04084 case T_FIXNUM:
04085 idx = FIX2LONG(indx);
04086
04087 num_index:
04088 str = str_byte_substr(str, idx, 1);
04089 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04090 return str;
04091
04092 default:
04093
04094 {
04095 long beg, len = RSTRING_LEN(str);
04096
04097 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04098 case Qfalse:
04099 break;
04100 case Qnil:
04101 return Qnil;
04102 default:
04103 return str_byte_substr(str, beg, len);
04104 }
04105 }
04106 idx = NUM2LONG(indx);
04107 goto num_index;
04108 }
04109 return Qnil;
04110 }
04111
04112
04113
04114
04115
04116
04117
04118
04119
04120
04121
04122
04123
04124
04125
04126
04127
04128
04129
04130
04131
04132
04133
04134
04135 static VALUE
04136 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04137 {
04138 if (argc == 2) {
04139 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04140 }
04141 if (argc != 1) {
04142 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04143 }
04144 return str_byte_aref(str, argv[0]);
04145 }
04146
04147
04148
04149
04150
04151
04152
04153
04154
04155
04156 static VALUE
04157 rb_str_reverse(VALUE str)
04158 {
04159 rb_encoding *enc;
04160 VALUE rev;
04161 char *s, *e, *p;
04162 int single = 1;
04163
04164 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04165 enc = STR_ENC_GET(str);
04166 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04167 s = RSTRING_PTR(str); e = RSTRING_END(str);
04168 p = RSTRING_END(rev);
04169
04170 if (RSTRING_LEN(str) > 1) {
04171 if (single_byte_optimizable(str)) {
04172 while (s < e) {
04173 *--p = *s++;
04174 }
04175 }
04176 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04177 while (s < e) {
04178 int clen = rb_enc_fast_mbclen(s, e, enc);
04179
04180 if (clen > 1 || (*s & 0x80)) single = 0;
04181 p -= clen;
04182 memcpy(p, s, clen);
04183 s += clen;
04184 }
04185 }
04186 else {
04187 while (s < e) {
04188 int clen = rb_enc_mbclen(s, e, enc);
04189
04190 if (clen > 1 || (*s & 0x80)) single = 0;
04191 p -= clen;
04192 memcpy(p, s, clen);
04193 s += clen;
04194 }
04195 }
04196 }
04197 STR_SET_LEN(rev, RSTRING_LEN(str));
04198 OBJ_INFECT(rev, str);
04199 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04200 if (single) {
04201 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04202 }
04203 else {
04204 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04205 }
04206 }
04207 rb_enc_cr_str_copy_for_substr(rev, str);
04208
04209 return rev;
04210 }
04211
04212
04213
04214
04215
04216
04217
04218
04219
04220 static VALUE
04221 rb_str_reverse_bang(VALUE str)
04222 {
04223 if (RSTRING_LEN(str) > 1) {
04224 if (single_byte_optimizable(str)) {
04225 char *s, *e, c;
04226
04227 str_modify_keep_cr(str);
04228 s = RSTRING_PTR(str);
04229 e = RSTRING_END(str) - 1;
04230 while (s < e) {
04231 c = *s;
04232 *s++ = *e;
04233 *e-- = c;
04234 }
04235 }
04236 else {
04237 rb_str_shared_replace(str, rb_str_reverse(str));
04238 }
04239 }
04240 else {
04241 str_modify_keep_cr(str);
04242 }
04243 return str;
04244 }
04245
04246
04247
04248
04249
04250
04251
04252
04253
04254
04255
04256
04257
04258
04259 static VALUE
04260 rb_str_include(VALUE str, VALUE arg)
04261 {
04262 long i;
04263
04264 StringValue(arg);
04265 i = rb_str_index(str, arg, 0);
04266
04267 if (i == -1) return Qfalse;
04268 return Qtrue;
04269 }
04270
04271
04272
04273
04274
04275
04276
04277
04278
04279
04280
04281
04282
04283
04284
04285
04286
04287
04288
04289
04290
04291
04292
04293 static VALUE
04294 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04295 {
04296 int base;
04297
04298 if (argc == 0) base = 10;
04299 else {
04300 VALUE b;
04301
04302 rb_scan_args(argc, argv, "01", &b);
04303 base = NUM2INT(b);
04304 }
04305 if (base < 0) {
04306 rb_raise(rb_eArgError, "invalid radix %d", base);
04307 }
04308 return rb_str_to_inum(str, base, FALSE);
04309 }
04310
04311
04312
04313
04314
04315
04316
04317
04318
04319
04320
04321
04322
04323
04324
04325
04326 static VALUE
04327 rb_str_to_f(VALUE str)
04328 {
04329 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04330 }
04331
04332
04333
04334
04335
04336
04337
04338
04339
04340
04341 static VALUE
04342 rb_str_to_s(VALUE str)
04343 {
04344 if (rb_obj_class(str) != rb_cString) {
04345 return str_duplicate(rb_cString, str);
04346 }
04347 return str;
04348 }
04349
#if 0
/*
 * Currently compiled out.  Encodes code point +c+ in +enc+ into a local
 * buffer and appends the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char bytes[RUBY_MAX_CHAR_LEN];
    int len = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, bytes, enc);
    rb_enc_str_buf_cat(str, bytes, len, enc);
}
#endif
04361
04362 #define CHAR_ESC_LEN 13
04363
04364 int
04365 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04366 {
04367 char buf[CHAR_ESC_LEN + 1];
04368 int l;
04369
04370 #if SIZEOF_INT > 4
04371 c &= 0xffffffff;
04372 #endif
04373 if (unicode_p) {
04374 if (c < 0x7F && ISPRINT(c)) {
04375 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04376 }
04377 else if (c < 0x10000) {
04378 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04379 }
04380 else {
04381 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04382 }
04383 }
04384 else {
04385 if (c < 0x100) {
04386 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04387 }
04388 else {
04389 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04390 }
04391 }
04392 l = (int)strlen(buf);
04393 rb_str_buf_cat(result, buf, l);
04394 return l;
04395 }
04396
04397
04398
04399
04400
04401
04402
04403
04404
04405
04406
04407
04408
04409 VALUE
04410 rb_str_inspect(VALUE str)
04411 {
04412 rb_encoding *enc = STR_ENC_GET(str);
04413 const char *p, *pend, *prev;
04414 char buf[CHAR_ESC_LEN + 1];
04415 VALUE result = rb_str_buf_new(0);
04416 rb_encoding *resenc = rb_default_internal_encoding();
04417 int unicode_p = rb_enc_unicode_p(enc);
04418 int asciicompat = rb_enc_asciicompat(enc);
04419 static rb_encoding *utf16, *utf32;
04420
04421 if (!utf16) utf16 = rb_enc_find("UTF-16");
04422 if (!utf32) utf32 = rb_enc_find("UTF-32");
04423 if (resenc == NULL) resenc = rb_default_external_encoding();
04424 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04425 rb_enc_associate(result, resenc);
04426 str_buf_cat2(result, "\"");
04427
04428 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04429 prev = p;
04430 if (enc == utf16) {
04431 const unsigned char *q = (const unsigned char *)p;
04432 if (q[0] == 0xFE && q[1] == 0xFF)
04433 enc = rb_enc_find("UTF-16BE");
04434 else if (q[0] == 0xFF && q[1] == 0xFE)
04435 enc = rb_enc_find("UTF-16LE");
04436 else
04437 unicode_p = 0;
04438 }
04439 else if (enc == utf32) {
04440 const unsigned char *q = (const unsigned char *)p;
04441 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04442 enc = rb_enc_find("UTF-32BE");
04443 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04444 enc = rb_enc_find("UTF-32LE");
04445 else
04446 unicode_p = 0;
04447 }
04448 while (p < pend) {
04449 unsigned int c, cc;
04450 int n;
04451
04452 n = rb_enc_precise_mbclen(p, pend, enc);
04453 if (!MBCLEN_CHARFOUND_P(n)) {
04454 if (p > prev) str_buf_cat(result, prev, p - prev);
04455 n = rb_enc_mbminlen(enc);
04456 if (pend < p + n)
04457 n = (int)(pend - p);
04458 while (n--) {
04459 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04460 str_buf_cat(result, buf, strlen(buf));
04461 prev = ++p;
04462 }
04463 continue;
04464 }
04465 n = MBCLEN_CHARFOUND_LEN(n);
04466 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04467 p += n;
04468 if ((asciicompat || unicode_p) &&
04469 (c == '"'|| c == '\\' ||
04470 (c == '#' &&
04471 p < pend &&
04472 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04473 (cc = rb_enc_codepoint(p,pend,enc),
04474 (cc == '$' || cc == '@' || cc == '{'))))) {
04475 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04476 str_buf_cat2(result, "\\");
04477 if (asciicompat || enc == resenc) {
04478 prev = p - n;
04479 continue;
04480 }
04481 }
04482 switch (c) {
04483 case '\n': cc = 'n'; break;
04484 case '\r': cc = 'r'; break;
04485 case '\t': cc = 't'; break;
04486 case '\f': cc = 'f'; break;
04487 case '\013': cc = 'v'; break;
04488 case '\010': cc = 'b'; break;
04489 case '\007': cc = 'a'; break;
04490 case 033: cc = 'e'; break;
04491 default: cc = 0; break;
04492 }
04493 if (cc) {
04494 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04495 buf[0] = '\\';
04496 buf[1] = (char)cc;
04497 str_buf_cat(result, buf, 2);
04498 prev = p;
04499 continue;
04500 }
04501 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04502 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04503 continue;
04504 }
04505 else {
04506 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04507 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04508 prev = p;
04509 continue;
04510 }
04511 }
04512 if (p > prev) str_buf_cat(result, prev, p - prev);
04513 str_buf_cat2(result, "\"");
04514
04515 OBJ_INFECT(result, str);
04516 return result;
04517 }
04518
04519 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04520
04521
04522
04523
04524
04525
04526
04527
04528
04529 VALUE
04530 rb_str_dump(VALUE str)
04531 {
04532 rb_encoding *enc = rb_enc_get(str);
04533 long len;
04534 const char *p, *pend;
04535 char *q, *qend;
04536 VALUE result;
04537 int u8 = (enc == rb_utf8_encoding());
04538
04539 len = 2;
04540 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04541 while (p < pend) {
04542 unsigned char c = *p++;
04543 switch (c) {
04544 case '"': case '\\':
04545 case '\n': case '\r':
04546 case '\t': case '\f':
04547 case '\013': case '\010': case '\007': case '\033':
04548 len += 2;
04549 break;
04550
04551 case '#':
04552 len += IS_EVSTR(p, pend) ? 2 : 1;
04553 break;
04554
04555 default:
04556 if (ISPRINT(c)) {
04557 len++;
04558 }
04559 else {
04560 if (u8) {
04561 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04562 if (MBCLEN_CHARFOUND_P(n-1)) {
04563 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04564 while (cc >>= 4) len++;
04565 len += 5;
04566 p += MBCLEN_CHARFOUND_LEN(n)-1;
04567 break;
04568 }
04569 }
04570 len += 4;
04571 }
04572 break;
04573 }
04574 }
04575 if (!rb_enc_asciicompat(enc)) {
04576 len += 19;
04577 len += strlen(enc->name);
04578 }
04579
04580 result = rb_str_new5(str, 0, len);
04581 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04582 q = RSTRING_PTR(result); qend = q + len + 1;
04583
04584 *q++ = '"';
04585 while (p < pend) {
04586 unsigned char c = *p++;
04587
04588 if (c == '"' || c == '\\') {
04589 *q++ = '\\';
04590 *q++ = c;
04591 }
04592 else if (c == '#') {
04593 if (IS_EVSTR(p, pend)) *q++ = '\\';
04594 *q++ = '#';
04595 }
04596 else if (c == '\n') {
04597 *q++ = '\\';
04598 *q++ = 'n';
04599 }
04600 else if (c == '\r') {
04601 *q++ = '\\';
04602 *q++ = 'r';
04603 }
04604 else if (c == '\t') {
04605 *q++ = '\\';
04606 *q++ = 't';
04607 }
04608 else if (c == '\f') {
04609 *q++ = '\\';
04610 *q++ = 'f';
04611 }
04612 else if (c == '\013') {
04613 *q++ = '\\';
04614 *q++ = 'v';
04615 }
04616 else if (c == '\010') {
04617 *q++ = '\\';
04618 *q++ = 'b';
04619 }
04620 else if (c == '\007') {
04621 *q++ = '\\';
04622 *q++ = 'a';
04623 }
04624 else if (c == '\033') {
04625 *q++ = '\\';
04626 *q++ = 'e';
04627 }
04628 else if (ISPRINT(c)) {
04629 *q++ = c;
04630 }
04631 else {
04632 *q++ = '\\';
04633 if (u8) {
04634 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04635 if (MBCLEN_CHARFOUND_P(n)) {
04636 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04637 p += n;
04638 snprintf(q, qend-q, "u{%x}", cc);
04639 q += strlen(q);
04640 continue;
04641 }
04642 }
04643 snprintf(q, qend-q, "x%02X", c);
04644 q += 3;
04645 }
04646 }
04647 *q++ = '"';
04648 *q = '\0';
04649 if (!rb_enc_asciicompat(enc)) {
04650 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04651 enc = rb_ascii8bit_encoding();
04652 }
04653 OBJ_INFECT(result, str);
04654
04655 rb_enc_associate(result, enc);
04656 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04657 return result;
04658 }
04659
04660
04661 static void
04662 rb_str_check_dummy_enc(rb_encoding *enc)
04663 {
04664 if (rb_enc_dummy_p(enc)) {
04665 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04666 rb_enc_name(enc));
04667 }
04668 }
04669
04670
04671
04672
04673
04674
04675
04676
04677
04678
04679 static VALUE
04680 rb_str_upcase_bang(VALUE str)
04681 {
04682 rb_encoding *enc;
04683 char *s, *send;
04684 int modify = 0;
04685 int n;
04686
04687 str_modify_keep_cr(str);
04688 enc = STR_ENC_GET(str);
04689 rb_str_check_dummy_enc(enc);
04690 s = RSTRING_PTR(str); send = RSTRING_END(str);
04691 if (single_byte_optimizable(str)) {
04692 while (s < send) {
04693 unsigned int c = *(unsigned char*)s;
04694
04695 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04696 *s = 'A' + (c - 'a');
04697 modify = 1;
04698 }
04699 s++;
04700 }
04701 }
04702 else {
04703 int ascompat = rb_enc_asciicompat(enc);
04704
04705 while (s < send) {
04706 unsigned int c;
04707
04708 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04709 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04710 *s = 'A' + (c - 'a');
04711 modify = 1;
04712 }
04713 s++;
04714 }
04715 else {
04716 c = rb_enc_codepoint_len(s, send, &n, enc);
04717 if (rb_enc_islower(c, enc)) {
04718
04719 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04720 modify = 1;
04721 }
04722 s += n;
04723 }
04724 }
04725 }
04726
04727 if (modify) return str;
04728 return Qnil;
04729 }
04730
04731
04732
04733
04734
04735
04736
04737
04738
04739
04740
04741
04742
04743
04744 static VALUE
04745 rb_str_upcase(VALUE str)
04746 {
04747 str = rb_str_dup(str);
04748 rb_str_upcase_bang(str);
04749 return str;
04750 }
04751
04752
04753
04754
04755
04756
04757
04758
04759
04760
04761
04762 static VALUE
04763 rb_str_downcase_bang(VALUE str)
04764 {
04765 rb_encoding *enc;
04766 char *s, *send;
04767 int modify = 0;
04768
04769 str_modify_keep_cr(str);
04770 enc = STR_ENC_GET(str);
04771 rb_str_check_dummy_enc(enc);
04772 s = RSTRING_PTR(str); send = RSTRING_END(str);
04773 if (single_byte_optimizable(str)) {
04774 while (s < send) {
04775 unsigned int c = *(unsigned char*)s;
04776
04777 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04778 *s = 'a' + (c - 'A');
04779 modify = 1;
04780 }
04781 s++;
04782 }
04783 }
04784 else {
04785 int ascompat = rb_enc_asciicompat(enc);
04786
04787 while (s < send) {
04788 unsigned int c;
04789 int n;
04790
04791 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04792 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04793 *s = 'a' + (c - 'A');
04794 modify = 1;
04795 }
04796 s++;
04797 }
04798 else {
04799 c = rb_enc_codepoint_len(s, send, &n, enc);
04800 if (rb_enc_isupper(c, enc)) {
04801
04802 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04803 modify = 1;
04804 }
04805 s += n;
04806 }
04807 }
04808 }
04809
04810 if (modify) return str;
04811 return Qnil;
04812 }
04813
04814
04815
04816
04817
04818
04819
04820
04821
04822
04823
04824
04825
04826
04827 static VALUE
04828 rb_str_downcase(VALUE str)
04829 {
04830 str = rb_str_dup(str);
04831 rb_str_downcase_bang(str);
04832 return str;
04833 }
04834
04835
04836
04837
04838
04839
04840
04841
04842
04843
04844
04845
04846
04847
04848
04849
04850 static VALUE
04851 rb_str_capitalize_bang(VALUE str)
04852 {
04853 rb_encoding *enc;
04854 char *s, *send;
04855 int modify = 0;
04856 unsigned int c;
04857 int n;
04858
04859 str_modify_keep_cr(str);
04860 enc = STR_ENC_GET(str);
04861 rb_str_check_dummy_enc(enc);
04862 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04863 s = RSTRING_PTR(str); send = RSTRING_END(str);
04864
04865 c = rb_enc_codepoint_len(s, send, &n, enc);
04866 if (rb_enc_islower(c, enc)) {
04867 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04868 modify = 1;
04869 }
04870 s += n;
04871 while (s < send) {
04872 c = rb_enc_codepoint_len(s, send, &n, enc);
04873 if (rb_enc_isupper(c, enc)) {
04874 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04875 modify = 1;
04876 }
04877 s += n;
04878 }
04879
04880 if (modify) return str;
04881 return Qnil;
04882 }
04883
04884
04885
04886
04887
04888
04889
04890
04891
04892
04893
04894
04895
04896
04897
04898 static VALUE
04899 rb_str_capitalize(VALUE str)
04900 {
04901 str = rb_str_dup(str);
04902 rb_str_capitalize_bang(str);
04903 return str;
04904 }
04905
04906
04907
04908
04909
04910
04911
04912
04913
04914
04915
04916 static VALUE
04917 rb_str_swapcase_bang(VALUE str)
04918 {
04919 rb_encoding *enc;
04920 char *s, *send;
04921 int modify = 0;
04922 int n;
04923
04924 str_modify_keep_cr(str);
04925 enc = STR_ENC_GET(str);
04926 rb_str_check_dummy_enc(enc);
04927 s = RSTRING_PTR(str); send = RSTRING_END(str);
04928 while (s < send) {
04929 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04930
04931 if (rb_enc_isupper(c, enc)) {
04932
04933 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04934 modify = 1;
04935 }
04936 else if (rb_enc_islower(c, enc)) {
04937
04938 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04939 modify = 1;
04940 }
04941 s += n;
04942 }
04943
04944 if (modify) return str;
04945 return Qnil;
04946 }
04947
04948
04949
04950
04951
04952
04953
04954
04955
04956
04957
04958
04959
04960
04961 static VALUE
04962 rb_str_swapcase(VALUE str)
04963 {
04964 str = rb_str_dup(str);
04965 rb_str_swapcase_bang(str);
04966 return str;
04967 }
04968
04969 typedef unsigned char *USTR;
04970
04971 struct tr {
04972 int gen;
04973 unsigned int now, max;
04974 char *p, *pend;
04975 };
04976
04977 static unsigned int
04978 trnext(struct tr *t, rb_encoding *enc)
04979 {
04980 int n;
04981
04982 for (;;) {
04983 if (!t->gen) {
04984 if (t->p == t->pend) return -1;
04985 if (t->p < t->pend - 1 && *t->p == '\\') {
04986 t->p++;
04987 }
04988 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04989 t->p += n;
04990 if (t->p < t->pend - 1 && *t->p == '-') {
04991 t->p++;
04992 if (t->p < t->pend) {
04993 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04994 t->p += n;
04995 if (t->now > c) {
04996 if (t->now < 0x80 && c < 0x80) {
04997 rb_raise(rb_eArgError,
04998 "invalid range \"%c-%c\" in string transliteration",
04999 t->now, c);
05000 }
05001 else {
05002 rb_raise(rb_eArgError, "invalid range in string transliteration");
05003 }
05004 continue;
05005 }
05006 t->gen = 1;
05007 t->max = c;
05008 }
05009 }
05010 return t->now;
05011 }
05012 else if (++t->now < t->max) {
05013 return t->now;
05014 }
05015 else {
05016 t->gen = 0;
05017 return t->max;
05018 }
05019 }
05020 }
05021
05022 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05023
05024 static VALUE
05025 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05026 {
05027 const unsigned int errc = -1;
05028 unsigned int trans[256];
05029 rb_encoding *enc, *e1, *e2;
05030 struct tr trsrc, trrepl;
05031 int cflag = 0;
05032 unsigned int c, c0, last = 0;
05033 int modify = 0, i, l;
05034 char *s, *send;
05035 VALUE hash = 0;
05036 int singlebyte = single_byte_optimizable(str);
05037 int cr;
05038
05039 #define CHECK_IF_ASCII(c) \
05040 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05041 (cr = ENC_CODERANGE_VALID) : 0)
05042
05043 StringValue(src);
05044 StringValue(repl);
05045 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05046 if (RSTRING_LEN(repl) == 0) {
05047 return rb_str_delete_bang(1, &src, str);
05048 }
05049
05050 cr = ENC_CODERANGE(str);
05051 e1 = rb_enc_check(str, src);
05052 e2 = rb_enc_check(str, repl);
05053 if (e1 == e2) {
05054 enc = e1;
05055 }
05056 else {
05057 enc = rb_enc_check(src, repl);
05058 }
05059 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05060 if (RSTRING_LEN(src) > 1 &&
05061 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05062 trsrc.p + l < trsrc.pend) {
05063 cflag = 1;
05064 trsrc.p += l;
05065 }
05066 trrepl.p = RSTRING_PTR(repl);
05067 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05068 trsrc.gen = trrepl.gen = 0;
05069 trsrc.now = trrepl.now = 0;
05070 trsrc.max = trrepl.max = 0;
05071
05072 if (cflag) {
05073 for (i=0; i<256; i++) {
05074 trans[i] = 1;
05075 }
05076 while ((c = trnext(&trsrc, enc)) != errc) {
05077 if (c < 256) {
05078 trans[c] = errc;
05079 }
05080 else {
05081 if (!hash) hash = rb_hash_new();
05082 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05083 }
05084 }
05085 while ((c = trnext(&trrepl, enc)) != errc)
05086 ;
05087 last = trrepl.now;
05088 for (i=0; i<256; i++) {
05089 if (trans[i] != errc) {
05090 trans[i] = last;
05091 }
05092 }
05093 }
05094 else {
05095 unsigned int r;
05096
05097 for (i=0; i<256; i++) {
05098 trans[i] = errc;
05099 }
05100 while ((c = trnext(&trsrc, enc)) != errc) {
05101 r = trnext(&trrepl, enc);
05102 if (r == errc) r = trrepl.now;
05103 if (c < 256) {
05104 trans[c] = r;
05105 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05106 }
05107 else {
05108 if (!hash) hash = rb_hash_new();
05109 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05110 }
05111 }
05112 }
05113
05114 if (cr == ENC_CODERANGE_VALID)
05115 cr = ENC_CODERANGE_7BIT;
05116 str_modify_keep_cr(str);
05117 s = RSTRING_PTR(str); send = RSTRING_END(str);
05118 if (sflag) {
05119 int clen, tlen;
05120 long offset, max = RSTRING_LEN(str);
05121 unsigned int save = -1;
05122 char *buf = ALLOC_N(char, max), *t = buf;
05123
05124 while (s < send) {
05125 int may_modify = 0;
05126
05127 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05128 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05129
05130 s += clen;
05131 if (c < 256) {
05132 c = trans[c];
05133 }
05134 else if (hash) {
05135 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05136 if (NIL_P(tmp)) {
05137 if (cflag) c = last;
05138 else c = errc;
05139 }
05140 else if (cflag) c = errc;
05141 else c = NUM2INT(tmp);
05142 }
05143 else {
05144 c = errc;
05145 }
05146 if (c != (unsigned int)-1) {
05147 if (save == c) {
05148 CHECK_IF_ASCII(c);
05149 continue;
05150 }
05151 save = c;
05152 tlen = rb_enc_codelen(c, enc);
05153 modify = 1;
05154 }
05155 else {
05156 save = -1;
05157 c = c0;
05158 if (enc != e1) may_modify = 1;
05159 }
05160 while (t - buf + tlen >= max) {
05161 offset = t - buf;
05162 max *= 2;
05163 REALLOC_N(buf, char, max);
05164 t = buf + offset;
05165 }
05166 rb_enc_mbcput(c, t, enc);
05167 if (may_modify && memcmp(s, t, tlen) != 0) {
05168 modify = 1;
05169 }
05170 CHECK_IF_ASCII(c);
05171 t += tlen;
05172 }
05173 if (!STR_EMBED_P(str)) {
05174 xfree(RSTRING(str)->as.heap.ptr);
05175 }
05176 *t = '\0';
05177 RSTRING(str)->as.heap.ptr = buf;
05178 RSTRING(str)->as.heap.len = t - buf;
05179 STR_SET_NOEMBED(str);
05180 RSTRING(str)->as.heap.aux.capa = max;
05181 }
05182 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05183 while (s < send) {
05184 c = (unsigned char)*s;
05185 if (trans[c] != errc) {
05186 if (!cflag) {
05187 c = trans[c];
05188 *s = c;
05189 modify = 1;
05190 }
05191 else {
05192 *s = last;
05193 modify = 1;
05194 }
05195 }
05196 CHECK_IF_ASCII(c);
05197 s++;
05198 }
05199 }
05200 else {
05201 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05202 long offset;
05203 char *buf = ALLOC_N(char, max), *t = buf;
05204
05205 while (s < send) {
05206 int may_modify = 0;
05207 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05208 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05209
05210 if (c < 256) {
05211 c = trans[c];
05212 }
05213 else if (hash) {
05214 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05215 if (NIL_P(tmp)) {
05216 if (cflag) c = last;
05217 else c = errc;
05218 }
05219 else if (cflag) c = errc;
05220 else c = NUM2INT(tmp);
05221 }
05222 else {
05223 c = cflag ? last : errc;
05224 }
05225 if (c != errc) {
05226 tlen = rb_enc_codelen(c, enc);
05227 modify = 1;
05228 }
05229 else {
05230 c = c0;
05231 if (enc != e1) may_modify = 1;
05232 }
05233 while (t - buf + tlen >= max) {
05234 offset = t - buf;
05235 max *= 2;
05236 REALLOC_N(buf, char, max);
05237 t = buf + offset;
05238 }
05239 if (s != t) {
05240 rb_enc_mbcput(c, t, enc);
05241 if (may_modify && memcmp(s, t, tlen) != 0) {
05242 modify = 1;
05243 }
05244 }
05245 CHECK_IF_ASCII(c);
05246 s += clen;
05247 t += tlen;
05248 }
05249 if (!STR_EMBED_P(str)) {
05250 xfree(RSTRING(str)->as.heap.ptr);
05251 }
05252 *t = '\0';
05253 RSTRING(str)->as.heap.ptr = buf;
05254 RSTRING(str)->as.heap.len = t - buf;
05255 STR_SET_NOEMBED(str);
05256 RSTRING(str)->as.heap.aux.capa = max;
05257 }
05258
05259 if (modify) {
05260 if (cr != ENC_CODERANGE_BROKEN)
05261 ENC_CODERANGE_SET(str, cr);
05262 rb_enc_associate(str, enc);
05263 return str;
05264 }
05265 return Qnil;
05266 }
05267
05268
05269
05270
05271
05272
05273
05274
05275
05276
05277
05278 static VALUE
05279 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05280 {
05281 return tr_trans(str, src, repl, 0);
05282 }
05283
05284
05285
05286
05287
05288
05289
05290
05291
05292
05293
05294
05295
05296
05297
05298
05299
05300
05301
05302
05303
05304
05305 static VALUE
05306 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05307 {
05308 str = rb_str_dup(str);
05309 tr_trans(str, src, repl, 0);
05310 return str;
05311 }
05312
05313 #define TR_TABLE_SIZE 257
05314 static void
05315 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05316 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05317 {
05318 const unsigned int errc = -1;
05319 char buf[256];
05320 struct tr tr;
05321 unsigned int c;
05322 VALUE table = 0, ptable = 0;
05323 int i, l, cflag = 0;
05324
05325 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05326 tr.gen = tr.now = tr.max = 0;
05327
05328 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05329 cflag = 1;
05330 tr.p += l;
05331 }
05332 if (first) {
05333 for (i=0; i<256; i++) {
05334 stable[i] = 1;
05335 }
05336 stable[256] = cflag;
05337 }
05338 else if (stable[256] && !cflag) {
05339 stable[256] = 0;
05340 }
05341 for (i=0; i<256; i++) {
05342 buf[i] = cflag;
05343 }
05344
05345 while ((c = trnext(&tr, enc)) != errc) {
05346 if (c < 256) {
05347 buf[c & 0xff] = !cflag;
05348 }
05349 else {
05350 VALUE key = UINT2NUM(c);
05351
05352 if (!table) {
05353 table = rb_hash_new();
05354 if (cflag) {
05355 ptable = *ctablep;
05356 *ctablep = table;
05357 }
05358 else {
05359 ptable = *tablep;
05360 *tablep = table;
05361 }
05362 }
05363 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05364 rb_hash_aset(table, key, Qtrue);
05365 }
05366 }
05367 }
05368 for (i=0; i<256; i++) {
05369 stable[i] = stable[i] && buf[i];
05370 }
05371 }
05372
05373
05374 static int
05375 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05376 {
05377 if (c < 256) {
05378 return table[c] != 0;
05379 }
05380 else {
05381 VALUE v = UINT2NUM(c);
05382
05383 if (del) {
05384 if (!NIL_P(rb_hash_lookup(del, v)) &&
05385 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05386 return TRUE;
05387 }
05388 }
05389 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05390 return FALSE;
05391 }
05392 return table[256] ? TRUE : FALSE;
05393 }
05394 }
05395
05396
05397
05398
05399
05400
05401
05402
05403
05404 static VALUE
05405 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05406 {
05407 char squeez[TR_TABLE_SIZE];
05408 rb_encoding *enc = 0;
05409 char *s, *send, *t;
05410 VALUE del = 0, nodel = 0;
05411 int modify = 0;
05412 int i, ascompat, cr;
05413
05414 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05415 if (argc < 1) {
05416 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05417 }
05418 for (i=0; i<argc; i++) {
05419 VALUE s = argv[i];
05420
05421 StringValue(s);
05422 enc = rb_enc_check(str, s);
05423 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05424 }
05425
05426 str_modify_keep_cr(str);
05427 ascompat = rb_enc_asciicompat(enc);
05428 s = t = RSTRING_PTR(str);
05429 send = RSTRING_END(str);
05430 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05431 while (s < send) {
05432 unsigned int c;
05433 int clen;
05434
05435 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05436 if (squeez[c]) {
05437 modify = 1;
05438 }
05439 else {
05440 if (t != s) *t = c;
05441 t++;
05442 }
05443 s++;
05444 }
05445 else {
05446 c = rb_enc_codepoint_len(s, send, &clen, enc);
05447
05448 if (tr_find(c, squeez, del, nodel)) {
05449 modify = 1;
05450 }
05451 else {
05452 if (t != s) rb_enc_mbcput(c, t, enc);
05453 t += clen;
05454 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05455 }
05456 s += clen;
05457 }
05458 }
05459 *t = '\0';
05460 STR_SET_LEN(str, t - RSTRING_PTR(str));
05461 ENC_CODERANGE_SET(str, cr);
05462
05463 if (modify) return str;
05464 return Qnil;
05465 }
05466
05467
05468
05469
05470
05471
05472
05473
05474
05475
05476
05477
05478
05479
05480
05481
05482 static VALUE
05483 rb_str_delete(int argc, VALUE *argv, VALUE str)
05484 {
05485 str = rb_str_dup(str);
05486 rb_str_delete_bang(argc, argv, str);
05487 return str;
05488 }
05489
05490
05491
05492
05493
05494
05495
05496
05497
05498
05499 static VALUE
05500 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05501 {
05502 char squeez[TR_TABLE_SIZE];
05503 rb_encoding *enc = 0;
05504 VALUE del = 0, nodel = 0;
05505 char *s, *send, *t;
05506 int i, modify = 0;
05507 int ascompat, singlebyte = single_byte_optimizable(str);
05508 unsigned int save;
05509
05510 if (argc == 0) {
05511 enc = STR_ENC_GET(str);
05512 }
05513 else {
05514 for (i=0; i<argc; i++) {
05515 VALUE s = argv[i];
05516
05517 StringValue(s);
05518 enc = rb_enc_check(str, s);
05519 if (singlebyte && !single_byte_optimizable(s))
05520 singlebyte = 0;
05521 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05522 }
05523 }
05524
05525 str_modify_keep_cr(str);
05526 s = t = RSTRING_PTR(str);
05527 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05528 send = RSTRING_END(str);
05529 save = -1;
05530 ascompat = rb_enc_asciicompat(enc);
05531
05532 if (singlebyte) {
05533 while (s < send) {
05534 unsigned int c = *(unsigned char*)s++;
05535 if (c != save || (argc > 0 && !squeez[c])) {
05536 *t++ = save = c;
05537 }
05538 }
05539 } else {
05540 while (s < send) {
05541 unsigned int c;
05542 int clen;
05543
05544 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05545 if (c != save || (argc > 0 && !squeez[c])) {
05546 *t++ = save = c;
05547 }
05548 s++;
05549 }
05550 else {
05551 c = rb_enc_codepoint_len(s, send, &clen, enc);
05552
05553 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05554 if (t != s) rb_enc_mbcput(c, t, enc);
05555 save = c;
05556 t += clen;
05557 }
05558 s += clen;
05559 }
05560 }
05561 }
05562
05563 *t = '\0';
05564 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05565 STR_SET_LEN(str, t - RSTRING_PTR(str));
05566 modify = 1;
05567 }
05568
05569 if (modify) return str;
05570 return Qnil;
05571 }
05572
05573
05574
05575
05576
05577
05578
05579
05580
05581
05582
05583
05584
05585
05586
05587
05588
05589 static VALUE
05590 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05591 {
05592 str = rb_str_dup(str);
05593 rb_str_squeeze_bang(argc, argv, str);
05594 return str;
05595 }
05596
05597
05598
05599
05600
05601
05602
05603
05604
05605
05606 static VALUE
05607 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05608 {
05609 return tr_trans(str, src, repl, 1);
05610 }
05611
05612
05613
05614
05615
05616
05617
05618
05619
05620
05621
05622
05623
05624
05625
05626 static VALUE
05627 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05628 {
05629 str = rb_str_dup(str);
05630 tr_trans(str, src, repl, 1);
05631 return str;
05632 }
05633
05634
05635
05636
05637
05638
05639
05640
05641
05642
05643
05644
05645
05646
05647
05648
05649
05650
05651 static VALUE
05652 rb_str_count(int argc, VALUE *argv, VALUE str)
05653 {
05654 char table[TR_TABLE_SIZE];
05655 rb_encoding *enc = 0;
05656 VALUE del = 0, nodel = 0;
05657 char *s, *send;
05658 int i;
05659 int ascompat;
05660
05661 if (argc < 1) {
05662 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05663 }
05664 for (i=0; i<argc; i++) {
05665 VALUE tstr = argv[i];
05666 unsigned char c;
05667
05668 StringValue(tstr);
05669 enc = rb_enc_check(str, tstr);
05670 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05671 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05672 int n = 0;
05673
05674 s = RSTRING_PTR(str);
05675 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05676 send = RSTRING_END(str);
05677 while (s < send) {
05678 if (*(unsigned char*)s++ == c) n++;
05679 }
05680 return INT2NUM(n);
05681 }
05682 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05683 }
05684
05685 s = RSTRING_PTR(str);
05686 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05687 send = RSTRING_END(str);
05688 ascompat = rb_enc_asciicompat(enc);
05689 i = 0;
05690 while (s < send) {
05691 unsigned int c;
05692
05693 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05694 if (table[c]) {
05695 i++;
05696 }
05697 s++;
05698 }
05699 else {
05700 int clen;
05701 c = rb_enc_codepoint_len(s, send, &clen, enc);
05702 if (tr_find(c, table, del, nodel)) {
05703 i++;
05704 }
05705 s += clen;
05706 }
05707 }
05708
05709 return INT2NUM(i);
05710 }
05711
/*
 * Byte classification table: the entry is 1 for the bytes 0x09..0x0D
 * (HT, LF, VT, FF, CR) and 0x20 (space), and 0 for every other byte.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0, 0,1,1,1,1,1,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x20 */ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xD0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xE0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xF0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
};

/* Table lookup; the argument is reduced to an unsigned char first. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751
05752
05753
05754
05755
05756
05757
05758
05759
05760
05761
05762
05763
05764
05765
05766
05767
05768
05769
05770
05771
05772
05773
05774
05775 static VALUE
05776 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05777 {
05778 rb_encoding *enc;
05779 VALUE spat;
05780 VALUE limit;
05781 enum {awk, string, regexp} split_type;
05782 long beg, end, i = 0;
05783 int lim = 0;
05784 VALUE result, tmp;
05785
05786 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05787 lim = NUM2INT(limit);
05788 if (lim <= 0) limit = Qnil;
05789 else if (lim == 1) {
05790 if (RSTRING_LEN(str) == 0)
05791 return rb_ary_new2(0);
05792 return rb_ary_new3(1, str);
05793 }
05794 i = 1;
05795 }
05796
05797 enc = STR_ENC_GET(str);
05798 if (NIL_P(spat)) {
05799 if (!NIL_P(rb_fs)) {
05800 spat = rb_fs;
05801 goto fs_set;
05802 }
05803 split_type = awk;
05804 }
05805 else {
05806 fs_set:
05807 if (TYPE(spat) == T_STRING) {
05808 rb_encoding *enc2 = STR_ENC_GET(spat);
05809
05810 split_type = string;
05811 if (RSTRING_LEN(spat) == 0) {
05812
05813 spat = rb_reg_regcomp(spat);
05814 split_type = regexp;
05815 }
05816 else if (rb_enc_asciicompat(enc2) == 1) {
05817 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05818 split_type = awk;
05819 }
05820 }
05821 else {
05822 int l;
05823 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05824 RSTRING_LEN(spat) == l) {
05825 split_type = awk;
05826 }
05827 }
05828 }
05829 else {
05830 spat = get_pat(spat, 1);
05831 split_type = regexp;
05832 }
05833 }
05834
05835 result = rb_ary_new();
05836 beg = 0;
05837 if (split_type == awk) {
05838 char *ptr = RSTRING_PTR(str);
05839 char *eptr = RSTRING_END(str);
05840 char *bptr = ptr;
05841 int skip = 1;
05842 unsigned int c;
05843
05844 end = beg;
05845 if (is_ascii_string(str)) {
05846 while (ptr < eptr) {
05847 c = (unsigned char)*ptr++;
05848 if (skip) {
05849 if (ascii_isspace(c)) {
05850 beg = ptr - bptr;
05851 }
05852 else {
05853 end = ptr - bptr;
05854 skip = 0;
05855 if (!NIL_P(limit) && lim <= i) break;
05856 }
05857 }
05858 else if (ascii_isspace(c)) {
05859 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05860 skip = 1;
05861 beg = ptr - bptr;
05862 if (!NIL_P(limit)) ++i;
05863 }
05864 else {
05865 end = ptr - bptr;
05866 }
05867 }
05868 }
05869 else {
05870 while (ptr < eptr) {
05871 int n;
05872
05873 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05874 ptr += n;
05875 if (skip) {
05876 if (rb_isspace(c)) {
05877 beg = ptr - bptr;
05878 }
05879 else {
05880 end = ptr - bptr;
05881 skip = 0;
05882 if (!NIL_P(limit) && lim <= i) break;
05883 }
05884 }
05885 else if (rb_isspace(c)) {
05886 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05887 skip = 1;
05888 beg = ptr - bptr;
05889 if (!NIL_P(limit)) ++i;
05890 }
05891 else {
05892 end = ptr - bptr;
05893 }
05894 }
05895 }
05896 }
05897 else if (split_type == string) {
05898 char *ptr = RSTRING_PTR(str);
05899 char *temp = ptr;
05900 char *eptr = RSTRING_END(str);
05901 char *sptr = RSTRING_PTR(spat);
05902 long slen = RSTRING_LEN(spat);
05903
05904 if (is_broken_string(str)) {
05905 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05906 }
05907 if (is_broken_string(spat)) {
05908 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05909 }
05910 enc = rb_enc_check(str, spat);
05911 while (ptr < eptr &&
05912 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05913
05914 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05915 if (t != ptr + end) {
05916 ptr = t;
05917 continue;
05918 }
05919 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05920 ptr += end + slen;
05921 if (!NIL_P(limit) && lim <= ++i) break;
05922 }
05923 beg = ptr - temp;
05924 }
05925 else {
05926 char *ptr = RSTRING_PTR(str);
05927 long len = RSTRING_LEN(str);
05928 long start = beg;
05929 long idx;
05930 int last_null = 0;
05931 struct re_registers *regs;
05932
05933 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05934 regs = RMATCH_REGS(rb_backref_get());
05935 if (start == end && BEG(0) == END(0)) {
05936 if (!ptr) {
05937 rb_ary_push(result, str_new_empty(str));
05938 break;
05939 }
05940 else if (last_null == 1) {
05941 rb_ary_push(result, rb_str_subseq(str, beg,
05942 rb_enc_fast_mbclen(ptr+beg,
05943 ptr+len,
05944 enc)));
05945 beg = start;
05946 }
05947 else {
05948 if (ptr+start == ptr+len)
05949 start++;
05950 else
05951 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05952 last_null = 1;
05953 continue;
05954 }
05955 }
05956 else {
05957 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05958 beg = start = END(0);
05959 }
05960 last_null = 0;
05961
05962 for (idx=1; idx < regs->num_regs; idx++) {
05963 if (BEG(idx) == -1) continue;
05964 if (BEG(idx) == END(idx))
05965 tmp = str_new_empty(str);
05966 else
05967 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05968 rb_ary_push(result, tmp);
05969 }
05970 if (!NIL_P(limit) && lim <= ++i) break;
05971 }
05972 }
05973 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05974 if (RSTRING_LEN(str) == beg)
05975 tmp = str_new_empty(str);
05976 else
05977 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05978 rb_ary_push(result, tmp);
05979 }
05980 if (NIL_P(limit) && lim == 0) {
05981 long len;
05982 while ((len = RARRAY_LEN(result)) > 0 &&
05983 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05984 rb_ary_pop(result);
05985 }
05986
05987 return result;
05988 }
05989
05990 VALUE
05991 rb_str_split(VALUE str, const char *sep0)
05992 {
05993 VALUE sep;
05994
05995 StringValue(str);
05996 sep = rb_str_new2(sep0);
05997 return rb_str_split_m(1, &sep, str);
05998 }
05999
06000
06001
06002
06003
06004
06005
06006
06007
06008
06009
06010
06011
06012
06013
06014
06015
06016
06017
06018
06019
06020
06021
06022
06023
06024
06025
06026
06027
06028
06029
06030
06031
06032
06033
06034
06035
06036
06037
06038 static VALUE
06039 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06040 {
06041 rb_encoding *enc;
06042 VALUE rs;
06043 unsigned int newline;
06044 const char *p, *pend, *s, *ptr;
06045 long len, rslen;
06046 VALUE line;
06047 int n;
06048 VALUE orig = str;
06049
06050 if (argc == 0) {
06051 rs = rb_rs;
06052 }
06053 else {
06054 rb_scan_args(argc, argv, "01", &rs);
06055 }
06056 RETURN_ENUMERATOR(str, argc, argv);
06057 if (NIL_P(rs)) {
06058 rb_yield(str);
06059 return orig;
06060 }
06061 str = rb_str_new4(str);
06062 ptr = p = s = RSTRING_PTR(str);
06063 pend = p + RSTRING_LEN(str);
06064 len = RSTRING_LEN(str);
06065 StringValue(rs);
06066 if (rs == rb_default_rs) {
06067 enc = rb_enc_get(str);
06068 while (p < pend) {
06069 char *p0;
06070
06071 p = memchr(p, '\n', pend - p);
06072 if (!p) break;
06073 p0 = rb_enc_left_char_head(s, p, pend, enc);
06074 if (!rb_enc_is_newline(p0, pend, enc)) {
06075 p++;
06076 continue;
06077 }
06078 p = p0 + rb_enc_mbclen(p0, pend, enc);
06079 line = rb_str_new5(str, s, p - s);
06080 OBJ_INFECT(line, str);
06081 rb_enc_cr_str_copy_for_substr(line, str);
06082 rb_yield(line);
06083 str_mod_check(str, ptr, len);
06084 s = p;
06085 }
06086 goto finish;
06087 }
06088
06089 enc = rb_enc_check(str, rs);
06090 rslen = RSTRING_LEN(rs);
06091 if (rslen == 0) {
06092 newline = '\n';
06093 }
06094 else {
06095 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06096 }
06097
06098 while (p < pend) {
06099 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06100
06101 again:
06102 if (rslen == 0 && c == newline) {
06103 p += n;
06104 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06105 goto again;
06106 }
06107 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06108 p += n;
06109 }
06110 p -= n;
06111 }
06112 if (c == newline &&
06113 (rslen <= 1 ||
06114 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06115 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
06116 OBJ_INFECT(line, str);
06117 rb_enc_cr_str_copy_for_substr(line, str);
06118 rb_yield(line);
06119 str_mod_check(str, ptr, len);
06120 s = p + (rslen ? rslen : n);
06121 }
06122 p += n;
06123 }
06124
06125 finish:
06126 if (s != pend) {
06127 line = rb_str_new5(str, s, pend - s);
06128 OBJ_INFECT(line, str);
06129 rb_enc_cr_str_copy_for_substr(line, str);
06130 rb_yield(line);
06131 }
06132
06133 return orig;
06134 }
06135
06136
06137
06138
06139
06140
06141
06142
06143
06144
06145
06146
06147
06148
06149
06150
06151
06152
06153
06154
06155 static VALUE
06156 rb_str_each_byte(VALUE str)
06157 {
06158 long i;
06159
06160 RETURN_ENUMERATOR(str, 0, 0);
06161 for (i=0; i<RSTRING_LEN(str); i++) {
06162 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06163 }
06164 return str;
06165 }
06166
06167
06168
06169
06170
06171
06172
06173
06174
06175
06176
06177
06178
06179
06180
06181
06182
06183
06184
06185
06186 static VALUE
06187 rb_str_each_char(VALUE str)
06188 {
06189 VALUE orig = str;
06190 long i, len, n;
06191 const char *ptr;
06192 rb_encoding *enc;
06193
06194 RETURN_ENUMERATOR(str, 0, 0);
06195 str = rb_str_new4(str);
06196 ptr = RSTRING_PTR(str);
06197 len = RSTRING_LEN(str);
06198 enc = rb_enc_get(str);
06199 switch (ENC_CODERANGE(str)) {
06200 case ENC_CODERANGE_VALID:
06201 case ENC_CODERANGE_7BIT:
06202 for (i = 0; i < len; i += n) {
06203 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06204 rb_yield(rb_str_subseq(str, i, n));
06205 }
06206 break;
06207 default:
06208 for (i = 0; i < len; i += n) {
06209 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06210 rb_yield(rb_str_subseq(str, i, n));
06211 }
06212 }
06213 return orig;
06214 }
06215
06216
06217
06218
06219
06220
06221
06222
06223
06224
06225
06226
06227
06228
06229
06230
06231
06232
06233
06234
06235
06236
06237 static VALUE
06238 rb_str_each_codepoint(VALUE str)
06239 {
06240 VALUE orig = str;
06241 int n;
06242 unsigned int c;
06243 const char *ptr, *end;
06244 rb_encoding *enc;
06245
06246 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06247 RETURN_ENUMERATOR(str, 0, 0);
06248 str = rb_str_new4(str);
06249 ptr = RSTRING_PTR(str);
06250 end = RSTRING_END(str);
06251 enc = STR_ENC_GET(str);
06252 while (ptr < end) {
06253 c = rb_enc_codepoint_len(ptr, end, &n, enc);
06254 rb_yield(UINT2NUM(c));
06255 ptr += n;
06256 }
06257 return orig;
06258 }
06259
06260 static long
06261 chopped_length(VALUE str)
06262 {
06263 rb_encoding *enc = STR_ENC_GET(str);
06264 const char *p, *p2, *beg, *end;
06265
06266 beg = RSTRING_PTR(str);
06267 end = beg + RSTRING_LEN(str);
06268 if (beg > end) return 0;
06269 p = rb_enc_prev_char(beg, end, end, enc);
06270 if (!p) return 0;
06271 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06272 p2 = rb_enc_prev_char(beg, p, end, enc);
06273 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06274 }
06275 return p - beg;
06276 }
06277
06278
06279
06280
06281
06282
06283
06284
06285
06286
06287 static VALUE
06288 rb_str_chop_bang(VALUE str)
06289 {
06290 str_modify_keep_cr(str);
06291 if (RSTRING_LEN(str) > 0) {
06292 long len;
06293 len = chopped_length(str);
06294 STR_SET_LEN(str, len);
06295 RSTRING_PTR(str)[len] = '\0';
06296 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06297 ENC_CODERANGE_CLEAR(str);
06298 }
06299 return str;
06300 }
06301 return Qnil;
06302 }
06303
06304
06305
06306
06307
06308
06309
06310
06311
06312
06313
06314
06315
06316
06317
06318
06319
06320
06321
06322 static VALUE
06323 rb_str_chop(VALUE str)
06324 {
06325 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06326 rb_enc_cr_str_copy_for_substr(str2, str);
06327 OBJ_INFECT(str2, str);
06328 return str2;
06329 }
06330
06331
06332
06333
06334
06335
06336
06337
06338
06339
06340 static VALUE
06341 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06342 {
06343 rb_encoding *enc;
06344 VALUE rs;
06345 int newline;
06346 char *p, *pp, *e;
06347 long len, rslen;
06348
06349 str_modify_keep_cr(str);
06350 len = RSTRING_LEN(str);
06351 if (len == 0) return Qnil;
06352 p = RSTRING_PTR(str);
06353 e = p + len;
06354 if (argc == 0) {
06355 rs = rb_rs;
06356 if (rs == rb_default_rs) {
06357 smart_chomp:
06358 enc = rb_enc_get(str);
06359 if (rb_enc_mbminlen(enc) > 1) {
06360 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06361 if (rb_enc_is_newline(pp, e, enc)) {
06362 e = pp;
06363 }
06364 pp = e - rb_enc_mbminlen(enc);
06365 if (pp >= p) {
06366 pp = rb_enc_left_char_head(p, pp, e, enc);
06367 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06368 e = pp;
06369 }
06370 }
06371 if (e == RSTRING_END(str)) {
06372 return Qnil;
06373 }
06374 len = e - RSTRING_PTR(str);
06375 STR_SET_LEN(str, len);
06376 }
06377 else {
06378 if (RSTRING_PTR(str)[len-1] == '\n') {
06379 STR_DEC_LEN(str);
06380 if (RSTRING_LEN(str) > 0 &&
06381 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06382 STR_DEC_LEN(str);
06383 }
06384 }
06385 else if (RSTRING_PTR(str)[len-1] == '\r') {
06386 STR_DEC_LEN(str);
06387 }
06388 else {
06389 return Qnil;
06390 }
06391 }
06392 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06393 return str;
06394 }
06395 }
06396 else {
06397 rb_scan_args(argc, argv, "01", &rs);
06398 }
06399 if (NIL_P(rs)) return Qnil;
06400 StringValue(rs);
06401 rslen = RSTRING_LEN(rs);
06402 if (rslen == 0) {
06403 while (len>0 && p[len-1] == '\n') {
06404 len--;
06405 if (len>0 && p[len-1] == '\r')
06406 len--;
06407 }
06408 if (len < RSTRING_LEN(str)) {
06409 STR_SET_LEN(str, len);
06410 RSTRING_PTR(str)[len] = '\0';
06411 return str;
06412 }
06413 return Qnil;
06414 }
06415 if (rslen > len) return Qnil;
06416 newline = RSTRING_PTR(rs)[rslen-1];
06417 if (rslen == 1 && newline == '\n')
06418 goto smart_chomp;
06419
06420 enc = rb_enc_check(str, rs);
06421 if (is_broken_string(rs)) {
06422 return Qnil;
06423 }
06424 pp = e - rslen;
06425 if (p[len-1] == newline &&
06426 (rslen <= 1 ||
06427 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06428 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06429 return Qnil;
06430 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06431 ENC_CODERANGE_CLEAR(str);
06432 }
06433 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06434 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06435 return str;
06436 }
06437 return Qnil;
06438 }
06439
06440
06441
06442
06443
06444
06445
06446
06447
06448
06449
06450
06451
06452
06453
06454
06455
06456
06457
06458
06459
06460 static VALUE
06461 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06462 {
06463 str = rb_str_dup(str);
06464 rb_str_chomp_bang(argc, argv, str);
06465 return str;
06466 }
06467
06468
06469
06470
06471
06472
06473
06474
06475
06476
06477
06478
06479
06480 static VALUE
06481 rb_str_lstrip_bang(VALUE str)
06482 {
06483 rb_encoding *enc;
06484 char *s, *t, *e;
06485
06486 str_modify_keep_cr(str);
06487 enc = STR_ENC_GET(str);
06488 s = RSTRING_PTR(str);
06489 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06490 e = t = RSTRING_END(str);
06491
06492 while (s < e) {
06493 int n;
06494 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06495
06496 if (!rb_isspace(cc)) break;
06497 s += n;
06498 }
06499
06500 if (s > RSTRING_PTR(str)) {
06501 STR_SET_LEN(str, t-s);
06502 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06503 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06504 return str;
06505 }
06506 return Qnil;
06507 }
06508
06509
06510
06511
06512
06513
06514
06515
06516
06517
06518
06519
06520
06521 static VALUE
06522 rb_str_lstrip(VALUE str)
06523 {
06524 str = rb_str_dup(str);
06525 rb_str_lstrip_bang(str);
06526 return str;
06527 }
06528
06529
06530
06531
06532
06533
06534
06535
06536
06537
06538
06539
06540
06541
06542 static VALUE
06543 rb_str_rstrip_bang(VALUE str)
06544 {
06545 rb_encoding *enc;
06546 char *s, *t, *e;
06547
06548 str_modify_keep_cr(str);
06549 enc = STR_ENC_GET(str);
06550 rb_str_check_dummy_enc(enc);
06551 s = RSTRING_PTR(str);
06552 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06553 t = e = RSTRING_END(str);
06554
06555
06556 if (single_byte_optimizable(str)) {
06557 unsigned char c;
06558 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06559 }
06560 else {
06561 char *tp;
06562
06563 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06564 unsigned int c = rb_enc_codepoint(tp, e, enc);
06565 if (c && !rb_isspace(c)) break;
06566 t = tp;
06567 }
06568 }
06569 if (t < e) {
06570 long len = t-RSTRING_PTR(str);
06571
06572 STR_SET_LEN(str, len);
06573 RSTRING_PTR(str)[len] = '\0';
06574 return str;
06575 }
06576 return Qnil;
06577 }
06578
06579
06580
06581
06582
06583
06584
06585
06586
06587
06588
06589
06590
06591 static VALUE
06592 rb_str_rstrip(VALUE str)
06593 {
06594 str = rb_str_dup(str);
06595 rb_str_rstrip_bang(str);
06596 return str;
06597 }
06598
06599
06600
06601
06602
06603
06604
06605
06606
06607
06608 static VALUE
06609 rb_str_strip_bang(VALUE str)
06610 {
06611 VALUE l = rb_str_lstrip_bang(str);
06612 VALUE r = rb_str_rstrip_bang(str);
06613
06614 if (NIL_P(l) && NIL_P(r)) return Qnil;
06615 return str;
06616 }
06617
06618
06619
06620
06621
06622
06623
06624
06625
06626
06627
06628
06629 static VALUE
06630 rb_str_strip(VALUE str)
06631 {
06632 str = rb_str_dup(str);
06633 rb_str_strip_bang(str);
06634 return str;
06635 }
06636
06637 static VALUE
06638 scan_once(VALUE str, VALUE pat, long *start)
06639 {
06640 VALUE result, match;
06641 struct re_registers *regs;
06642 int i;
06643
06644 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06645 match = rb_backref_get();
06646 regs = RMATCH_REGS(match);
06647 if (BEG(0) == END(0)) {
06648 rb_encoding *enc = STR_ENC_GET(str);
06649
06650
06651
06652 if (RSTRING_LEN(str) > END(0))
06653 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06654 RSTRING_END(str), enc);
06655 else
06656 *start = END(0)+1;
06657 }
06658 else {
06659 *start = END(0);
06660 }
06661 if (regs->num_regs == 1) {
06662 return rb_reg_nth_match(0, match);
06663 }
06664 result = rb_ary_new2(regs->num_regs);
06665 for (i=1; i < regs->num_regs; i++) {
06666 rb_ary_push(result, rb_reg_nth_match(i, match));
06667 }
06668
06669 return result;
06670 }
06671 return Qnil;
06672 }
06673
06674
06675
06676
06677
06678
06679
06680
06681
06682
06683
06684
06685
06686
06687
06688
06689
06690
06691
06692
06693
06694
06695
06696
06697
06698
06699
06700
06701
06702
06703
06704
06705
06706 static VALUE
06707 rb_str_scan(VALUE str, VALUE pat)
06708 {
06709 VALUE result;
06710 long start = 0;
06711 long last = -1, prev = 0;
06712 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06713
06714 pat = get_pat(pat, 1);
06715 if (!rb_block_given_p()) {
06716 VALUE ary = rb_ary_new();
06717
06718 while (!NIL_P(result = scan_once(str, pat, &start))) {
06719 last = prev;
06720 prev = start;
06721 rb_ary_push(ary, result);
06722 }
06723 if (last >= 0) rb_reg_search(pat, str, last, 0);
06724 return ary;
06725 }
06726
06727 while (!NIL_P(result = scan_once(str, pat, &start))) {
06728 last = prev;
06729 prev = start;
06730 rb_yield(result);
06731 str_mod_check(str, p, len);
06732 }
06733 if (last >= 0) rb_reg_search(pat, str, last, 0);
06734 return str;
06735 }
06736
06737
06738
06739
06740
06741
06742
06743
06744
06745
06746
06747
06748
06749
06750
06751
06752 static VALUE
06753 rb_str_hex(VALUE str)
06754 {
06755 rb_encoding *enc = rb_enc_get(str);
06756
06757 if (!rb_enc_asciicompat(enc)) {
06758 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06759 }
06760 return rb_str_to_inum(str, 16, FALSE);
06761 }
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772
06773
06774
06775
06776
06777
06778 static VALUE
06779 rb_str_oct(VALUE str)
06780 {
06781 rb_encoding *enc = rb_enc_get(str);
06782
06783 if (!rb_enc_asciicompat(enc)) {
06784 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06785 }
06786 return rb_str_to_inum(str, -8, FALSE);
06787 }
06788
06789
06790
06791
06792
06793
06794
06795
06796
06797
06798
06799
06800 static VALUE
06801 rb_str_crypt(VALUE str, VALUE salt)
06802 {
06803 extern char *crypt(const char *, const char *);
06804 VALUE result;
06805 const char *s, *saltp;
06806 char *res;
06807 #ifdef BROKEN_CRYPT
06808 char salt_8bit_clean[3];
06809 #endif
06810
06811 StringValue(salt);
06812 if (RSTRING_LEN(salt) < 2)
06813 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06814
06815 s = RSTRING_PTR(str);
06816 if (!s) s = "";
06817 saltp = RSTRING_PTR(salt);
06818 #ifdef BROKEN_CRYPT
06819 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06820 salt_8bit_clean[0] = saltp[0] & 0x7f;
06821 salt_8bit_clean[1] = saltp[1] & 0x7f;
06822 salt_8bit_clean[2] = '\0';
06823 saltp = salt_8bit_clean;
06824 }
06825 #endif
06826 res = crypt(s, saltp);
06827 if (!res) {
06828 rb_sys_fail("crypt");
06829 }
06830 result = rb_str_new2(res);
06831 OBJ_INFECT(result, str);
06832 OBJ_INFECT(result, salt);
06833 return result;
06834 }
06835
06836
06837
06838
06839
06840
06841
06842
06843
06844
06845
06846
06847
06848
06849
06850
06851
06852
06853
06854
06855
06856
06857 VALUE
06858 rb_str_intern(VALUE s)
06859 {
06860 VALUE str = RB_GC_GUARD(s);
06861 ID id;
06862
06863 id = rb_intern_str(str);
06864 return ID2SYM(id);
06865 }
06866
06867
06868
06869
06870
06871
06872
06873
06874
06875
06876
06877 VALUE
06878 rb_str_ord(VALUE s)
06879 {
06880 unsigned int c;
06881
06882 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06883 return UINT2NUM(c);
06884 }
06885
06886
06887
06888
06889
06890
06891
06892
06893
06894
06895
06896 static VALUE
06897 rb_str_sum(int argc, VALUE *argv, VALUE str)
06898 {
06899 VALUE vbits;
06900 int bits;
06901 char *ptr, *p, *pend;
06902 long len;
06903 VALUE sum = INT2FIX(0);
06904 unsigned long sum0 = 0;
06905
06906 if (argc == 0) {
06907 bits = 16;
06908 }
06909 else {
06910 rb_scan_args(argc, argv, "01", &vbits);
06911 bits = NUM2INT(vbits);
06912 }
06913 ptr = p = RSTRING_PTR(str);
06914 len = RSTRING_LEN(str);
06915 pend = p + len;
06916
06917 while (p < pend) {
06918 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06919 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06920 str_mod_check(str, ptr, len);
06921 sum0 = 0;
06922 }
06923 sum0 += (unsigned char)*p;
06924 p++;
06925 }
06926
06927 if (bits == 0) {
06928 if (sum0) {
06929 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06930 }
06931 }
06932 else {
06933 if (sum == INT2FIX(0)) {
06934 if (bits < (int)sizeof(long)*CHAR_BIT) {
06935 sum0 &= (((unsigned long)1)<<bits)-1;
06936 }
06937 sum = LONG2FIX(sum0);
06938 }
06939 else {
06940 VALUE mod;
06941
06942 if (sum0) {
06943 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06944 }
06945
06946 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06947 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06948 sum = rb_funcall(sum, '&', 1, mod);
06949 }
06950 }
06951 return sum;
06952 }
06953
06954 static VALUE
06955 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06956 {
06957 rb_encoding *enc;
06958 VALUE w;
06959 long width, len, flen = 1, fclen = 1;
06960 VALUE res;
06961 char *p;
06962 const char *f = " ";
06963 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06964 volatile VALUE pad;
06965 int singlebyte = 1, cr;
06966
06967 rb_scan_args(argc, argv, "11", &w, &pad);
06968 enc = STR_ENC_GET(str);
06969 width = NUM2LONG(w);
06970 if (argc == 2) {
06971 StringValue(pad);
06972 enc = rb_enc_check(str, pad);
06973 f = RSTRING_PTR(pad);
06974 flen = RSTRING_LEN(pad);
06975 fclen = str_strlen(pad, enc);
06976 singlebyte = single_byte_optimizable(pad);
06977 if (flen == 0 || fclen == 0) {
06978 rb_raise(rb_eArgError, "zero width padding");
06979 }
06980 }
06981 len = str_strlen(str, enc);
06982 if (width < 0 || len >= width) return rb_str_dup(str);
06983 n = width - len;
06984 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06985 rlen = n - llen;
06986 cr = ENC_CODERANGE(str);
06987 if (flen > 1) {
06988 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06989 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06990 }
06991 size = RSTRING_LEN(str);
06992 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06993 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06994 (len += llen2 + rlen2) >= LONG_MAX - size) {
06995 rb_raise(rb_eArgError, "argument too big");
06996 }
06997 len += size;
06998 res = rb_str_new5(str, 0, len);
06999 p = RSTRING_PTR(res);
07000 if (flen <= 1) {
07001 memset(p, *f, llen);
07002 p += llen;
07003 }
07004 else {
07005 while (llen >= fclen) {
07006 memcpy(p,f,flen);
07007 p += flen;
07008 llen -= fclen;
07009 }
07010 if (llen > 0) {
07011 memcpy(p, f, llen2);
07012 p += llen2;
07013 }
07014 }
07015 memcpy(p, RSTRING_PTR(str), size);
07016 p += size;
07017 if (flen <= 1) {
07018 memset(p, *f, rlen);
07019 p += rlen;
07020 }
07021 else {
07022 while (rlen >= fclen) {
07023 memcpy(p,f,flen);
07024 p += flen;
07025 rlen -= fclen;
07026 }
07027 if (rlen > 0) {
07028 memcpy(p, f, rlen2);
07029 p += rlen2;
07030 }
07031 }
07032 *p = '\0';
07033 STR_SET_LEN(res, p-RSTRING_PTR(res));
07034 OBJ_INFECT(res, str);
07035 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07036 rb_enc_associate(res, enc);
07037 if (argc == 2)
07038 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07039 if (cr != ENC_CODERANGE_BROKEN)
07040 ENC_CODERANGE_SET(res, cr);
07041 return res;
07042 }
07043
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058 static VALUE
07059 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07060 {
07061 return rb_str_justify(argc, argv, str, 'l');
07062 }
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074
07075
07076
07077
07078 static VALUE
07079 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07080 {
07081 return rb_str_justify(argc, argv, str, 'r');
07082 }
07083
07084
07085
07086
07087
07088
07089
07090
07091
07092
07093
07094
07095
07096
07097
07098 static VALUE
07099 rb_str_center(int argc, VALUE *argv, VALUE str)
07100 {
07101 return rb_str_justify(argc, argv, str, 'c');
07102 }
07103
07104
07105
07106
07107
07108
07109
07110
07111
07112
07113
07114
07115
07116
07117
07118
07119 static VALUE
07120 rb_str_partition(VALUE str, VALUE sep)
07121 {
07122 long pos;
07123 int regex = FALSE;
07124
07125 if (TYPE(sep) == T_REGEXP) {
07126 pos = rb_reg_search(sep, str, 0, 0);
07127 regex = TRUE;
07128 }
07129 else {
07130 VALUE tmp;
07131
07132 tmp = rb_check_string_type(sep);
07133 if (NIL_P(tmp)) {
07134 rb_raise(rb_eTypeError, "type mismatch: %s given",
07135 rb_obj_classname(sep));
07136 }
07137 sep = tmp;
07138 pos = rb_str_index(str, sep, 0);
07139 }
07140 if (pos < 0) {
07141 failed:
07142 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07143 }
07144 if (regex) {
07145 sep = rb_str_subpat(str, sep, INT2FIX(0));
07146 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07147 }
07148 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07149 sep,
07150 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07151 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07152 }
07153
07154
07155
07156
07157
07158
07159
07160
07161
07162
07163
07164
07165
07166
07167
07168
07169 static VALUE
07170 rb_str_rpartition(VALUE str, VALUE sep)
07171 {
07172 long pos = RSTRING_LEN(str);
07173 int regex = FALSE;
07174
07175 if (TYPE(sep) == T_REGEXP) {
07176 pos = rb_reg_search(sep, str, pos, 1);
07177 regex = TRUE;
07178 }
07179 else {
07180 VALUE tmp;
07181
07182 tmp = rb_check_string_type(sep);
07183 if (NIL_P(tmp)) {
07184 rb_raise(rb_eTypeError, "type mismatch: %s given",
07185 rb_obj_classname(sep));
07186 }
07187 sep = tmp;
07188 pos = rb_str_sublen(str, pos);
07189 pos = rb_str_rindex(str, sep, pos);
07190 }
07191 if (pos < 0) {
07192 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07193 }
07194 if (regex) {
07195 sep = rb_reg_nth_match(0, rb_backref_get());
07196 }
07197 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07198 sep,
07199 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07200 }
07201
07202
07203
07204
07205
07206
07207
07208
07209
07210
07211
07212
07213
07214
07215
07216
07217
07218 static VALUE
07219 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07220 {
07221 int i;
07222
07223 for (i=0; i<argc; i++) {
07224 VALUE tmp = rb_check_string_type(argv[i]);
07225 if (NIL_P(tmp)) continue;
07226 rb_enc_check(str, tmp);
07227 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07228 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07229 return Qtrue;
07230 }
07231 return Qfalse;
07232 }
07233
07234
07235
07236
07237
07238
07239
07240
07241 static VALUE
07242 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07243 {
07244 int i;
07245 char *p, *s, *e;
07246 rb_encoding *enc;
07247
07248 for (i=0; i<argc; i++) {
07249 VALUE tmp = rb_check_string_type(argv[i]);
07250 if (NIL_P(tmp)) continue;
07251 enc = rb_enc_check(str, tmp);
07252 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07253 p = RSTRING_PTR(str);
07254 e = p + RSTRING_LEN(str);
07255 s = e - RSTRING_LEN(tmp);
07256 if (rb_enc_left_char_head(p, s, e, enc) != s)
07257 continue;
07258 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07259 return Qtrue;
07260 }
07261 return Qfalse;
07262 }
07263
07264 void
07265 rb_str_setter(VALUE val, ID id, VALUE *var)
07266 {
07267 if (!NIL_P(val) && TYPE(val) != T_STRING) {
07268 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07269 }
07270 *var = val;
07271 }
07272
07273
07274
07275
07276
07277
07278
07279
07280
07281 static VALUE
07282 rb_str_force_encoding(VALUE str, VALUE enc)
07283 {
07284 str_modifiable(str);
07285 rb_enc_associate(str, rb_to_encoding(enc));
07286 ENC_CODERANGE_CLEAR(str);
07287 return str;
07288 }
07289
07290
07291
07292
07293
07294
07295
07296
07297
07298
07299
07300
07301 static VALUE
07302 rb_str_valid_encoding_p(VALUE str)
07303 {
07304 int cr = rb_enc_str_coderange(str);
07305
07306 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07307 }
07308
07309
07310
07311
07312
07313
07314
07315
07316
07317
07318
07319 static VALUE
07320 rb_str_is_ascii_only_p(VALUE str)
07321 {
07322 int cr = rb_enc_str_coderange(str);
07323
07324 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07325 }
07326
07341 VALUE
07342 rb_str_ellipsize(VALUE str, long len)
07343 {
07344 static const char ellipsis[] = "...";
07345 const long ellipsislen = sizeof(ellipsis) - 1;
07346 rb_encoding *const enc = rb_enc_get(str);
07347 const long blen = RSTRING_LEN(str);
07348 const char *const p = RSTRING_PTR(str), *e = p + blen;
07349 VALUE estr, ret = 0;
07350
07351 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07352 if (len * rb_enc_mbminlen(enc) >= blen ||
07353 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07354 ret = str;
07355 }
07356 else if (len <= ellipsislen ||
07357 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07358 if (rb_enc_asciicompat(enc)) {
07359 ret = rb_str_new_with_class(str, ellipsis, len);
07360 rb_enc_associate(ret, enc);
07361 }
07362 else {
07363 estr = rb_usascii_str_new(ellipsis, len);
07364 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07365 }
07366 }
07367 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07368 rb_str_cat(ret, ellipsis, ellipsislen);
07369 }
07370 else {
07371 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07372 rb_enc_from_encoding(enc), 0, Qnil);
07373 rb_str_append(ret, estr);
07374 }
07375 return ret;
07376 }
07377
07378
07379
07380
07381
07382
07383
07384
07385
07386
07387
07388
07389
07390
07391
07392
07393
07394
07395
07396
07397
07398
07399
07400
07401
07402
07403
07404
07405
07406
07407
07408
07409
07410
07411
07412
07413
07414
07415
07416
07417
07418
07419
07420 static VALUE
07421 sym_equal(VALUE sym1, VALUE sym2)
07422 {
07423 if (sym1 == sym2) return Qtrue;
07424 return Qfalse;
07425 }
07426
07427
07428 static int
07429 sym_printable(const char *s, const char *send, rb_encoding *enc)
07430 {
07431 while (s < send) {
07432 int n;
07433 int c = rb_enc_codepoint_len(s, send, &n, enc);
07434
07435 if (!rb_enc_isprint(c, enc)) return FALSE;
07436 s += n;
07437 }
07438 return TRUE;
07439 }
07440
07441
07442
07443
07444
07445
07446
07447
07448
07449
07450 static VALUE
07451 sym_inspect(VALUE sym)
07452 {
07453 VALUE str;
07454 ID id = SYM2ID(sym);
07455 rb_encoding *enc;
07456 const char *ptr;
07457 long len;
07458 char *dest;
07459 rb_encoding *resenc = rb_default_internal_encoding();
07460
07461 if (resenc == NULL) resenc = rb_default_external_encoding();
07462 sym = rb_id2str(id);
07463 enc = STR_ENC_GET(sym);
07464 ptr = RSTRING_PTR(sym);
07465 len = RSTRING_LEN(sym);
07466 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07467 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07468 str = rb_str_inspect(sym);
07469 len = RSTRING_LEN(str);
07470 rb_str_resize(str, len + 1);
07471 dest = RSTRING_PTR(str);
07472 memmove(dest + 1, dest, len);
07473 dest[0] = ':';
07474 }
07475 else {
07476 char *dest;
07477 str = rb_enc_str_new(0, len + 1, enc);
07478 dest = RSTRING_PTR(str);
07479 dest[0] = ':';
07480 memcpy(dest + 1, ptr, len);
07481 }
07482 return str;
07483 }
07484
07485
07486
07487
07488
07489
07490
07491
07492
07493
07494
07495
07496
07497 VALUE
07498 rb_sym_to_s(VALUE sym)
07499 {
07500 ID id = SYM2ID(sym);
07501
07502 return str_new3(rb_cString, rb_id2str(id));
07503 }
07504
07505
07506
07507
07508
07509
07510
07511
07512
07513
07514
07515
07516 static VALUE
07517 sym_to_sym(VALUE sym)
07518 {
07519 return sym;
07520 }
07521
07522 static VALUE
07523 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07524 {
07525 VALUE obj;
07526
07527 if (argc < 1) {
07528 rb_raise(rb_eArgError, "no receiver given");
07529 }
07530 obj = argv[0];
07531 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07532 }
07533
07534
07535
07536
07537
07538
07539
07540
07541
07542
07543 static VALUE
07544 sym_to_proc(VALUE sym)
07545 {
07546 static VALUE sym_proc_cache = Qfalse;
07547 enum {SYM_PROC_CACHE_SIZE = 67};
07548 VALUE proc;
07549 long id, index;
07550 VALUE *aryp;
07551
07552 if (!sym_proc_cache) {
07553 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07554 rb_gc_register_mark_object(sym_proc_cache);
07555 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07556 }
07557
07558 id = SYM2ID(sym);
07559 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07560
07561 aryp = RARRAY_PTR(sym_proc_cache);
07562 if (aryp[index] == sym) {
07563 return aryp[index + 1];
07564 }
07565 else {
07566 proc = rb_proc_new(sym_call, (VALUE)id);
07567 aryp[index] = sym;
07568 aryp[index + 1] = proc;
07569 return proc;
07570 }
07571 }
07572
07573
07574
07575
07576
07577
07578
07579
07580
07581 static VALUE
07582 sym_succ(VALUE sym)
07583 {
07584 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07585 }
07586
07587
07588
07589
07590
07591
07592
07593
07594
07595 static VALUE
07596 sym_cmp(VALUE sym, VALUE other)
07597 {
07598 if (!SYMBOL_P(other)) {
07599 return Qnil;
07600 }
07601 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07602 }
07603
07604
07605
07606
07607
07608
07609
07610
07611
07612 static VALUE
07613 sym_casecmp(VALUE sym, VALUE other)
07614 {
07615 if (!SYMBOL_P(other)) {
07616 return Qnil;
07617 }
07618 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07619 }
07620
07621
07622
07623
07624
07625
07626
07627
07628 static VALUE
07629 sym_match(VALUE sym, VALUE other)
07630 {
07631 return rb_str_match(rb_sym_to_s(sym), other);
07632 }
07633
07634
07635
07636
07637
07638
07639
07640
07641
07642 static VALUE
07643 sym_aref(int argc, VALUE *argv, VALUE sym)
07644 {
07645 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07646 }
07647
07648
07649
07650
07651
07652
07653
07654
07655 static VALUE
07656 sym_length(VALUE sym)
07657 {
07658 return rb_str_length(rb_id2str(SYM2ID(sym)));
07659 }
07660
07661
07662
07663
07664
07665
07666
07667
07668 static VALUE
07669 sym_empty(VALUE sym)
07670 {
07671 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07672 }
07673
07674
07675
07676
07677
07678
07679
07680
07681 static VALUE
07682 sym_upcase(VALUE sym)
07683 {
07684 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07685 }
07686
07687
07688
07689
07690
07691
07692
07693
07694 static VALUE
07695 sym_downcase(VALUE sym)
07696 {
07697 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07698 }
07699
07700
07701
07702
07703
07704
07705
07706
07707 static VALUE
07708 sym_capitalize(VALUE sym)
07709 {
07710 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07711 }
07712
07713
07714
07715
07716
07717
07718
07719
07720 static VALUE
07721 sym_swapcase(VALUE sym)
07722 {
07723 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07724 }
07725
07726
07727
07728
07729
07730
07731
07732
07733 static VALUE
07734 sym_encoding(VALUE sym)
07735 {
07736 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07737 }
07738
07739 ID
07740 rb_to_id(VALUE name)
07741 {
07742 VALUE tmp;
07743
07744 switch (TYPE(name)) {
07745 default:
07746 tmp = rb_check_string_type(name);
07747 if (NIL_P(tmp)) {
07748 tmp = rb_inspect(name);
07749 rb_raise(rb_eTypeError, "%s is not a symbol",
07750 RSTRING_PTR(tmp));
07751 }
07752 name = tmp;
07753
07754 case T_STRING:
07755 name = rb_str_intern(name);
07756
07757 case T_SYMBOL:
07758 return SYM2ID(name);
07759 }
07760 return Qnil;
07761 }
07762
07763
07764
07765
07766
07767
07768
07769
07770
07771
07772
07773
07774
07775
07776 void
07777 Init_String(void)
07778 {
07779 #undef rb_intern
07780 #define rb_intern(str) rb_intern_const(str)
07781
07782 rb_cString = rb_define_class("String", rb_cObject);
07783 rb_include_module(rb_cString, rb_mComparable);
07784 rb_define_alloc_func(rb_cString, str_alloc);
07785 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07786 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07787 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07788 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07789 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07790 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07791 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07792 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07793 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07794 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07795 rb_define_method(rb_cString, "*", rb_str_times, 1);
07796 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07797 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07798 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07799 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07800 rb_define_method(rb_cString, "length", rb_str_length, 0);
07801 rb_define_method(rb_cString, "size", rb_str_length, 0);
07802 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07803 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07804 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07805 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07806 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07807 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07808 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07809 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07810 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07811 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07812 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07813 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07814 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07815 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07816 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07817 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07818 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07819
07820 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07821 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07822 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07823 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07824 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07825 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07826
07827 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07828 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07829 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07830 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07831
07832 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07833 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07834 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07835 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07836
07837 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07838 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07839 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07840 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07841 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07842 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07843 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07844 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07845 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07846 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07847 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07848 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07849 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07850 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07851 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07852 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07853
07854 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07855 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07856 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07857
07858 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07859
07860 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07861 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07862 rb_define_method(rb_cString, "center", rb_str_center, -1);
07863
07864 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07865 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07866 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07867 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07868 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07869 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07870 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07871
07872 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07873 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07874 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07875 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07876 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07877 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07878 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07879
07880 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07881 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07882 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07883 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07884 rb_define_method(rb_cString, "count", rb_str_count, -1);
07885
07886 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07887 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07888 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07889 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07890
07891 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07892 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07893 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07894 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07895
07896 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07897
07898 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07899 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07900
07901 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07902 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07903
07904 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07905 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07906 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07907 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07908
07909 id_to_s = rb_intern("to_s");
07910
07911 rb_fs = Qnil;
07912 rb_define_variable("$;", &rb_fs);
07913 rb_define_variable("$-F", &rb_fs);
07914
07915 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07916 rb_include_module(rb_cSymbol, rb_mComparable);
07917 rb_undef_alloc_func(rb_cSymbol);
07918 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07919 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07920
07921 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07922 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07923 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07924 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07925 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07926 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07927 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07928 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07929 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07930 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07931
07932 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07933 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07934 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07935
07936 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07937 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07938 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07939 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07940 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07941 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07942
07943 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07944 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07945 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07946 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07947
07948 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07949 }
07950