00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "transcode_data.h"
00016 #include <ctype.h>
00017
00018 #define ENABLE_ECONV_NEWLINE_OPTION 1
00019
00020
00021 VALUE rb_eUndefinedConversionError;
00022 VALUE rb_eInvalidByteSequenceError;
00023 VALUE rb_eConverterNotFoundError;
00024
00025 VALUE rb_cEncodingConverter;
00026
00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
00028 static VALUE sym_xml, sym_text, sym_attr;
00029 static VALUE sym_universal_newline;
00030 static VALUE sym_crlf_newline;
00031 static VALUE sym_cr_newline;
00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION
00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
00034 #endif
00035 static VALUE sym_partial_input;
00036
00037 static VALUE sym_invalid_byte_sequence;
00038 static VALUE sym_undefined_conversion;
00039 static VALUE sym_destination_buffer_full;
00040 static VALUE sym_source_buffer_empty;
00041 static VALUE sym_finished;
00042 static VALUE sym_after_output;
00043 static VALUE sym_incomplete_input;
00044
00045 static unsigned char *
00046 allocate_converted_string(const char *sname, const char *dname,
00047 const unsigned char *str, size_t len,
00048 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
00049 size_t *dst_len_ptr);
00050
00051
00052
00053 typedef struct rb_transcoding {
00054 const rb_transcoder *transcoder;
00055
00056 int flags;
00057
00058 int resume_position;
00059 unsigned int next_table;
00060 VALUE next_info;
00061 unsigned char next_byte;
00062 unsigned int output_index;
00063
00064 ssize_t recognized_len;
00065 ssize_t readagain_len;
00066 union {
00067 unsigned char ary[8];
00068 unsigned char *ptr;
00069 } readbuf;
00070
00071 ssize_t writebuf_off;
00072 ssize_t writebuf_len;
00073 union {
00074 unsigned char ary[8];
00075 unsigned char *ptr;
00076 } writebuf;
00077
00078 union rb_transcoding_state_t {
00079 void *ptr;
00080 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
00081 double dummy_for_alignment;
00082 } state;
00083 } rb_transcoding;
00084 #define TRANSCODING_READBUF(tc) \
00085 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
00086 (tc)->readbuf.ary : \
00087 (tc)->readbuf.ptr)
00088 #define TRANSCODING_WRITEBUF(tc) \
00089 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00090 (tc)->writebuf.ary : \
00091 (tc)->writebuf.ptr)
00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \
00093 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00094 sizeof((tc)->writebuf.ary) : \
00095 (size_t)(tc)->transcoder->max_output)
00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
00097 #define TRANSCODING_STATE(tc) \
00098 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
00099 (tc)->state.ary : \
00100 (tc)->state.ptr)
00101
00102 typedef struct {
00103 struct rb_transcoding *tc;
00104 unsigned char *out_buf_start;
00105 unsigned char *out_data_start;
00106 unsigned char *out_data_end;
00107 unsigned char *out_buf_end;
00108 rb_econv_result_t last_result;
00109 } rb_econv_elem_t;
00110
00111 struct rb_econv_t {
00112 int flags;
00113 const char *source_encoding_name;
00114 const char *destination_encoding_name;
00115
00116 int started;
00117
00118 const unsigned char *replacement_str;
00119 size_t replacement_len;
00120 const char *replacement_enc;
00121 int replacement_allocated;
00122
00123 unsigned char *in_buf_start;
00124 unsigned char *in_data_start;
00125 unsigned char *in_data_end;
00126 unsigned char *in_buf_end;
00127 rb_econv_elem_t *elems;
00128 int num_allocated;
00129 int num_trans;
00130 int num_finished;
00131 struct rb_transcoding *last_tc;
00132
00133
00134 struct {
00135 rb_econv_result_t result;
00136 struct rb_transcoding *error_tc;
00137 const char *source_encoding;
00138 const char *destination_encoding;
00139 const unsigned char *error_bytes_start;
00140 size_t error_bytes_len;
00141 size_t readagain_len;
00142 } last_error;
00143
00144
00145
00146 rb_encoding *source_encoding;
00147 rb_encoding *destination_encoding;
00148 };
00149
00150
00151
00152
00153
00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
00155
00156 typedef struct {
00157 const char *sname;
00158 const char *dname;
00159 const char *lib;
00160 const rb_transcoder *transcoder;
00161 } transcoder_entry_t;
00162
00163 static st_table *transcoder_table;
00164
00165 static transcoder_entry_t *
00166 make_transcoder_entry(const char *sname, const char *dname)
00167 {
00168 st_data_t val;
00169 st_table *table2;
00170
00171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00172 val = (st_data_t)st_init_strcasetable();
00173 st_add_direct(transcoder_table, (st_data_t)sname, val);
00174 }
00175 table2 = (st_table *)val;
00176 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00177 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00178 entry->sname = sname;
00179 entry->dname = dname;
00180 entry->lib = NULL;
00181 entry->transcoder = NULL;
00182 val = (st_data_t)entry;
00183 st_add_direct(table2, (st_data_t)dname, val);
00184 }
00185 return (transcoder_entry_t *)val;
00186 }
00187
00188 static transcoder_entry_t *
00189 get_transcoder_entry(const char *sname, const char *dname)
00190 {
00191 st_data_t val;
00192 st_table *table2;
00193
00194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00195 return NULL;
00196 }
00197 table2 = (st_table *)val;
00198 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00199 return NULL;
00200 }
00201 return (transcoder_entry_t *)val;
00202 }
00203
00204 void
00205 rb_register_transcoder(const rb_transcoder *tr)
00206 {
00207 const char *const sname = tr->src_encoding;
00208 const char *const dname = tr->dst_encoding;
00209
00210 transcoder_entry_t *entry;
00211
00212 entry = make_transcoder_entry(sname, dname);
00213 if (entry->transcoder) {
00214 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00215 sname, dname);
00216 }
00217
00218 entry->transcoder = tr;
00219 }
00220
00221 static void
00222 declare_transcoder(const char *sname, const char *dname, const char *lib)
00223 {
00224 transcoder_entry_t *entry;
00225
00226 entry = make_transcoder_entry(sname, dname);
00227 entry->lib = lib;
00228 }
00229
00230 #define MAX_TRANSCODER_LIBNAME_LEN 64
00231 static const char transcoder_lib_prefix[] = "enc/trans/";
00232
00233 void
00234 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00235 {
00236 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
00237 rb_raise(rb_eArgError, "invalid library name - %s",
00238 lib ? lib : "(null)");
00239 }
00240 declare_transcoder(enc1, enc2, lib);
00241 }
00242
00243 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
00244
00245 typedef struct search_path_queue_tag {
00246 struct search_path_queue_tag *next;
00247 const char *enc;
00248 } search_path_queue_t;
00249
00250 typedef struct {
00251 st_table *visited;
00252 search_path_queue_t *queue;
00253 search_path_queue_t **queue_last_ptr;
00254 const char *base_enc;
00255 } search_path_bfs_t;
00256
00257 static int
00258 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00259 {
00260 const char *dname = (const char *)key;
00261 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00262 search_path_queue_t *q;
00263
00264 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00265 return ST_CONTINUE;
00266 }
00267
00268 q = ALLOC(search_path_queue_t);
00269 q->enc = dname;
00270 q->next = NULL;
00271 *bfs->queue_last_ptr = q;
00272 bfs->queue_last_ptr = &q->next;
00273
00274 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00275 return ST_CONTINUE;
00276 }
00277
00278 static int
00279 transcode_search_path(const char *sname, const char *dname,
00280 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00281 void *arg)
00282 {
00283 search_path_bfs_t bfs;
00284 search_path_queue_t *q;
00285 st_data_t val;
00286 st_table *table2;
00287 int found;
00288 int pathlen = -1;
00289
00290 if (encoding_equal(sname, dname))
00291 return -1;
00292
00293 q = ALLOC(search_path_queue_t);
00294 q->enc = sname;
00295 q->next = NULL;
00296 bfs.queue_last_ptr = &q->next;
00297 bfs.queue = q;
00298
00299 bfs.visited = st_init_strcasetable();
00300 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00301
00302 while (bfs.queue) {
00303 q = bfs.queue;
00304 bfs.queue = q->next;
00305 if (!bfs.queue)
00306 bfs.queue_last_ptr = &bfs.queue;
00307
00308 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00309 xfree(q);
00310 continue;
00311 }
00312 table2 = (st_table *)val;
00313
00314 if (st_lookup(table2, (st_data_t)dname, &val)) {
00315 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00316 xfree(q);
00317 found = 1;
00318 goto cleanup;
00319 }
00320
00321 bfs.base_enc = q->enc;
00322 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00323 bfs.base_enc = NULL;
00324
00325 xfree(q);
00326 }
00327 found = 0;
00328
00329 cleanup:
00330 while (bfs.queue) {
00331 q = bfs.queue;
00332 bfs.queue = q->next;
00333 xfree(q);
00334 }
00335
00336 if (found) {
00337 const char *enc = dname;
00338 int depth;
00339 pathlen = 0;
00340 while (1) {
00341 st_lookup(bfs.visited, (st_data_t)enc, &val);
00342 if (!val)
00343 break;
00344 pathlen++;
00345 enc = (const char *)val;
00346 }
00347 depth = pathlen;
00348 enc = dname;
00349 while (1) {
00350 st_lookup(bfs.visited, (st_data_t)enc, &val);
00351 if (!val)
00352 break;
00353 callback((const char *)val, enc, --depth, arg);
00354 enc = (const char *)val;
00355 }
00356 }
00357
00358 st_free_table(bfs.visited);
00359
00360 return pathlen;
00361 }
00362
00363 static const rb_transcoder *
00364 load_transcoder_entry(transcoder_entry_t *entry)
00365 {
00366 if (entry->transcoder)
00367 return entry->transcoder;
00368
00369 if (entry->lib) {
00370 const char *lib = entry->lib;
00371 size_t len = strlen(lib);
00372 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
00373 VALUE fn;
00374 const int safe = rb_safe_level();
00375
00376 entry->lib = NULL;
00377
00378 if (len > MAX_TRANSCODER_LIBNAME_LEN)
00379 return NULL;
00380 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00381 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
00382 fn = rb_str_new2(path);
00383 FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED);
00384 OBJ_FREEZE(fn);
00385 if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
00386 return NULL;
00387 }
00388
00389 if (entry->transcoder)
00390 return entry->transcoder;
00391
00392 return NULL;
00393 }
00394
00395 static const char*
00396 get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
00397 {
00398 if (encoding_equal(encname, "UTF-8")) {
00399 *len_ret = 3;
00400 *repl_encname_ptr = "UTF-8";
00401 return "\xEF\xBF\xBD";
00402 }
00403 else {
00404 *len_ret = 1;
00405 *repl_encname_ptr = "US-ASCII";
00406 return "?";
00407 }
00408 }
00409
00410
00411
00412
00413
00414 static const unsigned char *
00415 transcode_char_start(rb_transcoding *tc,
00416 const unsigned char *in_start,
00417 const unsigned char *inchar_start,
00418 const unsigned char *in_p,
00419 size_t *char_len_ptr)
00420 {
00421 const unsigned char *ptr;
00422 if (inchar_start - in_start < tc->recognized_len) {
00423 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00424 inchar_start, unsigned char, in_p - inchar_start);
00425 ptr = TRANSCODING_READBUF(tc);
00426 }
00427 else {
00428 ptr = inchar_start - tc->recognized_len;
00429 }
00430 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00431 return ptr;
00432 }
00433
/*
 * Core resumable conversion loop for a single transcoder.
 *
 * Reads input from the position stored in in_pos up to in_stop and writes
 * output from the position stored in out_pos up to out_stop.  For every
 * character it walks the transcoder's lookup tables (tr->byte_array and
 * tr->word_array), starting at tr->conv_tree_start, and executes the
 * action found there.
 *
 * The function works like a coroutine.  SUSPEND(ret, num) records num in
 * tc->resume_position, saves the bytes of the current character read so
 * far into the read buffer, publishes in_p and out_p through in_pos and
 * out_pos, and returns ret.  The next call jumps to resume_label<num>
 * through the switch on tc->resume_position.  The locals in_p,
 * inchar_start and out_p are re-initialised from the arguments on every
 * entry; values that must survive a suspension live in *tc (see the
 * next_table, next_info, ... aliases below).
 *
 * opt bits tested here:
 *   ECONV_PARTIAL_INPUT - on running out of input, suspend with
 *                         econv_source_buffer_empty instead of finishing
 *                         (or reporting incomplete input).
 *   ECONV_AFTER_OUTPUT  - suspend with econv_after_output at the check
 *                         points once output has been produced in this call.
 */
00434 static rb_econv_result_t
00435 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
00436 const unsigned char *in_stop, unsigned char *out_stop,
00437 rb_transcoding *tc,
00438 const int opt)
00439 {
00440 const rb_transcoder *tr = tc->transcoder;
00441 int unitlen = tr->input_unit_length;
00442 ssize_t readagain_len = 0;
00443
00444 const unsigned char *inchar_start;
00445 const unsigned char *in_p;
00446
00447 unsigned char *out_p;
00448
00449 in_p = inchar_start = *in_pos;
00450
00451 out_p = *out_pos;
00452
/*
 * SUSPEND: save state and return ret; execution continues right after the
 * macro (at resume_label<num>) on the next call.  When readagain_len is
 * non-zero, that many trailing bytes are taken out of recognized_len and
 * recorded in tc->readagain_len.
 */
00453 #define SUSPEND(ret, num) \
00454 do { \
00455 tc->resume_position = (num); \
00456 if (0 < in_p - inchar_start) \
00457 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
00458 inchar_start, unsigned char, in_p - inchar_start); \
00459 *in_pos = in_p; \
00460 *out_pos = out_p; \
00461 tc->recognized_len += in_p - inchar_start; \
00462 if (readagain_len) { \
00463 tc->recognized_len -= readagain_len; \
00464 tc->readagain_len = readagain_len; \
00465 } \
00466 return (ret); \
00467 resume_label ## num:; \
00468 } while (0)
/* SUSPEND_OBUF: suspend (repeatedly) until at least one output byte is free. */
00469 #define SUSPEND_OBUF(num) \
00470 do { \
00471 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
00472 } while (0)
00473
/* SUSPEND_AFTER_OUTPUT: suspend if requested by opt and output was written in this call. */
00474 #define SUSPEND_AFTER_OUTPUT(num) \
00475 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
00476 SUSPEND(econv_after_output, num); \
00477 }
00478
/* Aliases for the fields of *tc that carry state across suspensions. */
00479 #define next_table (tc->next_table)
00480 #define next_info (tc->next_info)
00481 #define next_byte (tc->next_byte)
00482 #define writebuf_len (tc->writebuf_len)
00483 #define writebuf_off (tc->writebuf_off)
00484
/* Resume dispatch: 0 means a fresh start, any other value re-enters at the matching SUSPEND. */
00485 switch (tc->resume_position) {
00486 case 0: break;
00487 case 1: goto resume_label1;
00488 case 2: goto resume_label2;
00489 case 3: goto resume_label3;
00490 case 4: goto resume_label4;
00491 case 5: goto resume_label5;
00492 case 6: goto resume_label6;
00493 case 7: goto resume_label7;
00494 case 8: goto resume_label8;
00495 case 9: goto resume_label9;
00496 case 10: goto resume_label10;
00497 case 11: goto resume_label11;
00498 case 12: goto resume_label12;
00499 case 13: goto resume_label13;
00500 case 14: goto resume_label14;
00501 case 15: goto resume_label15;
00502 case 16: goto resume_label16;
00503 case 17: goto resume_label17;
00504 case 18: goto resume_label18;
00505 case 19: goto resume_label19;
00506 case 20: goto resume_label20;
00507 case 21: goto resume_label21;
00508 case 22: goto resume_label22;
00509 case 23: goto resume_label23;
00510 case 24: goto resume_label24;
00511 case 25: goto resume_label25;
00512 case 26: goto resume_label26;
00513 case 27: goto resume_label27;
00514 case 28: goto resume_label28;
00515 case 29: goto resume_label29;
00516 case 30: goto resume_label30;
00517 case 31: goto resume_label31;
00518 case 32: goto resume_label32;
00519 case 33: goto resume_label33;
00520 case 34: goto resume_label34;
00521 }
00522
/* Main loop: one iteration per input character. */
00523 while (1) {
00524 inchar_start = in_p;
00525 tc->recognized_len = 0;
00526 next_table = tr->conv_tree_start;
00527
00528 SUSPEND_AFTER_OUTPUT(24);
00529
/* Input exhausted at a character boundary: finish, or wait for more with PARTIAL_INPUT. */
00530 if (in_stop <= in_p) {
00531 if (!(opt & ECONV_PARTIAL_INPUT))
00532 break;
00533 SUSPEND(econv_source_buffer_empty, 7);
00534 continue;
00535 }
00536
/* Accessors for the lookup table selected by next_table. */
00537 #define BYTE_ADDR(index) (tr->byte_array + (index))
00538 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
00539 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
00540 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
00541 #define BL_MIN_BYTE (BL_BASE[0])
00542 #define BL_MAX_BYTE (BL_BASE[1])
00543 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
00544 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
00545
00546 next_byte = (unsigned char)*in_p++;
/* follow_byte: look next_byte up in the current table; bytes outside [min, max] are INVALID. */
00547 follow_byte:
00548 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
00549 next_info = INVALID;
00550 else {
00551 next_info = (VALUE)BL_ACTION(next_byte);
00552 }
/* follow_info: interpret the action encoded in the low five bits of next_info. */
00553 follow_info:
00554 switch (next_info & 0x1F) {
/* NOMAP: copy the input bytes of this character to the output unchanged, via the write buffer. */
00555 case NOMAP:
00556 {
00557 const unsigned char *p = inchar_start;
00558 writebuf_off = 0;
00559 while (p < in_p) {
00560 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
00561 }
00562 writebuf_len = writebuf_off;
00563 writebuf_off = 0;
00564 while (writebuf_off < writebuf_len) {
00565 SUSPEND_OBUF(3);
00566 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00567 }
00568 }
00569 continue;
/* Multiples of 4: next_info names the next table; read another byte and look it up there. */
00570 case 0x00: case 0x04: case 0x08: case 0x0C:
00571 case 0x10: case 0x14: case 0x18: case 0x1C:
00572 SUSPEND_AFTER_OUTPUT(25);
00573 while (in_p >= in_stop) {
00574 if (!(opt & ECONV_PARTIAL_INPUT))
00575 goto incomplete;
00576 SUSPEND(econv_source_buffer_empty, 5);
00577 }
00578 next_byte = (unsigned char)*in_p++;
00579 next_table = (unsigned int)next_info;
00580 goto follow_byte;
/* ZERObt: nothing is written for this character. */
00581 case ZERObt:
00582 continue;
/* ONEbt..GB4bt: write the bytes extracted from next_info, one at a time. */
00583 case ONEbt:
00584 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
00585 continue;
00586 case TWObt:
00587 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
00588 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
00589 continue;
00590 case THREEbt:
00591 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
00592 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
00593 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
00594 continue;
00595 case FOURbt:
00596 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
00597 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
00598 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
00599 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
00600 continue;
00601 case GB4bt:
00602 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
00603 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
00604 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
00605 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
00606 continue;
/* STR1: write a length-prefixed byte string stored in tr->byte_array; output_index survives suspension. */
00607 case STR1:
00608 tc->output_index = 0;
00609 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
00610 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
00611 tc->output_index++;
00612 }
00613 continue;
/* FUNii / FUNsi: a transcoder callback yields a new next_info, which is interpreted again. */
00614 case FUNii:
00615 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
00616 goto follow_info;
00617 case FUNsi:
00618 {
00619 const unsigned char *char_start;
00620 size_t char_len;
00621 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00622 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
00623 goto follow_info;
00624 }
/*
 * FUNio / FUNso / FUNsio: a transcoder callback produces the output.  It
 * writes straight into the output buffer when at least tr->max_output bytes
 * are free there; otherwise it writes into the write buffer, which is then
 * drained byte by byte.
 */
00625 case FUNio:
00626 SUSPEND_OBUF(13);
00627 if (tr->max_output <= out_stop - out_p)
00628 out_p += tr->func_io(TRANSCODING_STATE(tc),
00629 next_info, out_p, out_stop - out_p);
00630 else {
00631 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
00632 next_info,
00633 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00634 writebuf_off = 0;
00635 while (writebuf_off < writebuf_len) {
00636 SUSPEND_OBUF(20);
00637 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00638 }
00639 }
00640 break;
00641 case FUNso:
00642 {
00643 const unsigned char *char_start;
00644 size_t char_len;
00645 SUSPEND_OBUF(14);
00646 if (tr->max_output <= out_stop - out_p) {
00647 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00648 out_p += tr->func_so(TRANSCODING_STATE(tc),
00649 char_start, (size_t)char_len,
00650 out_p, out_stop - out_p);
00651 }
00652 else {
00653 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00654 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
00655 char_start, (size_t)char_len,
00656 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00657 writebuf_off = 0;
00658 while (writebuf_off < writebuf_len) {
00659 SUSPEND_OBUF(22);
00660 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00661 }
00662 }
00663 break;
00664 }
00665 case FUNsio:
00666 {
00667 const unsigned char *char_start;
00668 size_t char_len;
00669 SUSPEND_OBUF(33);
00670 if (tr->max_output <= out_stop - out_p) {
00671 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00672 out_p += tr->func_sio(TRANSCODING_STATE(tc),
00673 char_start, (size_t)char_len, next_info,
00674 out_p, out_stop - out_p);
00675 }
00676 else {
00677 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00678 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
00679 char_start, (size_t)char_len, next_info,
00680 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00681 writebuf_off = 0;
00682 while (writebuf_off < writebuf_len) {
00683 SUSPEND_OBUF(34);
00684 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00685 }
00686 }
00687 break;
00688 }
/*
 * INVALID: decide how many bytes belong to the error.
 * - At most one input unit (unitlen bytes) read so far: extend the error
 *   to a whole unit, or to in_stop when less input is available (waiting
 *   for more input first when PARTIAL_INPUT is set).
 * - More than one unit read: the error covers the largest multiple of
 *   unitlen that is smaller than the bytes read; the remaining
 *   readagain_len bytes are handed back to be read again.
 */
00689 case INVALID:
00690 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
00691 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
00692 SUSPEND_AFTER_OUTPUT(26);
00693 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
00694 in_p = in_stop;
00695 SUSPEND(econv_source_buffer_empty, 8);
00696 }
00697 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
00698 in_p = in_stop;
00699 }
00700 else {
00701 in_p = inchar_start + (unitlen - tc->recognized_len);
00702 }
00703 }
00704 else {
00705 ssize_t invalid_len;
00706 ssize_t discard_len;
00707 invalid_len = tc->recognized_len + (in_p - inchar_start);
00708 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
00709 readagain_len = invalid_len - discard_len;
00710 }
00711 goto invalid;
00712 case UNDEF:
00713 goto undef;
00714 default:
00715 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
00716 }
/* Reached through the break statements of the FUNio/FUNso/FUNsio cases. */
00717 continue;
00718
/* Error exits: report the condition, then carry on with the next character when resumed. */
00719 invalid:
00720 SUSPEND(econv_invalid_byte_sequence, 1);
00721 continue;
00722
00723 incomplete:
00724 SUSPEND(econv_incomplete_input, 27);
00725 continue;
00726
00727 undef:
00728 SUSPEND(econv_undefined_conversion, 2);
00729 continue;
00730 }
00731
00732
/* End of input: let the transcoder's finish_func emit its trailing output, if it has one. */
00733 if (tr->finish_func) {
00734 SUSPEND_OBUF(4);
00735 if (tr->max_output <= out_stop - out_p) {
00736 out_p += tr->finish_func(TRANSCODING_STATE(tc),
00737 out_p, out_stop - out_p);
00738 }
00739 else {
00740 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
00741 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00742 writebuf_off = 0;
00743 while (writebuf_off < writebuf_len) {
00744 SUSPEND_OBUF(23);
00745 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00746 }
00747 }
00748 }
/* From here on every call returns econv_finished again. */
00749 while (1)
00750 SUSPEND(econv_finished, 6);
/* Only SUSPEND and the tc field aliases are undefined here; the other helper macros stay defined. */
00751 #undef SUSPEND
00752 #undef next_table
00753 #undef next_info
00754 #undef next_byte
00755 #undef writebuf_len
00756 #undef writebuf_off
00757 }
00758
00759 static rb_econv_result_t
00760 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00761 const unsigned char *in_stop, unsigned char *out_stop,
00762 rb_transcoding *tc,
00763 const int opt)
00764 {
00765 if (tc->readagain_len) {
00766 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00767 const unsigned char *readagain_pos = readagain_buf;
00768 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00769 rb_econv_result_t res;
00770
00771 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00772 unsigned char, tc->readagain_len);
00773 tc->readagain_len = 0;
00774 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00775 if (res != econv_source_buffer_empty) {
00776 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00777 readagain_pos, unsigned char, readagain_stop - readagain_pos);
00778 tc->readagain_len += readagain_stop - readagain_pos;
00779 return res;
00780 }
00781 }
00782 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00783 }
00784
00785 static rb_transcoding *
00786 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00787 {
00788 rb_transcoding *tc;
00789
00790 tc = ALLOC(rb_transcoding);
00791 tc->transcoder = tr;
00792 tc->flags = flags;
00793 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00794 tc->state.ptr = xmalloc(tr->state_size);
00795 if (tr->state_init_func) {
00796 (tr->state_init_func)(TRANSCODING_STATE(tc));
00797 }
00798 tc->resume_position = 0;
00799 tc->recognized_len = 0;
00800 tc->readagain_len = 0;
00801 tc->writebuf_len = 0;
00802 tc->writebuf_off = 0;
00803 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00804 tc->readbuf.ptr = xmalloc(tr->max_input);
00805 }
00806 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00807 tc->writebuf.ptr = xmalloc(tr->max_output);
00808 }
00809 return tc;
00810 }
00811
00812 static rb_econv_result_t
00813 rb_transcoding_convert(rb_transcoding *tc,
00814 const unsigned char **input_ptr, const unsigned char *input_stop,
00815 unsigned char **output_ptr, unsigned char *output_stop,
00816 int flags)
00817 {
00818 return transcode_restartable(
00819 input_ptr, output_ptr,
00820 input_stop, output_stop,
00821 tc, flags);
00822 }
00823
00824 static void
00825 rb_transcoding_close(rb_transcoding *tc)
00826 {
00827 const rb_transcoder *tr = tc->transcoder;
00828 if (tr->state_fini_func) {
00829 (tr->state_fini_func)(TRANSCODING_STATE(tc));
00830 }
00831 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00832 xfree(tc->state.ptr);
00833 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00834 xfree(tc->readbuf.ptr);
00835 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00836 xfree(tc->writebuf.ptr);
00837 xfree(tc);
00838 }
00839
00840 static size_t
00841 rb_transcoding_memsize(rb_transcoding *tc)
00842 {
00843 size_t size = sizeof(rb_transcoding);
00844 const rb_transcoder *tr = tc->transcoder;
00845
00846 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00847 size += tr->state_size;
00848 }
00849 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00850 size += tr->max_input;
00851 }
00852 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00853 size += tr->max_output;
00854 }
00855 return size;
00856 }
00857
00858 static rb_econv_t *
00859 rb_econv_alloc(int n_hint)
00860 {
00861 rb_econv_t *ec;
00862
00863 if (n_hint <= 0)
00864 n_hint = 1;
00865
00866 ec = ALLOC(rb_econv_t);
00867 ec->flags = 0;
00868 ec->source_encoding_name = NULL;
00869 ec->destination_encoding_name = NULL;
00870 ec->started = 0;
00871 ec->replacement_str = NULL;
00872 ec->replacement_len = 0;
00873 ec->replacement_enc = NULL;
00874 ec->replacement_allocated = 0;
00875 ec->in_buf_start = NULL;
00876 ec->in_data_start = NULL;
00877 ec->in_data_end = NULL;
00878 ec->in_buf_end = NULL;
00879 ec->num_allocated = n_hint;
00880 ec->num_trans = 0;
00881 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00882 ec->num_finished = 0;
00883 ec->last_tc = NULL;
00884 ec->last_error.result = econv_source_buffer_empty;
00885 ec->last_error.error_tc = NULL;
00886 ec->last_error.source_encoding = NULL;
00887 ec->last_error.destination_encoding = NULL;
00888 ec->last_error.error_bytes_start = NULL;
00889 ec->last_error.error_bytes_len = 0;
00890 ec->last_error.readagain_len = 0;
00891 ec->source_encoding = NULL;
00892 ec->destination_encoding = NULL;
00893 return ec;
00894 }
00895
00896 static int
00897 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00898 {
00899 int n, j;
00900 int bufsize = 4096;
00901 unsigned char *p;
00902
00903 if (ec->num_trans == ec->num_allocated) {
00904 n = ec->num_allocated * 2;
00905 REALLOC_N(ec->elems, rb_econv_elem_t, n);
00906 ec->num_allocated = n;
00907 }
00908
00909 p = xmalloc(bufsize);
00910
00911 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00912
00913 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00914 ec->elems[i].out_buf_start = p;
00915 ec->elems[i].out_buf_end = p + bufsize;
00916 ec->elems[i].out_data_start = p;
00917 ec->elems[i].out_data_end = p;
00918 ec->elems[i].last_result = econv_source_buffer_empty;
00919
00920 ec->num_trans++;
00921
00922 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00923 for (j = ec->num_trans-1; i <= j; j--) {
00924 rb_transcoding *tc = ec->elems[j].tc;
00925 const rb_transcoder *tr2 = tc->transcoder;
00926 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00927 ec->last_tc = tc;
00928 break;
00929 }
00930 }
00931
00932 return 0;
00933 }
00934
00935 static rb_econv_t *
00936 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00937 {
00938 rb_econv_t *ec;
00939 int i, ret;
00940
00941 for (i = 0; i < n; i++) {
00942 const rb_transcoder *tr;
00943 tr = load_transcoder_entry(entries[i]);
00944 if (!tr)
00945 return NULL;
00946 }
00947
00948 ec = rb_econv_alloc(n);
00949
00950 for (i = 0; i < n; i++) {
00951 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00952 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00953 if (ret == -1) {
00954 rb_econv_close(ec);
00955 return NULL;
00956 }
00957 }
00958
00959 return ec;
00960 }
00961
00962 struct trans_open_t {
00963 transcoder_entry_t **entries;
00964 int num_additional;
00965 };
00966
00967 static void
00968 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00969 {
00970 struct trans_open_t *toarg = arg;
00971
00972 if (!toarg->entries) {
00973 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00974 }
00975 toarg->entries[depth] = get_transcoder_entry(sname, dname);
00976 }
00977
00978 static rb_econv_t *
00979 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00980 {
00981 transcoder_entry_t **entries = NULL;
00982 int num_trans;
00983 rb_econv_t *ec;
00984
00985 rb_encoding *senc, *denc;
00986 int sidx, didx;
00987
00988 senc = NULL;
00989 if (*sname) {
00990 sidx = rb_enc_find_index(sname);
00991 if (0 <= sidx) {
00992 senc = rb_enc_from_index(sidx);
00993 }
00994 }
00995
00996 denc = NULL;
00997 if (*dname) {
00998 didx = rb_enc_find_index(dname);
00999 if (0 <= didx) {
01000 denc = rb_enc_from_index(didx);
01001 }
01002 }
01003
01004 if (*sname == '\0' && *dname == '\0') {
01005 num_trans = 0;
01006 entries = NULL;
01007 }
01008 else {
01009 struct trans_open_t toarg;
01010 toarg.entries = NULL;
01011 toarg.num_additional = 0;
01012 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01013 entries = toarg.entries;
01014 if (num_trans < 0) {
01015 xfree(entries);
01016 return NULL;
01017 }
01018 }
01019
01020 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01021 xfree(entries);
01022 if (!ec)
01023 return NULL;
01024
01025 ec->flags = ecflags;
01026 ec->source_encoding_name = sname;
01027 ec->destination_encoding_name = dname;
01028
01029 return ec;
01030 }
01031
01032 #define MAX_ECFLAGS_DECORATORS 32
01033
01034 static int
01035 decorator_names(int ecflags, const char **decorators_ret)
01036 {
01037 int num_decorators;
01038
01039 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
01040 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01041 case ECONV_CRLF_NEWLINE_DECORATOR:
01042 case ECONV_CR_NEWLINE_DECORATOR:
01043 case 0:
01044 break;
01045 default:
01046 return -1;
01047 }
01048
01049 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01050 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01051 return -1;
01052
01053 num_decorators = 0;
01054
01055 if (ecflags & ECONV_XML_TEXT_DECORATOR)
01056 decorators_ret[num_decorators++] = "xml_text_escape";
01057 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01058 decorators_ret[num_decorators++] = "xml_attr_content_escape";
01059 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01060 decorators_ret[num_decorators++] = "xml_attr_quote";
01061
01062 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01063 decorators_ret[num_decorators++] = "crlf_newline";
01064 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01065 decorators_ret[num_decorators++] = "cr_newline";
01066 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01067 decorators_ret[num_decorators++] = "universal_newline";
01068
01069 return num_decorators;
01070 }
01071
01072 rb_econv_t *
01073 rb_econv_open(const char *sname, const char *dname, int ecflags)
01074 {
01075 rb_econv_t *ec;
01076 int num_decorators;
01077 const char *decorators[MAX_ECFLAGS_DECORATORS];
01078 int i;
01079
01080 num_decorators = decorator_names(ecflags, decorators);
01081 if (num_decorators == -1)
01082 return NULL;
01083
01084 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01085 if (!ec)
01086 return NULL;
01087
01088 for (i = 0; i < num_decorators; i++)
01089 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01090 rb_econv_close(ec);
01091 return NULL;
01092 }
01093
01094 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01095
01096 return ec;
01097 }
01098
01099 static int
01100 trans_sweep(rb_econv_t *ec,
01101 const unsigned char **input_ptr, const unsigned char *input_stop,
01102 unsigned char **output_ptr, unsigned char *output_stop,
01103 int flags,
01104 int start)
01105 {
01106 int try;
01107 int i, f;
01108
01109 const unsigned char **ipp, *is, *iold;
01110 unsigned char **opp, *os, *oold;
01111 rb_econv_result_t res;
01112
01113 try = 1;
01114 while (try) {
01115 try = 0;
01116 for (i = start; i < ec->num_trans; i++) {
01117 rb_econv_elem_t *te = &ec->elems[i];
01118
01119 if (i == 0) {
01120 ipp = input_ptr;
01121 is = input_stop;
01122 }
01123 else {
01124 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01125 ipp = (const unsigned char **)&prev_te->out_data_start;
01126 is = prev_te->out_data_end;
01127 }
01128
01129 if (i == ec->num_trans-1) {
01130 opp = output_ptr;
01131 os = output_stop;
01132 }
01133 else {
01134 if (te->out_buf_start != te->out_data_start) {
01135 ssize_t len = te->out_data_end - te->out_data_start;
01136 ssize_t off = te->out_data_start - te->out_buf_start;
01137 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01138 te->out_data_start = te->out_buf_start;
01139 te->out_data_end -= off;
01140 }
01141 opp = &te->out_data_end;
01142 os = te->out_buf_end;
01143 }
01144
01145 f = flags;
01146 if (ec->num_finished != i)
01147 f |= ECONV_PARTIAL_INPUT;
01148 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01149 start = 1;
01150 flags &= ~ECONV_AFTER_OUTPUT;
01151 }
01152 if (i != 0)
01153 f &= ~ECONV_AFTER_OUTPUT;
01154 iold = *ipp;
01155 oold = *opp;
01156 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
01157 if (iold != *ipp || oold != *opp)
01158 try = 1;
01159
01160 switch (res) {
01161 case econv_invalid_byte_sequence:
01162 case econv_incomplete_input:
01163 case econv_undefined_conversion:
01164 case econv_after_output:
01165 return i;
01166
01167 case econv_destination_buffer_full:
01168 case econv_source_buffer_empty:
01169 break;
01170
01171 case econv_finished:
01172 ec->num_finished = i+1;
01173 break;
01174 }
01175 }
01176 }
01177 return -1;
01178 }
01179
01180 static rb_econv_result_t
01181 rb_trans_conv(rb_econv_t *ec,
01182 const unsigned char **input_ptr, const unsigned char *input_stop,
01183 unsigned char **output_ptr, unsigned char *output_stop,
01184 int flags,
01185 int *result_position_ptr)
01186 {
01187 int i;
01188 int needreport_index;
01189 int sweep_start;
01190
01191 unsigned char empty_buf;
01192 unsigned char *empty_ptr = &empty_buf;
01193
01194 if (!input_ptr) {
01195 input_ptr = (const unsigned char **)&empty_ptr;
01196 input_stop = empty_ptr;
01197 }
01198
01199 if (!output_ptr) {
01200 output_ptr = &empty_ptr;
01201 output_stop = empty_ptr;
01202 }
01203
01204 if (ec->elems[0].last_result == econv_after_output)
01205 ec->elems[0].last_result = econv_source_buffer_empty;
01206
01207 needreport_index = -1;
01208 for (i = ec->num_trans-1; 0 <= i; i--) {
01209 switch (ec->elems[i].last_result) {
01210 case econv_invalid_byte_sequence:
01211 case econv_incomplete_input:
01212 case econv_undefined_conversion:
01213 case econv_after_output:
01214 case econv_finished:
01215 sweep_start = i+1;
01216 needreport_index = i;
01217 goto found_needreport;
01218
01219 case econv_destination_buffer_full:
01220 case econv_source_buffer_empty:
01221 break;
01222
01223 default:
01224 rb_bug("unexpected transcode last result");
01225 }
01226 }
01227
01228
01229
01230 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01231 (flags & ECONV_AFTER_OUTPUT)) {
01232 rb_econv_result_t res;
01233
01234 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01235 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01236 result_position_ptr);
01237
01238 if (res == econv_source_buffer_empty)
01239 return econv_after_output;
01240 return res;
01241 }
01242
01243 sweep_start = 0;
01244
01245 found_needreport:
01246
01247 do {
01248 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01249 sweep_start = needreport_index + 1;
01250 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01251
01252 for (i = ec->num_trans-1; 0 <= i; i--) {
01253 if (ec->elems[i].last_result != econv_source_buffer_empty) {
01254 rb_econv_result_t res = ec->elems[i].last_result;
01255 if (res == econv_invalid_byte_sequence ||
01256 res == econv_incomplete_input ||
01257 res == econv_undefined_conversion ||
01258 res == econv_after_output) {
01259 ec->elems[i].last_result = econv_source_buffer_empty;
01260 }
01261 if (result_position_ptr)
01262 *result_position_ptr = i;
01263 return res;
01264 }
01265 }
01266 if (result_position_ptr)
01267 *result_position_ptr = -1;
01268 return econv_source_buffer_empty;
01269 }
01270
01271 static rb_econv_result_t
01272 rb_econv_convert0(rb_econv_t *ec,
01273 const unsigned char **input_ptr, const unsigned char *input_stop,
01274 unsigned char **output_ptr, unsigned char *output_stop,
01275 int flags)
01276 {
01277 rb_econv_result_t res;
01278 int result_position;
01279 int has_output = 0;
01280
01281 memset(&ec->last_error, 0, sizeof(ec->last_error));
01282
01283 if (ec->num_trans == 0) {
01284 size_t len;
01285 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01286 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
01287 len = output_stop - *output_ptr;
01288 memcpy(*output_ptr, ec->in_data_start, len);
01289 *output_ptr = output_stop;
01290 ec->in_data_start += len;
01291 res = econv_destination_buffer_full;
01292 goto gotresult;
01293 }
01294 len = ec->in_data_end - ec->in_data_start;
01295 memcpy(*output_ptr, ec->in_data_start, len);
01296 *output_ptr += len;
01297 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01298 if (flags & ECONV_AFTER_OUTPUT) {
01299 res = econv_after_output;
01300 goto gotresult;
01301 }
01302 }
01303 if (output_stop - *output_ptr < input_stop - *input_ptr) {
01304 len = output_stop - *output_ptr;
01305 }
01306 else {
01307 len = input_stop - *input_ptr;
01308 }
01309 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01310 *(*output_ptr)++ = *(*input_ptr)++;
01311 res = econv_after_output;
01312 goto gotresult;
01313 }
01314 memcpy(*output_ptr, *input_ptr, len);
01315 *output_ptr += len;
01316 *input_ptr += len;
01317 if (*input_ptr != input_stop)
01318 res = econv_destination_buffer_full;
01319 else if (flags & ECONV_PARTIAL_INPUT)
01320 res = econv_source_buffer_empty;
01321 else
01322 res = econv_finished;
01323 goto gotresult;
01324 }
01325
01326 if (ec->elems[ec->num_trans-1].out_data_start) {
01327 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01328 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01329 if (data_start != data_end) {
01330 size_t len;
01331 if (output_stop - *output_ptr < data_end - data_start) {
01332 len = output_stop - *output_ptr;
01333 memcpy(*output_ptr, data_start, len);
01334 *output_ptr = output_stop;
01335 ec->elems[ec->num_trans-1].out_data_start += len;
01336 res = econv_destination_buffer_full;
01337 goto gotresult;
01338 }
01339 len = data_end - data_start;
01340 memcpy(*output_ptr, data_start, len);
01341 *output_ptr += len;
01342 ec->elems[ec->num_trans-1].out_data_start =
01343 ec->elems[ec->num_trans-1].out_data_end =
01344 ec->elems[ec->num_trans-1].out_buf_start;
01345 has_output = 1;
01346 }
01347 }
01348
01349 if (ec->in_buf_start &&
01350 ec->in_data_start != ec->in_data_end) {
01351 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01352 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01353 if (res != econv_source_buffer_empty)
01354 goto gotresult;
01355 }
01356
01357 if (has_output &&
01358 (flags & ECONV_AFTER_OUTPUT) &&
01359 *input_ptr != input_stop) {
01360 input_stop = *input_ptr;
01361 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01362 if (res == econv_source_buffer_empty)
01363 res = econv_after_output;
01364 }
01365 else if ((flags & ECONV_AFTER_OUTPUT) ||
01366 ec->num_trans == 1) {
01367 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01368 }
01369 else {
01370 flags |= ECONV_AFTER_OUTPUT;
01371 do {
01372 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01373 } while (res == econv_after_output);
01374 }
01375
01376 gotresult:
01377 ec->last_error.result = res;
01378 if (res == econv_invalid_byte_sequence ||
01379 res == econv_incomplete_input ||
01380 res == econv_undefined_conversion) {
01381 rb_transcoding *error_tc = ec->elems[result_position].tc;
01382 ec->last_error.error_tc = error_tc;
01383 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01384 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01385 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01386 ec->last_error.error_bytes_len = error_tc->recognized_len;
01387 ec->last_error.readagain_len = error_tc->readagain_len;
01388 }
01389
01390 return res;
01391 }
01392
01393 static int output_replacement_character(rb_econv_t *ec);
01394
01395 static int
01396 output_hex_charref(rb_econv_t *ec)
01397 {
01398 int ret;
01399 unsigned char utfbuf[1024];
01400 const unsigned char *utf;
01401 size_t utf_len;
01402 int utf_allocated = 0;
01403 char charef_buf[16];
01404 const unsigned char *p;
01405
01406 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01407 utf = ec->last_error.error_bytes_start;
01408 utf_len = ec->last_error.error_bytes_len;
01409 }
01410 else {
01411 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01412 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01413 utfbuf, sizeof(utfbuf),
01414 &utf_len);
01415 if (!utf)
01416 return -1;
01417 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01418 utf_allocated = 1;
01419 }
01420
01421 if (utf_len % 4 != 0)
01422 goto fail;
01423
01424 p = utf;
01425 while (4 <= utf_len) {
01426 unsigned int u = 0;
01427 u += p[0] << 24;
01428 u += p[1] << 16;
01429 u += p[2] << 8;
01430 u += p[3];
01431 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01432
01433 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01434 if (ret == -1)
01435 goto fail;
01436
01437 p += 4;
01438 utf_len -= 4;
01439 }
01440
01441 if (utf_allocated)
01442 xfree((void *)utf);
01443 return 0;
01444
01445 fail:
01446 if (utf_allocated)
01447 xfree((void *)utf);
01448 return -1;
01449 }
01450
01451 rb_econv_result_t
01452 rb_econv_convert(rb_econv_t *ec,
01453 const unsigned char **input_ptr, const unsigned char *input_stop,
01454 unsigned char **output_ptr, unsigned char *output_stop,
01455 int flags)
01456 {
01457 rb_econv_result_t ret;
01458
01459 unsigned char empty_buf;
01460 unsigned char *empty_ptr = &empty_buf;
01461
01462 ec->started = 1;
01463
01464 if (!input_ptr) {
01465 input_ptr = (const unsigned char **)&empty_ptr;
01466 input_stop = empty_ptr;
01467 }
01468
01469 if (!output_ptr) {
01470 output_ptr = &empty_ptr;
01471 output_stop = empty_ptr;
01472 }
01473
01474 resume:
01475 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01476
01477 if (ret == econv_invalid_byte_sequence ||
01478 ret == econv_incomplete_input) {
01479
01480
01481 switch (ec->flags & ECONV_INVALID_MASK) {
01482 case ECONV_INVALID_REPLACE:
01483 if (output_replacement_character(ec) == 0)
01484 goto resume;
01485 }
01486 }
01487
01488 if (ret == econv_undefined_conversion) {
01489
01490
01491
01492 switch (ec->flags & ECONV_UNDEF_MASK) {
01493 case ECONV_UNDEF_REPLACE:
01494 if (output_replacement_character(ec) == 0)
01495 goto resume;
01496 break;
01497
01498 case ECONV_UNDEF_HEX_CHARREF:
01499 if (output_hex_charref(ec) == 0)
01500 goto resume;
01501 break;
01502 }
01503 }
01504
01505 return ret;
01506 }
01507
01508 const char *
01509 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01510 {
01511 rb_transcoding *tc = ec->last_tc;
01512 const rb_transcoder *tr;
01513
01514 if (tc == NULL)
01515 return "";
01516
01517 tr = tc->transcoder;
01518
01519 if (tr->asciicompat_type == asciicompat_encoder)
01520 return tr->src_encoding;
01521 return tr->dst_encoding;
01522 }
01523
01524 static unsigned char *
01525 allocate_converted_string(const char *sname, const char *dname,
01526 const unsigned char *str, size_t len,
01527 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01528 size_t *dst_len_ptr)
01529 {
01530 unsigned char *dst_str;
01531 size_t dst_len;
01532 size_t dst_bufsize;
01533
01534 rb_econv_t *ec;
01535 rb_econv_result_t res;
01536
01537 const unsigned char *sp;
01538 unsigned char *dp;
01539
01540 if (caller_dst_buf)
01541 dst_bufsize = caller_dst_bufsize;
01542 else if (len == 0)
01543 dst_bufsize = 1;
01544 else
01545 dst_bufsize = len;
01546
01547 ec = rb_econv_open(sname, dname, 0);
01548 if (ec == NULL)
01549 return NULL;
01550 if (caller_dst_buf)
01551 dst_str = caller_dst_buf;
01552 else
01553 dst_str = xmalloc(dst_bufsize);
01554 dst_len = 0;
01555 sp = str;
01556 dp = dst_str+dst_len;
01557 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01558 dst_len = dp - dst_str;
01559 while (res == econv_destination_buffer_full) {
01560 if (SIZE_MAX/2 < dst_bufsize) {
01561 goto fail;
01562 }
01563 dst_bufsize *= 2;
01564 if (dst_str == caller_dst_buf) {
01565 unsigned char *tmp;
01566 tmp = xmalloc(dst_bufsize);
01567 memcpy(tmp, dst_str, dst_bufsize/2);
01568 dst_str = tmp;
01569 }
01570 else {
01571 dst_str = xrealloc(dst_str, dst_bufsize);
01572 }
01573 dp = dst_str+dst_len;
01574 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01575 dst_len = dp - dst_str;
01576 }
01577 if (res != econv_finished) {
01578 goto fail;
01579 }
01580 rb_econv_close(ec);
01581 *dst_len_ptr = dst_len;
01582 return dst_str;
01583
01584 fail:
01585 if (dst_str != caller_dst_buf)
01586 xfree(dst_str);
01587 rb_econv_close(ec);
01588 return NULL;
01589 }
01590
01591
01592 int
01593 rb_econv_insert_output(rb_econv_t *ec,
01594 const unsigned char *str, size_t len, const char *str_encoding)
01595 {
01596 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01597 unsigned char insert_buf[4096];
01598 const unsigned char *insert_str = NULL;
01599 size_t insert_len;
01600
01601 int last_trans_index;
01602 rb_transcoding *tc;
01603
01604 unsigned char **buf_start_p;
01605 unsigned char **data_start_p;
01606 unsigned char **data_end_p;
01607 unsigned char **buf_end_p;
01608
01609 size_t need;
01610
01611 ec->started = 1;
01612
01613 if (len == 0)
01614 return 0;
01615
01616 if (encoding_equal(insert_encoding, str_encoding)) {
01617 insert_str = str;
01618 insert_len = len;
01619 }
01620 else {
01621 insert_str = allocate_converted_string(str_encoding, insert_encoding,
01622 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01623 if (insert_str == NULL)
01624 return -1;
01625 }
01626
01627 need = insert_len;
01628
01629 last_trans_index = ec->num_trans-1;
01630 if (ec->num_trans == 0) {
01631 tc = NULL;
01632 buf_start_p = &ec->in_buf_start;
01633 data_start_p = &ec->in_data_start;
01634 data_end_p = &ec->in_data_end;
01635 buf_end_p = &ec->in_buf_end;
01636 }
01637 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01638 tc = ec->elems[last_trans_index].tc;
01639 need += tc->readagain_len;
01640 if (need < insert_len)
01641 goto fail;
01642 if (last_trans_index == 0) {
01643 buf_start_p = &ec->in_buf_start;
01644 data_start_p = &ec->in_data_start;
01645 data_end_p = &ec->in_data_end;
01646 buf_end_p = &ec->in_buf_end;
01647 }
01648 else {
01649 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01650 buf_start_p = &ee->out_buf_start;
01651 data_start_p = &ee->out_data_start;
01652 data_end_p = &ee->out_data_end;
01653 buf_end_p = &ee->out_buf_end;
01654 }
01655 }
01656 else {
01657 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01658 buf_start_p = &ee->out_buf_start;
01659 data_start_p = &ee->out_data_start;
01660 data_end_p = &ee->out_data_end;
01661 buf_end_p = &ee->out_buf_end;
01662 tc = ec->elems[last_trans_index].tc;
01663 }
01664
01665 if (*buf_start_p == NULL) {
01666 unsigned char *buf = xmalloc(need);
01667 *buf_start_p = buf;
01668 *data_start_p = buf;
01669 *data_end_p = buf;
01670 *buf_end_p = buf+need;
01671 }
01672 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
01673 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01674 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01675 *data_start_p = *buf_start_p;
01676 if ((size_t)(*buf_end_p - *data_end_p) < need) {
01677 unsigned char *buf;
01678 size_t s = (*data_end_p - *buf_start_p) + need;
01679 if (s < need)
01680 goto fail;
01681 buf = xrealloc(*buf_start_p, s);
01682 *data_start_p = buf;
01683 *data_end_p = buf + (*data_end_p - *buf_start_p);
01684 *buf_start_p = buf;
01685 *buf_end_p = buf + s;
01686 }
01687 }
01688
01689 memcpy(*data_end_p, insert_str, insert_len);
01690 *data_end_p += insert_len;
01691 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
01692 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01693 *data_end_p += tc->readagain_len;
01694 tc->readagain_len = 0;
01695 }
01696
01697 if (insert_str != str && insert_str != insert_buf)
01698 xfree((void*)insert_str);
01699 return 0;
01700
01701 fail:
01702 if (insert_str != str && insert_str != insert_buf)
01703 xfree((void*)insert_str);
01704 return -1;
01705 }
01706
01707 void
01708 rb_econv_close(rb_econv_t *ec)
01709 {
01710 int i;
01711
01712 if (ec->replacement_allocated) {
01713 xfree((void *)ec->replacement_str);
01714 }
01715 for (i = 0; i < ec->num_trans; i++) {
01716 rb_transcoding_close(ec->elems[i].tc);
01717 if (ec->elems[i].out_buf_start)
01718 xfree(ec->elems[i].out_buf_start);
01719 }
01720 xfree(ec->in_buf_start);
01721 xfree(ec->elems);
01722 xfree(ec);
01723 }
01724
01725 size_t
01726 rb_econv_memsize(rb_econv_t *ec)
01727 {
01728 size_t size = sizeof(rb_econv_t);
01729 int i;
01730
01731 if (ec->replacement_allocated) {
01732 size += ec->replacement_len;
01733 }
01734 for (i = 0; i < ec->num_trans; i++) {
01735 size += rb_transcoding_memsize(ec->elems[i].tc);
01736
01737 if (ec->elems[i].out_buf_start) {
01738 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01739 }
01740 }
01741 size += ec->in_buf_end - ec->in_buf_start;
01742 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01743
01744 return size;
01745 }
01746
01747 int
01748 rb_econv_putbackable(rb_econv_t *ec)
01749 {
01750 if (ec->num_trans == 0)
01751 return 0;
01752 #if SIZEOF_SIZE_T > SIZEOF_INT
01753 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01754 #endif
01755 return (int)ec->elems[0].tc->readagain_len;
01756 }
01757
01758 void
01759 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01760 {
01761 rb_transcoding *tc;
01762 if (ec->num_trans == 0 || n == 0)
01763 return;
01764 tc = ec->elems[0].tc;
01765 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01766 tc->readagain_len -= n;
01767 }
01768
01769 struct asciicompat_encoding_t {
01770 const char *ascii_compat_name;
01771 const char *ascii_incompat_name;
01772 };
01773
01774 static int
01775 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01776 {
01777 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01778 transcoder_entry_t *entry = (transcoder_entry_t *)val;
01779 const rb_transcoder *tr;
01780
01781 if (DECORATOR_P(entry->sname, entry->dname))
01782 return ST_CONTINUE;
01783 tr = load_transcoder_entry(entry);
01784 if (tr && tr->asciicompat_type == asciicompat_decoder) {
01785 data->ascii_compat_name = tr->dst_encoding;
01786 return ST_STOP;
01787 }
01788 return ST_CONTINUE;
01789 }
01790
01791 const char *
01792 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01793 {
01794 st_data_t v;
01795 st_table *table2;
01796 struct asciicompat_encoding_t data;
01797
01798 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01799 return NULL;
01800 table2 = (st_table *)v;
01801
01802
01803
01804
01805
01806
01807
01808
01809 if (table2->num_entries != 1)
01810 return NULL;
01811
01812 data.ascii_incompat_name = ascii_incompat_name;
01813 data.ascii_compat_name = NULL;
01814 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01815 return data.ascii_compat_name;
01816 }
01817
01818 VALUE
01819 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01820 {
01821 unsigned const char *ss, *sp, *se;
01822 unsigned char *ds, *dp, *de;
01823 rb_econv_result_t res;
01824 int max_output;
01825
01826 if (NIL_P(dst)) {
01827 dst = rb_str_buf_new(len);
01828 if (ec->destination_encoding)
01829 rb_enc_associate(dst, ec->destination_encoding);
01830 }
01831
01832 if (ec->last_tc)
01833 max_output = ec->last_tc->transcoder->max_output;
01834 else
01835 max_output = 1;
01836
01837 res = econv_destination_buffer_full;
01838 while (res == econv_destination_buffer_full) {
01839 long dlen = RSTRING_LEN(dst);
01840 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01841 unsigned long new_capa = (unsigned long)dlen + len + max_output;
01842 if (LONG_MAX < new_capa)
01843 rb_raise(rb_eArgError, "too long string");
01844 rb_str_resize(dst, new_capa);
01845 rb_str_set_len(dst, dlen);
01846 }
01847 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01848 se = ss + len;
01849 ds = (unsigned char *)RSTRING_PTR(dst);
01850 de = ds + rb_str_capacity(dst);
01851 dp = ds += dlen;
01852 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01853 off += sp - ss;
01854 len -= sp - ss;
01855 rb_str_set_len(dst, dlen + (dp - ds));
01856 rb_econv_check_error(ec);
01857 }
01858
01859 return dst;
01860 }
01861
01862 VALUE
01863 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01864 {
01865 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01866 }
01867
01868 VALUE
01869 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01870 {
01871 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01872 }
01873
01874 VALUE
01875 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01876 {
01877 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01878 }
01879
01880 static int
01881 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01882 {
01883 transcoder_entry_t *entry;
01884 const rb_transcoder *tr;
01885
01886 if (ec->started != 0)
01887 return -1;
01888
01889 entry = get_transcoder_entry(sname, dname);
01890 if (!entry)
01891 return -1;
01892
01893 tr = load_transcoder_entry(entry);
01894 if (!tr) return -1;
01895
01896 return rb_econv_add_transcoder_at(ec, tr, n);
01897 }
01898
01899 static int
01900 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01901 {
01902 return rb_econv_add_converter(ec, "", decorator_name, n);
01903 }
01904
01905 int
01906 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01907 {
01908 const rb_transcoder *tr;
01909
01910 if (ec->num_trans == 0)
01911 return rb_econv_decorate_at(ec, decorator_name, 0);
01912
01913 tr = ec->elems[0].tc->transcoder;
01914
01915 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01916 tr->asciicompat_type == asciicompat_decoder)
01917 return rb_econv_decorate_at(ec, decorator_name, 1);
01918
01919 return rb_econv_decorate_at(ec, decorator_name, 0);
01920 }
01921
01922 int
01923 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01924 {
01925 const rb_transcoder *tr;
01926
01927 if (ec->num_trans == 0)
01928 return rb_econv_decorate_at(ec, decorator_name, 0);
01929
01930 tr = ec->elems[ec->num_trans-1].tc->transcoder;
01931
01932 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01933 tr->asciicompat_type == asciicompat_encoder)
01934 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01935
01936 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01937 }
01938
01939 void
01940 rb_econv_binmode(rb_econv_t *ec)
01941 {
01942 const rb_transcoder *trs[3];
01943 int n, i, j;
01944 transcoder_entry_t *entry;
01945 int num_trans;
01946
01947 n = 0;
01948 if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
01949 entry = get_transcoder_entry("", "universal_newline");
01950 if (entry->transcoder)
01951 trs[n++] = entry->transcoder;
01952 }
01953 if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
01954 entry = get_transcoder_entry("", "crlf_newline");
01955 if (entry->transcoder)
01956 trs[n++] = entry->transcoder;
01957 }
01958 if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
01959 entry = get_transcoder_entry("", "cr_newline");
01960 if (entry->transcoder)
01961 trs[n++] = entry->transcoder;
01962 }
01963
01964 num_trans = ec->num_trans;
01965 j = 0;
01966 for (i = 0; i < num_trans; i++) {
01967 int k;
01968 for (k = 0; k < n; k++)
01969 if (trs[k] == ec->elems[i].tc->transcoder)
01970 break;
01971 if (k == n) {
01972 ec->elems[j] = ec->elems[i];
01973 j++;
01974 }
01975 else {
01976 rb_transcoding_close(ec->elems[i].tc);
01977 xfree(ec->elems[i].out_buf_start);
01978 ec->num_trans--;
01979 }
01980 }
01981
01982 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
01983
01984 }
01985
01986 static VALUE
01987 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01988 {
01989 int has_description = 0;
01990
01991 if (NIL_P(mesg))
01992 mesg = rb_str_new(NULL, 0);
01993
01994 if (*sname != '\0' || *dname != '\0') {
01995 if (*sname == '\0')
01996 rb_str_cat2(mesg, dname);
01997 else if (*dname == '\0')
01998 rb_str_cat2(mesg, sname);
01999 else
02000 rb_str_catf(mesg, "%s to %s", sname, dname);
02001 has_description = 1;
02002 }
02003
02004 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02005 ECONV_XML_TEXT_DECORATOR|
02006 ECONV_XML_ATTR_CONTENT_DECORATOR|
02007 ECONV_XML_ATTR_QUOTE_DECORATOR)) {
02008 const char *pre = "";
02009 if (has_description)
02010 rb_str_cat2(mesg, " with ");
02011 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
02012 rb_str_cat2(mesg, pre); pre = ",";
02013 rb_str_cat2(mesg, "universal_newline");
02014 }
02015 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
02016 rb_str_cat2(mesg, pre); pre = ",";
02017 rb_str_cat2(mesg, "crlf_newline");
02018 }
02019 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02020 rb_str_cat2(mesg, pre); pre = ",";
02021 rb_str_cat2(mesg, "cr_newline");
02022 }
02023 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02024 rb_str_cat2(mesg, pre); pre = ",";
02025 rb_str_cat2(mesg, "xml_text");
02026 }
02027 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02028 rb_str_cat2(mesg, pre); pre = ",";
02029 rb_str_cat2(mesg, "xml_attr_content");
02030 }
02031 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02032 rb_str_cat2(mesg, pre); pre = ",";
02033 rb_str_cat2(mesg, "xml_attr_quote");
02034 }
02035 has_description = 1;
02036 }
02037 if (!has_description) {
02038 rb_str_cat2(mesg, "no-conversion");
02039 }
02040
02041 return mesg;
02042 }
02043
02044 VALUE
02045 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02046 {
02047 VALUE mesg, exc;
02048 mesg = rb_str_new_cstr("code converter not found (");
02049 econv_description(sname, dname, ecflags, mesg);
02050 rb_str_cat2(mesg, ")");
02051 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02052 return exc;
02053 }
02054
02055 static VALUE
02056 make_econv_exception(rb_econv_t *ec)
02057 {
02058 VALUE mesg, exc;
02059 if (ec->last_error.result == econv_invalid_byte_sequence ||
02060 ec->last_error.result == econv_incomplete_input) {
02061 const char *err = (const char *)ec->last_error.error_bytes_start;
02062 size_t error_len = ec->last_error.error_bytes_len;
02063 VALUE bytes = rb_str_new(err, error_len);
02064 VALUE dumped = rb_str_dump(bytes);
02065 size_t readagain_len = ec->last_error.readagain_len;
02066 VALUE bytes2 = Qnil;
02067 VALUE dumped2;
02068 int idx;
02069 if (ec->last_error.result == econv_incomplete_input) {
02070 mesg = rb_sprintf("incomplete %s on %s",
02071 StringValueCStr(dumped),
02072 ec->last_error.source_encoding);
02073 }
02074 else if (readagain_len) {
02075 bytes2 = rb_str_new(err+error_len, readagain_len);
02076 dumped2 = rb_str_dump(bytes2);
02077 mesg = rb_sprintf("%s followed by %s on %s",
02078 StringValueCStr(dumped),
02079 StringValueCStr(dumped2),
02080 ec->last_error.source_encoding);
02081 }
02082 else {
02083 mesg = rb_sprintf("%s on %s",
02084 StringValueCStr(dumped),
02085 ec->last_error.source_encoding);
02086 }
02087
02088 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02089 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02090 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02091 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02092
02093 set_encs:
02094 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02095 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02096 idx = rb_enc_find_index(ec->last_error.source_encoding);
02097 if (0 <= idx)
02098 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02099 idx = rb_enc_find_index(ec->last_error.destination_encoding);
02100 if (0 <= idx)
02101 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02102 return exc;
02103 }
02104 if (ec->last_error.result == econv_undefined_conversion) {
02105 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02106 ec->last_error.error_bytes_len);
02107 VALUE dumped = Qnil;
02108 int idx;
02109 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02110 rb_encoding *utf8 = rb_utf8_encoding();
02111 const char *start, *end;
02112 int n;
02113 start = (const char *)ec->last_error.error_bytes_start;
02114 end = start + ec->last_error.error_bytes_len;
02115 n = rb_enc_precise_mbclen(start, end, utf8);
02116 if (MBCLEN_CHARFOUND_P(n) &&
02117 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02118 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02119 dumped = rb_sprintf("U+%04X", cc);
02120 }
02121 }
02122 if (dumped == Qnil)
02123 dumped = rb_str_dump(bytes);
02124 if (strcmp(ec->last_error.source_encoding,
02125 ec->source_encoding_name) == 0 &&
02126 strcmp(ec->last_error.destination_encoding,
02127 ec->destination_encoding_name) == 0) {
02128 mesg = rb_sprintf("%s from %s to %s",
02129 StringValueCStr(dumped),
02130 ec->last_error.source_encoding,
02131 ec->last_error.destination_encoding);
02132 }
02133 else {
02134 int i;
02135 mesg = rb_sprintf("%s to %s in conversion from %s",
02136 StringValueCStr(dumped),
02137 ec->last_error.destination_encoding,
02138 ec->source_encoding_name);
02139 for (i = 0; i < ec->num_trans; i++) {
02140 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02141 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02142 rb_str_catf(mesg, " to %s",
02143 ec->elems[i].tc->transcoder->dst_encoding);
02144 }
02145 }
02146 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02147 idx = rb_enc_find_index(ec->last_error.source_encoding);
02148 if (0 <= idx)
02149 rb_enc_associate_index(bytes, idx);
02150 rb_ivar_set(exc, rb_intern("error_char"), bytes);
02151 goto set_encs;
02152 }
02153 return Qnil;
02154 }
02155
02156 static void
02157 more_output_buffer(
02158 VALUE destination,
02159 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02160 int max_output,
02161 unsigned char **out_start_ptr,
02162 unsigned char **out_pos,
02163 unsigned char **out_stop_ptr)
02164 {
02165 size_t len = (*out_pos - *out_start_ptr);
02166 size_t new_len = (len + max_output) * 2;
02167 *out_start_ptr = resize_destination(destination, len, new_len);
02168 *out_pos = *out_start_ptr + len;
02169 *out_stop_ptr = *out_start_ptr + new_len;
02170 }
02171
02172 static int
02173 make_replacement(rb_econv_t *ec)
02174 {
02175 rb_transcoding *tc;
02176 const rb_transcoder *tr;
02177 rb_encoding *enc;
02178 const unsigned char *replacement;
02179 const char *repl_enc;
02180 const char *ins_enc;
02181 size_t len;
02182
02183 if (ec->replacement_str)
02184 return 0;
02185
02186 ins_enc = rb_econv_encoding_to_insert_output(ec);
02187
02188 tc = ec->last_tc;
02189 if (*ins_enc) {
02190 tr = tc->transcoder;
02191 enc = rb_enc_find(tr->dst_encoding);
02192 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02193 }
02194 else {
02195 replacement = (unsigned char *)"?";
02196 len = 1;
02197 repl_enc = "";
02198 }
02199
02200 ec->replacement_str = replacement;
02201 ec->replacement_len = len;
02202 ec->replacement_enc = repl_enc;
02203 ec->replacement_allocated = 0;
02204 return 0;
02205 }
02206
02207 int
02208 rb_econv_set_replacement(rb_econv_t *ec,
02209 const unsigned char *str, size_t len, const char *encname)
02210 {
02211 unsigned char *str2;
02212 size_t len2;
02213 const char *encname2;
02214
02215 encname2 = rb_econv_encoding_to_insert_output(ec);
02216
02217 if (encoding_equal(encname, encname2)) {
02218 str2 = xmalloc(len);
02219 MEMCPY(str2, str, unsigned char, len);
02220 len2 = len;
02221 encname2 = encname;
02222 }
02223 else {
02224 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02225 if (!str2)
02226 return -1;
02227 }
02228
02229 if (ec->replacement_allocated) {
02230 xfree((void *)ec->replacement_str);
02231 }
02232 ec->replacement_allocated = 1;
02233 ec->replacement_str = str2;
02234 ec->replacement_len = len2;
02235 ec->replacement_enc = encname2;
02236 return 0;
02237 }
02238
02239 static int
02240 output_replacement_character(rb_econv_t *ec)
02241 {
02242 int ret;
02243
02244 if (make_replacement(ec) == -1)
02245 return -1;
02246
02247 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02248 if (ret == -1)
02249 return -1;
02250
02251 return 0;
02252 }
02253
02254 #if 1
02255 #define hash_fallback rb_hash_aref
02256
02257 static VALUE
02258 proc_fallback(VALUE fallback, VALUE c)
02259 {
02260 return rb_proc_call(fallback, rb_ary_new4(1, &c));
02261 }
02262
02263 static VALUE
02264 method_fallback(VALUE fallback, VALUE c)
02265 {
02266 return rb_method_call(1, &c, fallback);
02267 }
02268
02269 static VALUE
02270 aref_fallback(VALUE fallback, VALUE c)
02271 {
02272 return rb_funcall3(fallback, sym_aref, 1, &c);
02273 }
02274
02275 static void
02276 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02277 const unsigned char *in_stop, unsigned char *out_stop,
02278 VALUE destination,
02279 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02280 const char *src_encoding,
02281 const char *dst_encoding,
02282 int ecflags,
02283 VALUE ecopts)
02284 {
02285 rb_econv_t *ec;
02286 rb_transcoding *last_tc;
02287 rb_econv_result_t ret;
02288 unsigned char *out_start = *out_pos;
02289 int max_output;
02290 VALUE exc;
02291 VALUE fallback = Qnil;
02292 VALUE (*fallback_func)(VALUE, VALUE) = 0;
02293
02294 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02295 if (!ec)
02296 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02297
02298 if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) {
02299 fallback = rb_hash_aref(ecopts, sym_fallback);
02300 if (RB_TYPE_P(fallback, T_HASH)) {
02301 fallback_func = hash_fallback;
02302 }
02303 else if (rb_obj_is_proc(fallback)) {
02304 fallback_func = proc_fallback;
02305 }
02306 else if (rb_obj_is_method(fallback)) {
02307 fallback_func = method_fallback;
02308 }
02309 else {
02310 fallback_func = aref_fallback;
02311 }
02312 }
02313 last_tc = ec->last_tc;
02314 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02315
02316 resume:
02317 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02318
02319 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02320 VALUE rep = rb_enc_str_new(
02321 (const char *)ec->last_error.error_bytes_start,
02322 ec->last_error.error_bytes_len,
02323 rb_enc_find(ec->last_error.source_encoding));
02324 rep = (*fallback_func)(fallback, rep);
02325 if (rep != Qundef && !NIL_P(rep)) {
02326 StringValue(rep);
02327 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02328 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02329 if ((int)ret == -1) {
02330 rb_raise(rb_eArgError, "too big fallback string");
02331 }
02332 goto resume;
02333 }
02334 }
02335
02336 if (ret == econv_invalid_byte_sequence ||
02337 ret == econv_incomplete_input ||
02338 ret == econv_undefined_conversion) {
02339 exc = make_econv_exception(ec);
02340 rb_econv_close(ec);
02341 rb_exc_raise(exc);
02342 }
02343
02344 if (ret == econv_destination_buffer_full) {
02345 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02346 goto resume;
02347 }
02348
02349 rb_econv_close(ec);
02350 return;
02351 }
02352 #else
02353
02354 static void
02355 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02356 const unsigned char *in_stop, unsigned char *out_stop,
02357 VALUE destination,
02358 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02359 const char *src_encoding,
02360 const char *dst_encoding,
02361 int ecflags,
02362 VALUE ecopts)
02363 {
02364 rb_econv_t *ec;
02365 rb_transcoding *last_tc;
02366 rb_econv_result_t ret;
02367 unsigned char *out_start = *out_pos;
02368 const unsigned char *ptr;
02369 int max_output;
02370 VALUE exc;
02371
02372 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02373 if (!ec)
02374 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02375
02376 last_tc = ec->last_tc;
02377 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02378
02379 ret = econv_source_buffer_empty;
02380 ptr = *in_pos;
02381 while (ret != econv_finished) {
02382 unsigned char input_byte;
02383 const unsigned char *p = &input_byte;
02384
02385 if (ret == econv_source_buffer_empty) {
02386 if (ptr < in_stop) {
02387 input_byte = *ptr;
02388 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02389 }
02390 else {
02391 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02392 }
02393 }
02394 else {
02395 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02396 }
02397 if (&input_byte != p)
02398 ptr += p - &input_byte;
02399 switch (ret) {
02400 case econv_invalid_byte_sequence:
02401 case econv_incomplete_input:
02402 case econv_undefined_conversion:
02403 exc = make_econv_exception(ec);
02404 rb_econv_close(ec);
02405 rb_exc_raise(exc);
02406 break;
02407
02408 case econv_destination_buffer_full:
02409 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02410 break;
02411
02412 case econv_source_buffer_empty:
02413 break;
02414
02415 case econv_finished:
02416 break;
02417 }
02418 }
02419 rb_econv_close(ec);
02420 *in_pos = in_stop;
02421 return;
02422 }
02423 #endif
02424
02425
02426
02427
02428
02429
02430 static unsigned char *
02431 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02432 {
02433 rb_str_resize(destination, new_len);
02434 return (unsigned char *)RSTRING_PTR(destination);
02435 }
02436
02437 static int
02438 econv_opts(VALUE opt, int ecflags)
02439 {
02440 VALUE v;
02441
02442 v = rb_hash_aref(opt, sym_invalid);
02443 if (NIL_P(v)) {
02444 }
02445 else if (v==sym_replace) {
02446 ecflags |= ECONV_INVALID_REPLACE;
02447 }
02448 else {
02449 rb_raise(rb_eArgError, "unknown value for invalid character option");
02450 }
02451
02452 v = rb_hash_aref(opt, sym_undef);
02453 if (NIL_P(v)) {
02454 }
02455 else if (v==sym_replace) {
02456 ecflags |= ECONV_UNDEF_REPLACE;
02457 }
02458 else {
02459 rb_raise(rb_eArgError, "unknown value for undefined character option");
02460 }
02461
02462 v = rb_hash_aref(opt, sym_replace);
02463 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02464 ecflags |= ECONV_UNDEF_REPLACE;
02465 }
02466
02467 v = rb_hash_aref(opt, sym_xml);
02468 if (!NIL_P(v)) {
02469 if (v==sym_text) {
02470 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02471 }
02472 else if (v==sym_attr) {
02473 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02474 }
02475 else if (TYPE(v) == T_SYMBOL) {
02476 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02477 }
02478 else {
02479 rb_raise(rb_eArgError, "unexpected value for xml option");
02480 }
02481 }
02482
02483 #ifdef ENABLE_ECONV_NEWLINE_OPTION
02484 v = rb_hash_aref(opt, sym_newline);
02485 if (!NIL_P(v)) {
02486 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02487 if (v == sym_universal) {
02488 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02489 }
02490 else if (v == sym_crlf) {
02491 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02492 }
02493 else if (v == sym_cr) {
02494 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02495 }
02496 else if (v == sym_lf) {
02497
02498 }
02499 else if (SYMBOL_P(v)) {
02500 rb_raise(rb_eArgError, "unexpected value for newline option: %s",
02501 rb_id2name(SYM2ID(v)));
02502 }
02503 else {
02504 rb_raise(rb_eArgError, "unexpected value for newline option");
02505 }
02506 }
02507 else
02508 #endif
02509 {
02510 int setflags = 0, newlineflag = 0;
02511
02512 v = rb_hash_aref(opt, sym_universal_newline);
02513 if (RTEST(v))
02514 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02515 newlineflag |= !NIL_P(v);
02516
02517 v = rb_hash_aref(opt, sym_crlf_newline);
02518 if (RTEST(v))
02519 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02520 newlineflag |= !NIL_P(v);
02521
02522 v = rb_hash_aref(opt, sym_cr_newline);
02523 if (RTEST(v))
02524 setflags |= ECONV_CR_NEWLINE_DECORATOR;
02525 newlineflag |= !NIL_P(v);
02526
02527 if (newlineflag) {
02528 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02529 ecflags |= setflags;
02530 }
02531 }
02532
02533 return ecflags;
02534 }
02535
02536 int
02537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
02538 {
02539 VALUE newhash = Qnil;
02540 VALUE v;
02541
02542 if (NIL_P(opthash)) {
02543 *opts = Qnil;
02544 return ecflags;
02545 }
02546 ecflags = econv_opts(opthash, ecflags);
02547
02548 v = rb_hash_aref(opthash, sym_replace);
02549 if (!NIL_P(v)) {
02550 StringValue(v);
02551 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02552 VALUE dumped = rb_str_dump(v);
02553 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02554 StringValueCStr(dumped),
02555 rb_enc_name(rb_enc_get(v)));
02556 }
02557 v = rb_str_new_frozen(v);
02558 newhash = rb_hash_new();
02559 rb_hash_aset(newhash, sym_replace, v);
02560 }
02561
02562 v = rb_hash_aref(opthash, sym_fallback);
02563 if (!NIL_P(v)) {
02564 VALUE h = rb_check_hash_type(v);
02565 if (NIL_P(h)
02566 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
02567 : (v = h, 1)) {
02568 if (NIL_P(newhash))
02569 newhash = rb_hash_new();
02570 rb_hash_aset(newhash, sym_fallback, v);
02571 }
02572 }
02573
02574 if (!NIL_P(newhash))
02575 rb_hash_freeze(newhash);
02576 *opts = newhash;
02577
02578 return ecflags;
02579 }
02580
02581 int
02582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02583 {
02584 return rb_econv_prepare_options(opthash, opts, 0);
02585 }
02586
02587 rb_econv_t *
02588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02589 {
02590 rb_econv_t *ec;
02591 VALUE replacement;
02592
02593 if (NIL_P(opthash)) {
02594 replacement = Qnil;
02595 }
02596 else {
02597 if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
02598 rb_bug("rb_econv_open_opts called with invalid opthash");
02599 replacement = rb_hash_aref(opthash, sym_replace);
02600 }
02601
02602 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02603 if (!ec)
02604 return ec;
02605
02606 if (!NIL_P(replacement)) {
02607 int ret;
02608 rb_encoding *enc = rb_enc_get(replacement);
02609
02610 ret = rb_econv_set_replacement(ec,
02611 (const unsigned char *)RSTRING_PTR(replacement),
02612 RSTRING_LEN(replacement),
02613 rb_enc_name(enc));
02614 if (ret == -1) {
02615 rb_econv_close(ec);
02616 return NULL;
02617 }
02618 }
02619 return ec;
02620 }
02621
02622 static int
02623 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02624 {
02625 rb_encoding *enc;
02626 const char *n;
02627 int encidx;
02628 VALUE encval;
02629
02630 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02631 !(enc = rb_enc_from_index(encidx))) {
02632 enc = NULL;
02633 encidx = 0;
02634 n = StringValueCStr(*arg);
02635 }
02636 else {
02637 n = rb_enc_name(enc);
02638 }
02639
02640 *name_p = n;
02641 *enc_p = enc;
02642
02643 return encidx;
02644 }
02645
02646 static int
02647 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02648 const char **sname_p, rb_encoding **senc_p,
02649 const char **dname_p, rb_encoding **denc_p)
02650 {
02651 rb_encoding *senc, *denc;
02652 const char *sname, *dname;
02653 int sencidx, dencidx;
02654
02655 dencidx = enc_arg(arg1, &dname, &denc);
02656
02657 if (NIL_P(*arg2)) {
02658 sencidx = rb_enc_get_index(str);
02659 senc = rb_enc_from_index(sencidx);
02660 sname = rb_enc_name(senc);
02661 }
02662 else {
02663 sencidx = enc_arg(arg2, &sname, &senc);
02664 }
02665
02666 *sname_p = sname;
02667 *senc_p = senc;
02668 *dname_p = dname;
02669 *denc_p = denc;
02670 return dencidx;
02671 }
02672
02673 static int
02674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02675 {
02676 VALUE dest;
02677 VALUE str = *self;
02678 volatile VALUE arg1, arg2;
02679 long blen, slen;
02680 unsigned char *buf, *bp, *sp;
02681 const unsigned char *fromp;
02682 rb_encoding *senc, *denc;
02683 const char *sname, *dname;
02684 int dencidx;
02685
02686 if (argc <0 || argc > 2) {
02687 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
02688 }
02689
02690 if (argc == 0) {
02691 arg1 = rb_enc_default_internal();
02692 if (NIL_P(arg1)) {
02693 if (!ecflags) return -1;
02694 arg1 = rb_obj_encoding(str);
02695 }
02696 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02697 }
02698 else {
02699 arg1 = argv[0];
02700 }
02701 arg2 = argc<=1 ? Qnil : argv[1];
02702 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02703
02704 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02705 ECONV_XML_TEXT_DECORATOR|
02706 ECONV_XML_ATTR_CONTENT_DECORATOR|
02707 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02708 if (senc && senc == denc) {
02709 return NIL_P(arg2) ? -1 : dencidx;
02710 }
02711 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02712 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02713 return dencidx;
02714 }
02715 }
02716 if (encoding_equal(sname, dname)) {
02717 return NIL_P(arg2) ? -1 : dencidx;
02718 }
02719 }
02720 else {
02721 if (encoding_equal(sname, dname)) {
02722 sname = "";
02723 dname = "";
02724 }
02725 }
02726
02727 fromp = sp = (unsigned char *)RSTRING_PTR(str);
02728 slen = RSTRING_LEN(str);
02729 blen = slen + 30;
02730 dest = rb_str_tmp_new(blen);
02731 bp = (unsigned char *)RSTRING_PTR(dest);
02732
02733 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02734 if (fromp != sp+slen) {
02735 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02736 }
02737 buf = (unsigned char *)RSTRING_PTR(dest);
02738 *bp = '\0';
02739 rb_str_set_len(dest, bp - buf);
02740
02741
02742 if (!denc) {
02743 dencidx = rb_define_dummy_encoding(dname);
02744 }
02745 *self = dest;
02746
02747 return dencidx;
02748 }
02749
02750 static int
02751 str_transcode(int argc, VALUE *argv, VALUE *self)
02752 {
02753 VALUE opt;
02754 int ecflags = 0;
02755 VALUE ecopts = Qnil;
02756
02757 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
02758 if (!NIL_P(opt)) {
02759 ecflags = rb_econv_prepare_opts(opt, &ecopts);
02760 }
02761 return str_transcode0(argc, argv, self, ecflags, ecopts);
02762 }
02763
02764 static inline VALUE
02765 str_encode_associate(VALUE str, int encidx)
02766 {
02767 int cr = 0;
02768
02769 rb_enc_associate_index(str, encidx);
02770
02771
02772 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02773 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02774 }
02775 else {
02776 cr = ENC_CODERANGE_VALID;
02777 }
02778 ENC_CODERANGE_SET(str, cr);
02779 return str;
02780 }
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796 static VALUE
02797 str_encode_bang(int argc, VALUE *argv, VALUE str)
02798 {
02799 VALUE newstr;
02800 int encidx;
02801
02802 rb_check_frozen(str);
02803
02804 newstr = str;
02805 encidx = str_transcode(argc, argv, &newstr);
02806
02807 if (encidx < 0) return str;
02808 if (newstr == str) {
02809 rb_enc_associate_index(str, encidx);
02810 return str;
02811 }
02812 rb_str_shared_replace(str, newstr);
02813 return str_encode_associate(str, encidx);
02814 }
02815
02816 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841
02842
02843
02844
02845
02846
02847
02848
02849
02850
02851
02852
02853
02854
02855
02856
02857
02858
02859
02860
02861
02862
02863
02864
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878 static VALUE
02879 str_encode(int argc, VALUE *argv, VALUE str)
02880 {
02881 VALUE newstr = str;
02882 int encidx = str_transcode(argc, argv, &newstr);
02883 return encoded_dup(newstr, str, encidx);
02884 }
02885
02886 VALUE
02887 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02888 {
02889 int argc = 1;
02890 VALUE *argv = &to;
02891 VALUE newstr = str;
02892 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02893 return encoded_dup(newstr, str, encidx);
02894 }
02895
02896 static VALUE
02897 encoded_dup(VALUE newstr, VALUE str, int encidx)
02898 {
02899 if (encidx < 0) return rb_str_dup(str);
02900 if (newstr == str) {
02901 newstr = rb_str_dup(str);
02902 rb_enc_associate_index(newstr, encidx);
02903 return newstr;
02904 }
02905 else {
02906 RBASIC(newstr)->klass = rb_obj_class(str);
02907 }
02908 return str_encode_associate(newstr, encidx);
02909 }
02910
02911 static void
02912 econv_free(void *ptr)
02913 {
02914 rb_econv_t *ec = ptr;
02915 rb_econv_close(ec);
02916 }
02917
02918 static size_t
02919 econv_memsize(const void *ptr)
02920 {
02921 return ptr ? sizeof(rb_econv_t) : 0;
02922 }
02923
02924 static const rb_data_type_t econv_data_type = {
02925 "econv",
02926 {NULL, econv_free, econv_memsize,},
02927 };
02928
02929 static VALUE
02930 econv_s_allocate(VALUE klass)
02931 {
02932 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02933 }
02934
02935 static rb_encoding *
02936 make_dummy_encoding(const char *name)
02937 {
02938 rb_encoding *enc;
02939 int idx;
02940 idx = rb_define_dummy_encoding(name);
02941 enc = rb_enc_from_index(idx);
02942 return enc;
02943 }
02944
02945 static rb_encoding *
02946 make_encoding(const char *name)
02947 {
02948 rb_encoding *enc;
02949 enc = rb_enc_find(name);
02950 if (!enc)
02951 enc = make_dummy_encoding(name);
02952 return enc;
02953 }
02954
02955 static VALUE
02956 make_encobj(const char *name)
02957 {
02958 return rb_enc_from_encoding(make_encoding(name));
02959 }
02960
02961
02962
02963
02964
02965
02966
02967
02968
02969
02970
02971
02972
02973
02974
02975
02976
02977
02978
02979 static VALUE
02980 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02981 {
02982 const char *arg_name, *result_name;
02983 rb_encoding *arg_enc, *result_enc;
02984
02985 enc_arg(&arg, &arg_name, &arg_enc);
02986
02987 result_name = rb_econv_asciicompat_encoding(arg_name);
02988
02989 if (result_name == NULL)
02990 return Qnil;
02991
02992 result_enc = make_encoding(result_name);
02993
02994 return rb_enc_from_encoding(result_enc);
02995 }
02996
02997 static void
02998 econv_args(int argc, VALUE *argv,
02999 volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
03000 const char **sname_p, const char **dname_p,
03001 rb_encoding **senc_p, rb_encoding **denc_p,
03002 int *ecflags_p,
03003 VALUE *ecopts_p)
03004 {
03005 VALUE opt, flags_v, ecopts;
03006 int sidx, didx;
03007 const char *sname, *dname;
03008 rb_encoding *senc, *denc;
03009 int ecflags;
03010
03011 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
03012
03013 if (!NIL_P(flags_v)) {
03014 if (!NIL_P(opt)) {
03015 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)",
03016 argc + 1);
03017 }
03018 ecflags = NUM2INT(rb_to_int(flags_v));
03019 ecopts = Qnil;
03020 }
03021 else if (!NIL_P(opt)) {
03022 ecflags = rb_econv_prepare_opts(opt, &ecopts);
03023 }
03024 else {
03025 ecflags = 0;
03026 ecopts = Qnil;
03027 }
03028
03029 senc = NULL;
03030 sidx = rb_to_encoding_index(*snamev_p);
03031 if (0 <= sidx) {
03032 senc = rb_enc_from_index(sidx);
03033 }
03034 else {
03035 StringValue(*snamev_p);
03036 }
03037
03038 denc = NULL;
03039 didx = rb_to_encoding_index(*dnamev_p);
03040 if (0 <= didx) {
03041 denc = rb_enc_from_index(didx);
03042 }
03043 else {
03044 StringValue(*dnamev_p);
03045 }
03046
03047 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
03048 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
03049
03050 *sname_p = sname;
03051 *dname_p = dname;
03052 *senc_p = senc;
03053 *denc_p = denc;
03054 *ecflags_p = ecflags;
03055 *ecopts_p = ecopts;
03056 }
03057
03058 static int
03059 decorate_convpath(VALUE convpath, int ecflags)
03060 {
03061 int num_decorators;
03062 const char *decorators[MAX_ECFLAGS_DECORATORS];
03063 int i;
03064 int n, len;
03065
03066 num_decorators = decorator_names(ecflags, decorators);
03067 if (num_decorators == -1)
03068 return -1;
03069
03070 len = n = RARRAY_LENINT(convpath);
03071 if (n != 0) {
03072 VALUE pair = RARRAY_PTR(convpath)[n-1];
03073 if (TYPE(pair) == T_ARRAY) {
03074 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
03075 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
03076 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
03077 const rb_transcoder *tr = load_transcoder_entry(entry);
03078 if (!tr)
03079 return -1;
03080 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
03081 tr->asciicompat_type == asciicompat_encoder) {
03082 n--;
03083 rb_ary_store(convpath, len + num_decorators - 1, pair);
03084 }
03085 }
03086 else {
03087 rb_ary_store(convpath, len + num_decorators - 1, pair);
03088 }
03089 }
03090
03091 for (i = 0; i < num_decorators; i++)
03092 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
03093
03094 return 0;
03095 }
03096
03097 static void
03098 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03099 {
03100 VALUE *ary_p = arg;
03101 VALUE v;
03102
03103 if (*ary_p == Qnil) {
03104 *ary_p = rb_ary_new();
03105 }
03106
03107 if (DECORATOR_P(sname, dname)) {
03108 v = rb_str_new_cstr(dname);
03109 }
03110 else {
03111 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03112 }
03113 rb_ary_store(*ary_p, depth, v);
03114 }
03115
03116
03117
03118
03119
03120
03121
03122
03123
03124
03125
03126
03127
03128
03129
03130
03131
03132
03133
03134
03135
03136
03137
03138
03139
03140
03141 static VALUE
03142 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03143 {
03144 volatile VALUE snamev, dnamev;
03145 const char *sname, *dname;
03146 rb_encoding *senc, *denc;
03147 int ecflags;
03148 VALUE ecopts;
03149 VALUE convpath;
03150
03151 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03152
03153 convpath = Qnil;
03154 transcode_search_path(sname, dname, search_convpath_i, &convpath);
03155
03156 if (NIL_P(convpath))
03157 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03158
03159 if (decorate_convpath(convpath, ecflags) == -1)
03160 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03161
03162 return convpath;
03163 }
03164
03165
03166
03167
03168
03169
03170 int
03171 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03172 {
03173 VALUE convpath = Qnil;
03174 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03175 &convpath);
03176 return RTEST(convpath);
03177 }
03178
03179 struct rb_econv_init_by_convpath_t {
03180 rb_econv_t *ec;
03181 int index;
03182 int ret;
03183 };
03184
03185 static void
03186 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03187 {
03188 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03189 int ret;
03190
03191 if (a->ret == -1)
03192 return;
03193
03194 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03195
03196 a->ret = ret;
03197 return;
03198 }
03199
03200 static rb_econv_t *
03201 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03202 const char **sname_p, const char **dname_p,
03203 rb_encoding **senc_p, rb_encoding**denc_p)
03204 {
03205 rb_econv_t *ec;
03206 long i;
03207 int ret, first=1;
03208 VALUE elt;
03209 rb_encoding *senc = 0, *denc = 0;
03210 const char *sname, *dname;
03211
03212 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03213 DATA_PTR(self) = ec;
03214
03215 for (i = 0; i < RARRAY_LEN(convpath); i++) {
03216 volatile VALUE snamev, dnamev;
03217 VALUE pair;
03218 elt = rb_ary_entry(convpath, i);
03219 if (!NIL_P(pair = rb_check_array_type(elt))) {
03220 if (RARRAY_LEN(pair) != 2)
03221 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03222 snamev = rb_ary_entry(pair, 0);
03223 enc_arg(&snamev, &sname, &senc);
03224 dnamev = rb_ary_entry(pair, 1);
03225 enc_arg(&dnamev, &dname, &denc);
03226 }
03227 else {
03228 sname = "";
03229 dname = StringValueCStr(elt);
03230 }
03231 if (DECORATOR_P(sname, dname)) {
03232 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03233 if (ret == -1)
03234 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03235 }
03236 else {
03237 int j = ec->num_trans;
03238 struct rb_econv_init_by_convpath_t arg;
03239 arg.ec = ec;
03240 arg.index = ec->num_trans;
03241 arg.ret = 0;
03242 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03243 if (ret == -1 || arg.ret == -1)
03244 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03245 if (first) {
03246 first = 0;
03247 *senc_p = senc;
03248 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03249 }
03250 *denc_p = denc;
03251 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03252 }
03253 }
03254
03255 if (first) {
03256 *senc_p = NULL;
03257 *denc_p = NULL;
03258 *sname_p = "";
03259 *dname_p = "";
03260 }
03261
03262 ec->source_encoding_name = *sname_p;
03263 ec->destination_encoding_name = *dname_p;
03264
03265 return ec;
03266 }
03267
03268
03269
03270
03271
03272
03273
03274
03275
03276
03277
03278
03279
03280
03281
03282
03283
03284
03285
03286
03287
03288
03289
03290
03291
03292
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330
03331
03332
03333
03334
03335
03336
03337
03338
03339
03340
03341
03342
03343
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359
03360
03361
03362
03363
03364
03365
03366
03367
03368
03369
03370
03371
03372
03373
03374 static VALUE
03375 econv_init(int argc, VALUE *argv, VALUE self)
03376 {
03377 VALUE ecopts;
03378 volatile VALUE snamev, dnamev;
03379 const char *sname, *dname;
03380 rb_encoding *senc, *denc;
03381 rb_econv_t *ec;
03382 int ecflags;
03383 VALUE convpath;
03384
03385 if (rb_check_typeddata(self, &econv_data_type)) {
03386 rb_raise(rb_eTypeError, "already initialized");
03387 }
03388
03389 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03390 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03391 ecflags = 0;
03392 ecopts = Qnil;
03393 }
03394 else {
03395 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03396 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03397 }
03398
03399 if (!ec) {
03400 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03401 }
03402
03403 if (!DECORATOR_P(sname, dname)) {
03404 if (!senc)
03405 senc = make_dummy_encoding(sname);
03406 if (!denc)
03407 denc = make_dummy_encoding(dname);
03408 }
03409
03410 ec->source_encoding = senc;
03411 ec->destination_encoding = denc;
03412
03413 DATA_PTR(self) = ec;
03414
03415 return self;
03416 }
03417
03418
03419
03420
03421
03422
03423
03424
03425
03426
03427
03428 static VALUE
03429 econv_inspect(VALUE self)
03430 {
03431 const char *cname = rb_obj_classname(self);
03432 rb_econv_t *ec;
03433
03434 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03435 if (!ec)
03436 return rb_sprintf("#<%s: uninitialized>", cname);
03437 else {
03438 const char *sname = ec->source_encoding_name;
03439 const char *dname = ec->destination_encoding_name;
03440 VALUE str;
03441 str = rb_sprintf("#<%s: ", cname);
03442 econv_description(sname, dname, ec->flags, str);
03443 rb_str_cat2(str, ">");
03444 return str;
03445 }
03446 }
03447
03448 static rb_econv_t *
03449 check_econv(VALUE self)
03450 {
03451 rb_econv_t *ec;
03452
03453 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03454 if (!ec) {
03455 rb_raise(rb_eTypeError, "uninitialized encoding converter");
03456 }
03457 return ec;
03458 }
03459
03460
03461
03462
03463
03464
03465
03466 static VALUE
03467 econv_source_encoding(VALUE self)
03468 {
03469 rb_econv_t *ec = check_econv(self);
03470 if (!ec->source_encoding)
03471 return Qnil;
03472 return rb_enc_from_encoding(ec->source_encoding);
03473 }
03474
03475
03476
03477
03478
03479
03480
03481 static VALUE
03482 econv_destination_encoding(VALUE self)
03483 {
03484 rb_econv_t *ec = check_econv(self);
03485 if (!ec->destination_encoding)
03486 return Qnil;
03487 return rb_enc_from_encoding(ec->destination_encoding);
03488 }
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512 static VALUE
03513 econv_convpath(VALUE self)
03514 {
03515 rb_econv_t *ec = check_econv(self);
03516 VALUE result;
03517 int i;
03518
03519 result = rb_ary_new();
03520 for (i = 0; i < ec->num_trans; i++) {
03521 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03522 VALUE v;
03523 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03524 v = rb_str_new_cstr(tr->dst_encoding);
03525 else
03526 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03527 rb_ary_push(result, v);
03528 }
03529 return result;
03530 }
03531
03532
03533
03534
03535
03536 static VALUE
03537 econv_equal(VALUE self, VALUE other)
03538 {
03539 rb_econv_t *ec1 = check_econv(self);
03540 rb_econv_t *ec2;
03541 int i;
03542
03543 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
03544 return Qnil;
03545 }
03546 ec2 = DATA_PTR(other);
03547 if (!ec2) return Qfalse;
03548 if (ec1->source_encoding_name != ec2->source_encoding_name &&
03549 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
03550 return Qfalse;
03551 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
03552 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
03553 return Qfalse;
03554 if (ec1->flags != ec2->flags) return Qfalse;
03555 if (ec1->replacement_enc != ec2->replacement_enc &&
03556 strcmp(ec1->replacement_enc, ec2->replacement_enc))
03557 return Qfalse;
03558 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
03559 if (ec1->replacement_str != ec2->replacement_str &&
03560 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
03561 return Qfalse;
03562
03563 if (ec1->num_trans != ec2->num_trans) return Qfalse;
03564 for (i = 0; i < ec1->num_trans; i++) {
03565 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
03566 return Qfalse;
03567 }
03568 return Qtrue;
03569 }
03570
03571 static VALUE
03572 econv_result_to_symbol(rb_econv_result_t res)
03573 {
03574 switch (res) {
03575 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
03576 case econv_incomplete_input: return sym_incomplete_input;
03577 case econv_undefined_conversion: return sym_undefined_conversion;
03578 case econv_destination_buffer_full: return sym_destination_buffer_full;
03579 case econv_source_buffer_empty: return sym_source_buffer_empty;
03580 case econv_finished: return sym_finished;
03581 case econv_after_output: return sym_after_output;
03582 default: return INT2NUM(res);
03583 }
03584 }
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601
03602
03603
03604
03605
03606
03607
03608
03609
03610
03611
03612
03613
03614
03615
03616
03617
03618
03619
03620
03621
03622
03623
03624
03625
03626
03627
03628
03629
03630
03631
03632
03633
03634
03635
03636
03637
03638
03639
03640
03641
03642
03643
03644
03645
03646
03647
03648
03649
03650
03651
03652
03653
03654
03655
03656
03657
03658
03659
03660
03661
03662
03663
03664
03665
03666
03667
03668
03669
03670
03671
03672
03673
03674
03675
03676
03677 static VALUE
03678 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
03679 {
03680 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
03681 rb_econv_t *ec = check_econv(self);
03682 rb_econv_result_t res;
03683 const unsigned char *ip, *is;
03684 unsigned char *op, *os;
03685 long output_byteoffset, output_bytesize;
03686 unsigned long output_byteend;
03687 int flags;
03688
03689 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
03690
03691 if (NIL_P(output_byteoffset_v))
03692 output_byteoffset = 0;
03693 else
03694 output_byteoffset = NUM2LONG(output_byteoffset_v);
03695
03696 if (NIL_P(output_bytesize_v))
03697 output_bytesize = 0;
03698 else
03699 output_bytesize = NUM2LONG(output_bytesize_v);
03700
03701 if (!NIL_P(flags_v)) {
03702 if (!NIL_P(opt)) {
03703 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..5)",
03704 argc + 1);
03705 }
03706 flags = NUM2INT(rb_to_int(flags_v));
03707 }
03708 else if (!NIL_P(opt)) {
03709 VALUE v;
03710 flags = 0;
03711 v = rb_hash_aref(opt, sym_partial_input);
03712 if (RTEST(v))
03713 flags |= ECONV_PARTIAL_INPUT;
03714 v = rb_hash_aref(opt, sym_after_output);
03715 if (RTEST(v))
03716 flags |= ECONV_AFTER_OUTPUT;
03717 }
03718 else {
03719 flags = 0;
03720 }
03721
03722 StringValue(output);
03723 if (!NIL_P(input))
03724 StringValue(input);
03725 rb_str_modify(output);
03726
03727 if (NIL_P(output_bytesize_v)) {
03728 output_bytesize = RSTRING_EMBED_LEN_MAX;
03729 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
03730 output_bytesize = RSTRING_LEN(input);
03731 }
03732
03733 retry:
03734
03735 if (NIL_P(output_byteoffset_v))
03736 output_byteoffset = RSTRING_LEN(output);
03737
03738 if (output_byteoffset < 0)
03739 rb_raise(rb_eArgError, "negative output_byteoffset");
03740
03741 if (RSTRING_LEN(output) < output_byteoffset)
03742 rb_raise(rb_eArgError, "output_byteoffset too big");
03743
03744 if (output_bytesize < 0)
03745 rb_raise(rb_eArgError, "negative output_bytesize");
03746
03747 output_byteend = (unsigned long)output_byteoffset +
03748 (unsigned long)output_bytesize;
03749
03750 if (output_byteend < (unsigned long)output_byteoffset ||
03751 LONG_MAX < output_byteend)
03752 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
03753
03754 if (rb_str_capacity(output) < output_byteend)
03755 rb_str_resize(output, output_byteend);
03756
03757 if (NIL_P(input)) {
03758 ip = is = NULL;
03759 }
03760 else {
03761 ip = (const unsigned char *)RSTRING_PTR(input);
03762 is = ip + RSTRING_LEN(input);
03763 }
03764
03765 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
03766 os = op + output_bytesize;
03767
03768 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
03769 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
03770 if (!NIL_P(input))
03771 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
03772
03773 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
03774 if (LONG_MAX / 2 < output_bytesize)
03775 rb_raise(rb_eArgError, "too long conversion result");
03776 output_bytesize *= 2;
03777 output_byteoffset_v = Qnil;
03778 goto retry;
03779 }
03780
03781 if (ec->destination_encoding) {
03782 rb_enc_associate(output, ec->destination_encoding);
03783 }
03784
03785 return econv_result_to_symbol(res);
03786 }
03787
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801
03802
03803
03804
03805
03806
03807
03808
03809
03810
03811
03812
03813
03814
03815
03816
03817
03818
03819
03820
03821
03822 static VALUE
03823 econv_convert(VALUE self, VALUE source_string)
03824 {
03825 VALUE ret, dst;
03826 VALUE av[5];
03827 int ac;
03828 rb_econv_t *ec = check_econv(self);
03829
03830 StringValue(source_string);
03831
03832 dst = rb_str_new(NULL, 0);
03833
03834 av[0] = rb_str_dup(source_string);
03835 av[1] = dst;
03836 av[2] = Qnil;
03837 av[3] = Qnil;
03838 av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03839 ac = 5;
03840
03841 ret = econv_primitive_convert(ac, av, self);
03842
03843 if (ret == sym_invalid_byte_sequence ||
03844 ret == sym_undefined_conversion ||
03845 ret == sym_incomplete_input) {
03846 VALUE exc = make_econv_exception(ec);
03847 rb_exc_raise(exc);
03848 }
03849
03850 if (ret == sym_finished) {
03851 rb_raise(rb_eArgError, "converter already finished");
03852 }
03853
03854 if (ret != sym_source_buffer_empty) {
03855 rb_bug("unexpected result of econv_primitive_convert");
03856 }
03857
03858 return dst;
03859 }
03860
03861
03862
03863
03864
03865
03866
03867
03868
03869
03870
03871
03872 static VALUE
03873 econv_finish(VALUE self)
03874 {
03875 VALUE ret, dst;
03876 VALUE av[5];
03877 int ac;
03878 rb_econv_t *ec = check_econv(self);
03879
03880 dst = rb_str_new(NULL, 0);
03881
03882 av[0] = Qnil;
03883 av[1] = dst;
03884 av[2] = Qnil;
03885 av[3] = Qnil;
03886 av[4] = INT2NUM(0);
03887 ac = 5;
03888
03889 ret = econv_primitive_convert(ac, av, self);
03890
03891 if (ret == sym_invalid_byte_sequence ||
03892 ret == sym_undefined_conversion ||
03893 ret == sym_incomplete_input) {
03894 VALUE exc = make_econv_exception(ec);
03895 rb_exc_raise(exc);
03896 }
03897
03898 if (ret != sym_finished) {
03899 rb_bug("unexpected result of econv_primitive_convert");
03900 }
03901
03902 return dst;
03903 }
03904
03905
03906
03907
03908
03909
03910
03911
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930
03931
03932
03933
03934
03935
03936
03937
03938
03939
03940
03941
03942
03943
03944
03945
03946
03947
03948
03949
03950
03951
03952
03953
03954
03955
03956
03957
03958
03959
03960
03961
03962
03963
03964
03965
03966
03967
03968
03969
03970
03971
03972
03973
03974
03975
03976
03977
03978
03979
03980 static VALUE
03981 econv_primitive_errinfo(VALUE self)
03982 {
03983 rb_econv_t *ec = check_econv(self);
03984
03985 VALUE ary;
03986
03987 ary = rb_ary_new2(5);
03988
03989 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03990 rb_ary_store(ary, 4, Qnil);
03991
03992 if (ec->last_error.source_encoding)
03993 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03994
03995 if (ec->last_error.destination_encoding)
03996 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03997
03998 if (ec->last_error.error_bytes_start) {
03999 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
04000 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
04001 }
04002
04003 return ary;
04004 }
04005
04006
04007
04008
04009
04010
04011
04012
04013
04014
04015
04016
04017
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030
04031
04032
04033
04034
04035
04036
04037
04038 static VALUE
04039 econv_insert_output(VALUE self, VALUE string)
04040 {
04041 const char *insert_enc;
04042
04043 int ret;
04044
04045 rb_econv_t *ec = check_econv(self);
04046
04047 StringValue(string);
04048 insert_enc = rb_econv_encoding_to_insert_output(ec);
04049 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
04050
04051 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
04052 if (ret == -1) {
04053 rb_raise(rb_eArgError, "too big string");
04054 }
04055
04056 return Qnil;
04057 }
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069
04070
04071
04072
04073
04074
04075
04076
04077
04078
04079
04080
04081
04082
04083 static VALUE
04084 econv_putback(int argc, VALUE *argv, VALUE self)
04085 {
04086 rb_econv_t *ec = check_econv(self);
04087 int n;
04088 int putbackable;
04089 VALUE str, max;
04090
04091 rb_scan_args(argc, argv, "01", &max);
04092
04093 if (NIL_P(max))
04094 n = rb_econv_putbackable(ec);
04095 else {
04096 n = NUM2INT(max);
04097 putbackable = rb_econv_putbackable(ec);
04098 if (putbackable < n)
04099 n = putbackable;
04100 }
04101
04102 str = rb_str_new(NULL, n);
04103 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
04104
04105 if (ec->source_encoding) {
04106 rb_enc_associate(str, ec->source_encoding);
04107 }
04108
04109 return str;
04110 }
04111
04112
04113
04114
04115
04116
04117
04118
04119
04120
04121
04122
04123
04124
04125
04126
04127
04128
04129
04130
04131
04132 static VALUE
04133 econv_last_error(VALUE self)
04134 {
04135 rb_econv_t *ec = check_econv(self);
04136 VALUE exc;
04137
04138 exc = make_econv_exception(ec);
04139 if (NIL_P(exc))
04140 return Qnil;
04141 return exc;
04142 }
04143
04144
04145
04146
04147
04148
04149
04150
04151
04152
04153
04154
04155
04156 static VALUE
04157 econv_get_replacement(VALUE self)
04158 {
04159 rb_econv_t *ec = check_econv(self);
04160 int ret;
04161 rb_encoding *enc;
04162
04163 ret = make_replacement(ec);
04164 if (ret == -1) {
04165 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04166 }
04167
04168 enc = rb_enc_find(ec->replacement_enc);
04169 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
04170 }
04171
04172
04173
04174
04175
04176
04177
04178
04179
04180
04181
04182 static VALUE
04183 econv_set_replacement(VALUE self, VALUE arg)
04184 {
04185 rb_econv_t *ec = check_econv(self);
04186 VALUE string = arg;
04187 int ret;
04188 rb_encoding *enc;
04189
04190 StringValue(string);
04191 enc = rb_enc_get(string);
04192
04193 ret = rb_econv_set_replacement(ec,
04194 (const unsigned char *)RSTRING_PTR(string),
04195 RSTRING_LEN(string),
04196 rb_enc_name(enc));
04197
04198 if (ret == -1) {
04199
04200 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04201 }
04202
04203 return arg;
04204 }
04205
04206 VALUE
04207 rb_econv_make_exception(rb_econv_t *ec)
04208 {
04209 return make_econv_exception(ec);
04210 }
04211
04212 void
04213 rb_econv_check_error(rb_econv_t *ec)
04214 {
04215 VALUE exc;
04216
04217 exc = make_econv_exception(ec);
04218 if (NIL_P(exc))
04219 return;
04220 rb_exc_raise(exc);
04221 }
04222
04223
04224
04225
04226
04227
04228
04229 static VALUE
04230 ecerr_source_encoding_name(VALUE self)
04231 {
04232 return rb_attr_get(self, rb_intern("source_encoding_name"));
04233 }
04234
04235
04236
04237
04238
04239
04240
04241
04242
04243
04244
04245
04246
04247
04248
04249
04250
04251
04252
04253
04254
04255 static VALUE
04256 ecerr_source_encoding(VALUE self)
04257 {
04258 return rb_attr_get(self, rb_intern("source_encoding"));
04259 }
04260
04261
04262
04263
04264
04265
04266
04267 static VALUE
04268 ecerr_destination_encoding_name(VALUE self)
04269 {
04270 return rb_attr_get(self, rb_intern("destination_encoding_name"));
04271 }
04272
04273
04274
04275
04276
04277
04278
04279 static VALUE
04280 ecerr_destination_encoding(VALUE self)
04281 {
04282 return rb_attr_get(self, rb_intern("destination_encoding"));
04283 }
04284
04285
04286
04287
04288
04289
04290
04291
04292
04293
04294
04295
04296
04297
04298
04299
04300 static VALUE
04301 ecerr_error_char(VALUE self)
04302 {
04303 return rb_attr_get(self, rb_intern("error_char"));
04304 }
04305
04306
04307
04308
04309
04310
04311
04312
04313
04314
04315
04316
04317
04318
04319
04320
04321 static VALUE
04322 ecerr_error_bytes(VALUE self)
04323 {
04324 return rb_attr_get(self, rb_intern("error_bytes"));
04325 }
04326
04327
04328
04329
04330
04331
04332
04333 static VALUE
04334 ecerr_readagain_bytes(VALUE self)
04335 {
04336 return rb_attr_get(self, rb_intern("readagain_bytes"));
04337 }
04338
04339
04340
04341
04342
04343
04344
04345
04346
04347
04348
04349
04350
04351
04352
04353
04354
04355
04356
04357
04358
04359
04360
04361
04362
04363 static VALUE
04364 ecerr_incomplete_input(VALUE self)
04365 {
04366 return rb_attr_get(self, rb_intern("incomplete_input"));
04367 }
04368
04369
04370
04371
04372
04373
04374
04375
04376
04377
04378
04379
04380
04381
04382
04383
04384
04385
04386
04387
04388
04389
04390
04391 void
04392 Init_transcode(void)
04393 {
04394 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
04395 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
04396 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
04397
04398 transcoder_table = st_init_strcasetable();
04399
04400 sym_invalid = ID2SYM(rb_intern("invalid"));
04401 sym_undef = ID2SYM(rb_intern("undef"));
04402 sym_replace = ID2SYM(rb_intern("replace"));
04403 sym_fallback = ID2SYM(rb_intern("fallback"));
04404 sym_aref = ID2SYM(rb_intern("[]"));
04405 sym_xml = ID2SYM(rb_intern("xml"));
04406 sym_text = ID2SYM(rb_intern("text"));
04407 sym_attr = ID2SYM(rb_intern("attr"));
04408
04409 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
04410 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
04411 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
04412 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
04413 sym_finished = ID2SYM(rb_intern("finished"));
04414 sym_after_output = ID2SYM(rb_intern("after_output"));
04415 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
04416 sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
04417 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
04418 sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
04419 sym_partial_input = ID2SYM(rb_intern("partial_input"));
04420
04421 #ifdef ENABLE_ECONV_NEWLINE_OPTION
04422 sym_newline = ID2SYM(rb_intern("newline"));
04423 sym_universal = ID2SYM(rb_intern("universal"));
04424 sym_crlf = ID2SYM(rb_intern("crlf"));
04425 sym_cr = ID2SYM(rb_intern("cr"));
04426 sym_lf = ID2SYM(rb_intern("lf"));
04427 #endif
04428
04429 rb_define_method(rb_cString, "encode", str_encode, -1);
04430 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
04431
04432 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
04433 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
04434 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
04435 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
04436 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
04437 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
04438 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
04439 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
04440 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
04441 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
04442 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
04443 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
04444 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
04445 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
04446 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
04447 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
04448 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
04449 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
04450 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
04451
04452 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
04453 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
04454 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
04455 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
04456 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
04457 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
04458 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
04459 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
04460 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
04461 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
04462 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
04463 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
04464 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
04465
04466 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
04467 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04468 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
04469 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
04470 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
04471
04472 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
04473 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04474 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
04475 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
04476 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
04477 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
04478 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
04479
04480 Init_newline();
04481 }
04482