類別 Encoding::Converter
Encoding
轉換類別。
常數
- AFTER_OUTPUT
-
在某些輸出完成後,但在使用所有輸入之前停止轉換。請參閱
primitive_convert
以取得範例。
- CRLF_NEWLINE_DECORATOR
-
用於將 LF 轉換為 CRLF 的裝飾器
- CR_NEWLINE_DECORATOR
-
用於將 LF 轉換為 CR 的裝飾器
- INVALID_MASK
-
無效位元組序列的遮罩
- INVALID_REPLACE
-
取代無效位元組序列
- LF_NEWLINE_DECORATOR
-
在寫入時用於將 CRLF 和 CR 轉換為 LF 的裝飾器
- PARTIAL_INPUT
-
表示來源可能是較大字串的一部分。請參閱
primitive_convert
以取得範例。
- UNDEF_HEX_CHARREF
-
使用 XML 十六進位字元參考取代在目標編碼中未定義的位元組序列。這對於 XML 轉換是有效的。
- UNDEF_MASK
-
來源編碼中有效字元的遮罩,但在目標編碼中沒有相關字元。
- UNDEF_REPLACE
-
取代在目標編碼中未定義的位元組序列。
- UNIVERSAL_NEWLINE_DECORATOR
-
用於將 CRLF 和 CR 轉換為 LF 的裝飾器
- XML_ATTR_CONTENT_DECORATOR
-
以 XML AttValue 方式進行跳脫
- XML_ATTR_QUOTE_DECORATOR
-
以 XML AttValue 方式進行跳脫
- XML_TEXT_DECORATOR
-
以 XML CharData 方式進行跳脫
公開類別方法
傳回對應的 ASCII 相容編碼。
如果參數是 ASCII 相容編碼,則傳回 nil。
“對應的 ASCII 相容編碼”是 ASCII 相容編碼,它可以表示與給定的 ASCII 不相容編碼完全相同的字元。因此,在兩個編碼之間進行轉換時,不會發生未定義的轉換錯誤。
Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * Encoding::Converter.asciicompat_encoding(arg)
 *
 * klass - receiver; not used by the body.
 * arg   - encoding designator, resolved to a name and encoding by enc_arg().
 *
 * Returns the Encoding object for the name reported by
 * rb_econv_asciicompat_encoding(), or nil when that lookup yields NULL.
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *src_name;
    const char *compat_name;
    rb_encoding *src_enc;

    enc_arg(&arg, &src_name, &src_enc);

    compat_name = rb_econv_asciicompat_encoding(src_name);
    if (!compat_name) {
        return Qnil;
    }

    return rb_enc_from_encoding(make_encoding(compat_name));
}
可能的選項元素
hash form: :invalid => nil # raise error on invalid byte sequence (default) :invalid => :replace # replace invalid byte sequence :undef => nil # raise error on undefined conversion (default) :undef => :replace # replace undefined conversion :replace => string # replacement string ("?" or "\uFFFD" if not specified) :newline => :universal # decorator for converting CRLF and CR to LF :newline => :lf # decorator for converting CRLF and CR to LF when writing :newline => :crlf # decorator for converting LF to CRLF :newline => :cr # decorator for converting LF to CR :universal_newline => true # decorator for converting CRLF and CR to LF :crlf_newline => true # decorator for converting LF to CRLF :cr_newline => true # decorator for converting LF to CR :lf_newline => true # decorator for converting CRLF and CR to LF when writing :xml => :text # escape as XML CharData. :xml => :attr # escape as XML AttValue integer form: Encoding::Converter::INVALID_REPLACE Encoding::Converter::UNDEF_REPLACE Encoding::Converter::UNDEF_HEX_CHARREF Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR Encoding::Converter::LF_NEWLINE_DECORATOR Encoding::Converter::CRLF_NEWLINE_DECORATOR Encoding::Converter::CR_NEWLINE_DECORATOR Encoding::Converter::XML_TEXT_DECORATOR Encoding::Converter::XML_ATTR_CONTENT_DECORATOR Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
Encoding::Converter.new
建立 Encoding::Converter
的執行個體。
source_encoding
和 destination_encoding
應為字串或 Encoding
物件。
opt 應為 nil、雜湊或整數。
convpath 應為陣列。convpath 可能包含
-
包含編碼或編碼名稱的二元素陣列,或
-
代表裝飾器名稱的字串。
Encoding::Converter.new
選擇性地採用選項。選項應為雜湊或整數。選項雜湊可以包含 :invalid => nil 等。選項整數應為常數的邏輯或,例如 Encoding::Converter::INVALID_REPLACE
等。
- :invalid => nil
-
針對無效的位元組序列引發錯誤。這是預設行為。
- :invalid => :replace
-
以替換字串替換無效的位元組序列。
- :undef => nil
-
如果
source_encoding
中的字元未定義在 destination_encoding 中,則引發錯誤。這是預設行為。
- :undef => :replace
-
以替換字串替換
destination_encoding
中未定義的字元。
- :replace => 字串
-
指定替換字串。如果未指定,則對 Unicode 編碼使用「\uFFFD」,對其他編碼使用「?」。
- :universal_newline => true
-
將 CRLF 和 CR 轉換為 LF。
- :crlf_newline => true
-
將 LF 轉換為 CRLF。
- :cr_newline => true
-
將 LF 轉換為 CR。
- :lf_newline => true
-
將 CRLF 和 CR 轉換為 LF(寫入時)。
- :xml => :text
-
作為 XML CharData 進行跳脫。此表單可用作 HTML 4.0 PCDATA。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
destination_encoding
中未定義的字元 -> 十六進位 CharRef,例如 &#xHH;
-
- :xml => :attr
-
作為 XML AttValue 進行跳脫。轉換結果會加上引號「…」標示。此表單可用作 HTML 4.0 屬性值。
-
‘&’ -> ‘&amp;’
-
‘<’ -> ‘&lt;’
-
‘>’ -> ‘&gt;’
-
‘"’ -> ‘&quot;’
-
destination_encoding
中未定義的字元 -> 十六進位 CharRef,例如 &#xHH;
-
範例
# UTF-16BE to UTF-8 ec = Encoding::Converter.new("UTF-16BE", "UTF-8") # Usually, decorators such as newline conversion are inserted last. ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], # "universal_newline"] # But, if the last encoding is ASCII incompatible, # decorators are inserted before the last conversion. ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) p ec.convpath #=> ["crlf_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] # Conversion path can be specified directly. ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) p ec.convpath #=> ["universal_newline", # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
/*
 * Encoding::Converter#initialize
 *
 * Two call shapes are handled:
 *   - exactly one argument that rb_check_array_type() accepts as an array:
 *     the converter is built by rb_econv_init_by_convpath(), with flags 0
 *     and no option hash;
 *   - anything else: the arguments are decoded by econv_args() and the
 *     converter is opened with rb_econv_open_opts().
 *
 * Raises TypeError ("already initialized") when self already carries
 * converter data, and the exception built by rb_econv_open_exc() when no
 * converter could be created.
 *
 * Returns self.
 */
static VALUE
econv_init(int argc, VALUE *argv, VALUE self)
{
    VALUE ecopts;
    /*
     * Start as nil: the convpath branch below never assigns these, yet the
     * RB_GC_GUARDs further down read them on every path.  Without the
     * initialisers that read would be of an indeterminate value.
     */
    VALUE snamev = Qnil, dnamev = Qnil;
    const char *sname, *dname;
    rb_encoding *senc, *denc;
    rb_econv_t *ec;
    int ecflags;
    VALUE convpath;

    if (rb_check_typeddata(self, &econv_data_type)) {
        rb_raise(rb_eTypeError, "already initialized");
    }

    if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
        ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
        ecflags = 0;
        ecopts = Qnil;
    }
    else {
        econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
        ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
    }

    if (!ec) {
        VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
        /* Keep the name objects referenced until sname/dname were last used. */
        RB_GC_GUARD(snamev);
        RB_GC_GUARD(dnamev);
        rb_exc_raise(exc);
    }

    if (!DECORATOR_P(sname, dname)) {
        /* Fill in a dummy encoding for any side that was not resolved. */
        if (!senc)
            senc = make_dummy_encoding(sname);
        if (!denc)
            denc = make_dummy_encoding(dname);
        RB_GC_GUARD(snamev);
        RB_GC_GUARD(dnamev);
    }

    ec->source_encoding = senc;
    ec->destination_encoding = denc;

    DATA_PTR(self) = ec;

    return self;
}
傳回轉換路徑。
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "universal_newline"] p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) or p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # "universal_newline", # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
/*
 * Encoding::Converter.search_convpath
 *
 * The arguments are decoded by econv_args(); transcode_search_path() then
 * fills the path through the search_convpath_i callback, and
 * decorate_convpath() is applied with the parsed flags.
 *
 * Returns the resulting path.  Raises the exception built by
 * rb_econv_open_exc() when no path was produced or when
 * decorate_convpath() reports -1.
 */
static VALUE
econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
{
    VALUE src_v, dst_v;
    VALUE opts;
    VALUE path = Qnil;
    const char *src_name, *dst_name;
    rb_encoding *src_enc, *dst_enc;
    int flags;

    econv_args(argc, argv, &src_v, &dst_v, &src_name, &dst_name, &src_enc, &dst_enc, &flags, &opts);

    transcode_search_path(src_name, dst_name, search_convpath_i, &path);

    /* Decoration is only attempted once a path exists (short-circuit). */
    if (NIL_P(path) || decorate_convpath(path, flags) == -1) {
        VALUE exc = rb_econv_open_exc(src_name, dst_name, flags);
        RB_GC_GUARD(src_v);
        RB_GC_GUARD(dst_v);
        rb_exc_raise(exc);
    }

    return path;
}
公開執行個體方法
/*
 * NULL-tolerant C string equality used by econv_equal().
 *
 * Identical pointers (including two NULLs) compare equal; a NULL against a
 * non-NULL string compares unequal instead of being handed to strcmp().
 */
static int
econv_cstr_equal_p(const char *a, const char *b)
{
    if (a == b)
        return 1;
    if (a == NULL || b == NULL)
        return 0;
    return strcmp(a, b) == 0;
}

/*
 * Encoding::Converter#==
 *
 * Returns nil when other is not converter typed data, false when other has
 * no converter attached or any compared field differs, true otherwise.
 *
 * Compared fields: source and destination encoding names, flags, the
 * replacement encoding name, replacement length and bytes, the number of
 * transcoders and each transcoder pointer in order.
 *
 * NOTE(review): the replacement fields look lazily initialised
 * (econv_get_replacement calls make_replacement() before reading them), so
 * one converter may hold NULL where the other holds a string.  The
 * comparisons below treat that as "not equal" rather than passing NULL to
 * strcmp()/memcmp() -- confirm this is the desired answer for that case.
 */
static VALUE
econv_equal(VALUE self, VALUE other)
{
    rb_econv_t *ec1 = check_econv(self);
    rb_econv_t *ec2;
    int i;

    if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
        return Qnil;
    }
    ec2 = DATA_PTR(other);
    if (!ec2)
        return Qfalse;

    if (!econv_cstr_equal_p(ec1->source_encoding_name, ec2->source_encoding_name))
        return Qfalse;
    if (!econv_cstr_equal_p(ec1->destination_encoding_name, ec2->destination_encoding_name))
        return Qfalse;
    if (ec1->flags != ec2->flags)
        return Qfalse;
    if (!econv_cstr_equal_p(ec1->replacement_enc, ec2->replacement_enc))
        return Qfalse;
    if (ec1->replacement_len != ec2->replacement_len)
        return Qfalse;

    /* Lengths are equal here; only look at the bytes when there are any. */
    if (ec1->replacement_str != ec2->replacement_str && ec1->replacement_len != 0) {
        if (!ec1->replacement_str || !ec2->replacement_str)
            return Qfalse;
        if (memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
            return Qfalse;
    }

    if (ec1->num_trans != ec2->num_trans)
        return Qfalse;
    for (i = 0; i < ec1->num_trans; i++) {
        if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
            return Qfalse;
    }
    return Qtrue;
}
轉換 source_string 並傳回 destination_string。
假設 source_string 是 source 的一部分。即:內部指定 :partial_input=>true。finish 方法應最後使用。
ec = Encoding::Converter.new("utf-8", "euc-jp") puts ec.convert("\u3042").dump #=> "\xA4\xA2" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("euc-jp", "utf-8") puts ec.convert("\xA4").dump #=> "" puts ec.convert("\xA2").dump #=> "\xE3\x81\x82" puts ec.finish.dump #=> "" ec = Encoding::Converter.new("utf-8", "iso-2022-jp") puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP") puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP") puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
如果發生轉換錯誤,會引發 Encoding::UndefinedConversionError
或 Encoding::InvalidByteSequenceError
。 Encoding::Converter#convert
沒有提供從這些例外狀況中復原或重新啟動的方法。當您想要處理這些轉換錯誤時,請使用 Encoding::Converter#primitive_convert
。
/*
 * Encoding::Converter#convert(source_string)
 *
 * Runs econv_primitive_convert() on a duplicate of source_string with a
 * fresh, empty destination string and the ECONV_PARTIAL_INPUT flag.
 *
 * Returns the destination string when the primitive reports
 * :source_buffer_empty.
 *
 * Raises the exception from make_econv_exception() for
 * :invalid_byte_sequence, :undefined_conversion and :incomplete_input;
 * ArgumentError ("converter already finished") for :finished; any other
 * result is reported through rb_bug().
 */
static VALUE
econv_convert(VALUE self, VALUE source_string)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE status;
    VALUE out;

    StringValue(source_string);

    out = rb_str_new(NULL, 0);

    args[0] = rb_str_dup(source_string);
    args[1] = out;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2NUM(ECONV_PARTIAL_INPUT);

    status = econv_primitive_convert(5, args, self);

    if (status != sym_source_buffer_empty) {
        if (status == sym_invalid_byte_sequence ||
            status == sym_undefined_conversion ||
            status == sym_incomplete_input) {
            VALUE exc = make_econv_exception(ec);
            rb_exc_raise(exc);
        }
        if (status == sym_finished) {
            rb_raise(rb_eArgError, "converter already finished");
        }
        rb_bug("unexpected result of econv_primitive_convert");
    }

    return out;
}
傳回 ec 的轉換路徑。
結果是轉換的陣列。
ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) p ec.convpath #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], # "crlf_newline"]
陣列的每個元素都是編碼或字串的配對。配對表示編碼轉換。字串表示裝飾器。
在上述範例中,[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] 表示從 ISO-8859-1 到 UTF-8 的轉換,"crlf_newline" 表示將 LF 轉換為 CRLF 的換行裝飾器。
static VALUE
econv_convpath(VALUE self)
{
    /*
     * Encoding::Converter#convpath
     *
     * Builds a new array with one entry per transcoder in ec->elems, in
     * order: a string holding the transcoder's dst_encoding when
     * DECORATOR_P() holds for it, otherwise a two-element pair of the
     * objects make_encobj() yields for its src_encoding and dst_encoding.
     */
    rb_econv_t *ec = check_econv(self);
    VALUE path = rb_ary_new();
    int idx = 0;

    while (idx < ec->num_trans) {
        const rb_transcoder *tr = ec->elems[idx].tc->transcoder;
        VALUE entry;

        if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) {
            entry = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
        }
        else {
            entry = rb_str_new_cstr(tr->dst_encoding);
        }
        rb_ary_push(path, entry);
        idx++;
    }

    return path;
}
以 Encoding
物件的形式傳回目標編碼。
/*
 * Encoding::Converter#destination_encoding
 *
 * Returns whatever econv_get_encoding() produces for the converter's
 * destination_encoding field.
 */
static VALUE
econv_destination_encoding(VALUE self)
{
    rb_econv_t *conv = check_econv(self);
    rb_encoding *dest = conv->destination_encoding;

    return econv_get_encoding(dest);
}
完成轉換器。它會傳回已轉換字串的最後部分。
ec = Encoding::Converter.new("utf-8", "iso-2022-jp") p ec.convert("\u3042") #=> "\e$B$\"" p ec.finish #=> "\e(B"
/*
 * Encoding::Converter#finish
 *
 * Runs econv_primitive_convert() with a nil source, a fresh empty
 * destination string and flags 0.
 *
 * Returns the destination string when the primitive reports :finished.
 *
 * Raises the exception from make_econv_exception() for
 * :invalid_byte_sequence, :undefined_conversion and :incomplete_input; any
 * other non-:finished result is reported through rb_bug().
 */
static VALUE
econv_finish(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE status;
    VALUE tail;

    tail = rb_str_new(NULL, 0);

    args[0] = Qnil;
    args[1] = tail;
    args[2] = Qnil;
    args[3] = Qnil;
    args[4] = INT2FIX(0);

    status = econv_primitive_convert(5, args, self);

    if (status != sym_finished) {
        if (status == sym_invalid_byte_sequence ||
            status == sym_undefined_conversion ||
            status == sym_incomplete_input) {
            VALUE exc = make_econv_exception(ec);
            rb_exc_raise(exc);
        }
        rb_bug("unexpected result of econv_primitive_convert");
    }

    return tail;
}
將字串插入編碼轉換器中。字串會轉換為目標編碼,並在後續轉換中輸出。
如果目標編碼有狀態,字串會根據狀態轉換,並更新狀態。
此方法應僅在發生轉換錯誤時使用。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") src = "HIRAGANA LETTER A is \u{3042}." dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."] ec.insert_output("<err>") p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""] ec = Encoding::Converter.new("utf-8", "iso-2022-jp") src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp dst = "" p ec.primitive_convert(src, dst) #=> :undefined_conversion puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"] ec.insert_output "?" # state change required to output "?". p ec.primitive_convert(src, dst) #=> :finished puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
/*
 * Encoding::Converter#insert_output(string)
 *
 * Re-encodes string into the encoding named by
 * rb_econv_encoding_to_insert_output() and hands the resulting bytes to
 * rb_econv_insert_output().
 *
 * Returns nil.  Raises ArgumentError ("too big string") when
 * rb_econv_insert_output() returns -1.
 */
static VALUE
econv_insert_output(VALUE self, VALUE string)
{
    rb_econv_t *ec = check_econv(self);
    const char *enc_name;
    VALUE enc_obj;

    StringValue(string);

    enc_name = rb_econv_encoding_to_insert_output(ec);
    enc_obj = rb_enc_from_encoding(rb_enc_find(enc_name));
    string = rb_str_encode(string, enc_obj, 0, Qnil);

    if (rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), enc_name) == -1) {
        rb_raise(rb_eArgError, "too big string");
    }

    return Qnil;
}
傳回 ec 的可列印版本
ec = Encoding::Converter.new("iso-8859-1", "utf-8") puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
/*
 * Encoding::Converter#inspect
 *
 * Returns "#<ClassName: uninitialized>" when self has no converter
 * attached; otherwise "#<ClassName: " followed by the text that
 * econv_description() appends for the source name, destination name and
 * flags, closed with ">".
 */
static VALUE
econv_inspect(VALUE self)
{
    const char *class_name = rb_obj_classname(self);
    rb_econv_t *ec;
    VALUE text;

    TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);

    if (!ec) {
        return rb_sprintf("#<%s: uninitialized>", class_name);
    }

    text = rb_sprintf("#<%s: ", class_name);
    econv_description(ec->source_encoding_name, ec->destination_encoding_name, ec->flags, text);
    rb_str_cat2(text, ">");

    return text;
}
傳回最後一次轉換的例外狀況物件。如果最後一次轉換未產生錯誤,則傳回 nil。
「錯誤」表示 Encoding::InvalidByteSequenceError
和 Encoding::UndefinedConversionError
適用於 Encoding::Converter#convert
,而 :invalid_byte_sequence、:incomplete_input 和 :undefined_conversion 適用於 Encoding::Converter#primitive_convert
。
ec = Encoding::Converter.new("utf-8", "iso-8859-1") p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full p ec.last_error #=> nil
/*
 * Encoding::Converter#last_error
 *
 * Returns the value produced by make_econv_exception() for this converter;
 * a nil result from that call is passed through unchanged as nil.
 */
static VALUE
econv_last_error(VALUE self)
{
    rb_econv_t *conv = check_econv(self);

    return make_econv_exception(conv);
}
可能的 opt 元素
hash form: :partial_input => true # source buffer may be part of larger source :after_output => true # stop conversion after output before input integer form: Encoding::Converter::PARTIAL_INPUT Encoding::Converter::AFTER_OUTPUT
可能的結果
:invalid_byte_sequence :incomplete_input :undefined_conversion :after_output :destination_buffer_full :source_buffer_empty :finished
primitive_convert
將 source_buffer 轉換成 destination_buffer。
source_buffer 應為字串或 nil。nil 表示空字串。
destination_buffer 應為字串。
destination_byteoffset 應為整數或 nil。nil 表示 destination_buffer 的結尾。如果省略,則假設為 nil。
destination_bytesize 應為整數或 nil。nil 表示無限制。如果省略,則假設為 nil。
opt 應為 nil、雜湊或整數。nil 表示沒有旗標。如果省略,則假設為 nil。
primitive_convert
將 source_buffer 的內容從開頭轉換,並將結果儲存在 destination_buffer 中。
destination_byteoffset 和 destination_bytesize 指定轉換結果儲存的區域。destination_byteoffset 以位元組為單位,指定 destination_buffer 中的起始位置。如果 destination_byteoffset 為 nil,則 destination_buffer.bytesize 會用於附加結果。destination_bytesize 指定最大位元組數。如果 destination_bytesize 為 nil,則 destination 大小無限制。轉換後,destination_buffer 會調整大小為 destination_byteoffset + 實際產生的位元組數。destination_buffer 的編碼也會設定為 destination_encoding。
primitive_convert
會捨棄 source_buffer 已轉換的部分。已捨棄的部分會轉換成 destination_buffer 或緩衝在 Encoding::Converter
物件中。
primitive_convert
會在符合下列條件之一時停止轉換。
-
在來源緩衝區中找到無效的位元組序列 (:invalid_byte_sequence)。
primitive_errinfo
和 last_error
方法會傳回錯誤的詳細資料。
-
來源緩衝區意外結束 (:incomplete_input)。僅在未指定 :partial_input 時發生。
primitive_errinfo
和 last_error
方法會傳回錯誤的詳細資料。
-
字元無法在輸出編碼中表示 (:undefined_conversion)。
primitive_errinfo
和 last_error
方法會傳回錯誤的詳細資料。
-
在產生一些輸出後,在輸入完成之前 (:after_output) 僅在指定 :after_output 時發生。
-
目的地緩衝區已滿 (:destination_buffer_full) 僅在 destination_bytesize 非 nil 時發生。
-
來源緩衝區為空 (:source_buffer_empty) 僅在指定 :partial_input 時發生。
-
轉換已完成 (:finished)
範例
ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 100) p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] ec = Encoding::Converter.new("UTF-8", "UTF-16BE") ret = ec.primitive_convert(src="pi", dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] ret = ec.primitive_convert(src, dst="", nil, 1) p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * Encoding::Converter#primitive_convert
 *
 * Arguments (parsed with the rb_scan_args spec "23:"):
 *   input                - source string or nil
 *   output               - destination string
 *   output_byteoffset_v  - start offset in output, or nil for "end of output"
 *   output_bytesize_v    - size of the output window, or nil for "grow as needed"
 *   flags_v / opt        - integer flags, or an option hash read for
 *                          :partial_input and :after_output (not both)
 *
 * Converts with rb_econv_convert(), truncates output to what was written,
 * drops the consumed bytes from input, and returns the result as a symbol
 * via econv_result_to_symbol().
 *
 * Raises ArgumentError for a negative or too-large offset/size, and an
 * arity error when both integer flags and an option hash are given.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0; /* placeholder; set from RSTRING_LEN(output) after the retry label */
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    if (NIL_P(output_bytesize_v))
        output_bytesize = 0; /* placeholder; replaced below once output has been validated */
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    /* Flags come either from the positional integer or from the option hash. */
    if (!NIL_P(flags_v)) {
        if (!NIL_P(opt)) {
            /* Both forms supplied: report it as one argument too many. */
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    /* No explicit size: start from output's capacity, but at least the input length. */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = rb_str_capacity(output);

        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* A nil offset means "append": recomputed on every pass because output grows. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* Sum in unsigned arithmetic so wrap-around can be detected explicitly. */
    output_byteend = (unsigned long)output_byteoffset +
                     (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    /* Make sure the whole [offset, offset+size) window is backed by output's buffer. */
    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    /* op was advanced past the bytes written; ip past the bytes consumed. */
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input)) {
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
    }

    /*
     * With no caller-supplied size, a full destination is not an answer to
     * return: double the window, switch to append mode and convert again.
     */
    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}
primitive_errinfo
會傳回有關最後一個錯誤的重要資訊,作為 5 個元素的陣列
[result, enc1, enc2, error_bytes, readagain_bytes]
result 是 primitive_convert 的最後結果。
其他元素僅在 result 為 :invalid_byte_sequence、:incomplete_input 或 :undefined_conversion 時才有意義。
enc1 和 enc2 會將轉換步驟表示為一對字串。例如,從 EUC-JP 轉換至 ISO-8859-1 的轉換器會將字串轉換如下:EUC-JP -> UTF-8 -> ISO-8859-1。因此 [enc1, enc2] 會是 ["EUC-JP", "UTF-8"] 或 ["UTF-8", "ISO-8859-1"]。
error_bytes 和 readagain_bytes 會指出導致錯誤的位元組序列。error_bytes 是已捨棄的部分。readagain_bytes 是已緩衝的部分,會在下次轉換時再次讀取。
範例
# \xff is invalid as EUC-JP. ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") ec.primitive_convert(src="\xff", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. # Since this error is occur in UTF-8 to ISO-8859-1 conversion, # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) p ec.primitive_errinfo #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""] # partial character is invalid ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10) p ec.primitive_errinfo #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""] # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by # partial characters. ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) p ec.primitive_errinfo #=> [:source_buffer_empty, nil, nil, nil, nil] # \xd8\x00\x00@ is invalid as UTF-16BE because # no low surrogate after high surrogate (\xd8\x00). # It is detected by 3rd byte (\00) which is part of next character. # So the high surrogate (\xd8\x00) is discarded and # the 3rd byte is read again later. # Since the byte is buffered in ec, it is dropped from src. ec = Encoding::Converter.new("UTF-16BE", "UTF-8") ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"] p src #=> "@" # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. # The problem is detected by 4th byte. ec = Encoding::Converter.new("UTF-16LE", "UTF-8") ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"] p src #=> ""
/*
 * Encoding::Converter#primitive_errinfo
 *
 * Returns a five-slot array built from ec->last_error:
 *   [0] result symbol from econv_result_to_symbol()
 *   [1] source encoding name string, when the field is set
 *   [2] destination encoding name string, when the field is set
 *   [3] error bytes, when error_bytes_start is set
 *   [4] the readagain_len bytes that follow the error bytes, when
 *       error_bytes_start is set; nil otherwise
 * Slots that are never stored stay nil.
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE info = rb_ary_new2(5);

    rb_ary_store(info, 0, econv_result_to_symbol(ec->last_error.result));
    /* Storing slot 4 up front fixes the array length at five. */
    rb_ary_store(info, 4, Qnil);

    if (ec->last_error.source_encoding) {
        rb_ary_store(info, 1, rb_str_new2(ec->last_error.source_encoding));
    }

    if (ec->last_error.destination_encoding) {
        rb_ary_store(info, 2, rb_str_new2(ec->last_error.destination_encoding));
    }

    if (ec->last_error.error_bytes_start) {
        const char *bytes = (const char *)ec->last_error.error_bytes_start;

        rb_ary_store(info, 3, rb_str_new(bytes, ec->last_error.error_bytes_len));
        rb_ary_store(info, 4, rb_str_new(bytes + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
    }

    return info;
}
放回將轉換的位元組。
這些位元組是由 invalid_byte_sequence 錯誤造成的。發生 invalid_byte_sequence 錯誤時,有些位元組會被捨棄,有些位元組會被緩衝起來,以便稍後轉換。後者(已緩衝的位元組)可以放回。這可以用 Encoding::InvalidByteSequenceError#readagain_bytes
和 Encoding::Converter#primitive_errinfo
來觀察。
ec = Encoding::Converter.new("utf-16le", "iso-8859-1") src = "\x00\xd8\x61\x00" dst = "" p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"] p ec.putback #=> "a\x00" p ec.putback #=> "" # no more bytes to put back
/*
 * Encoding::Converter#putback([max])
 *
 * With no argument or nil, takes rb_econv_putbackable() bytes; with an
 * integer, takes at most that many.  The bytes are written by
 * rb_econv_putback() into a new string, which is tagged with the
 * converter's source encoding when one is set.
 *
 * Returns the new string.
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE limit = Qnil;
    VALUE bytes;
    int count;

    /* rb_check_arity() validates 0..1 arguments and yields argc. */
    if (rb_check_arity(argc, 0, 1) != 0) {
        limit = argv[0];
    }

    if (NIL_P(limit)) {
        count = rb_econv_putbackable(ec);
    }
    else {
        int available;

        count = NUM2INT(limit);
        available = rb_econv_putbackable(ec);
        if (available < count) {
            count = available;
        }
    }

    bytes = rb_str_new(NULL, count);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(bytes), count);

    if (ec->source_encoding) {
        rb_enc_associate(bytes, ec->source_encoding);
    }

    return bytes;
}
傳回替換字串。
ec = Encoding::Converter.new("euc-jp", "us-ascii") p ec.replacement #=> "?" ec = Encoding::Converter.new("euc-jp", "utf-8") p ec.replacement #=> "\uFFFD"
/*
 * Encoding::Converter#replacement
 *
 * Calls make_replacement() first, then returns a new string holding the
 * converter's replacement bytes, tagged with the encoding that
 * rb_enc_find() yields for ec->replacement_enc.
 *
 * Raises UndefinedConversionError ("replacement character setup failed")
 * when make_replacement() returns -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    rb_encoding *repl_enc;

    if (make_replacement(ec) == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    repl_enc = rb_enc_find(ec->replacement_enc);

    return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, repl_enc);
}
設定替換字串。
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) ec.replacement = "<undef>" p ec.convert("a \u3042 b") #=> "a <undef> b"
/*
 * Encoding::Converter#replacement=(arg)
 *
 * Coerces arg to a string and passes its bytes, length and encoding name
 * to rb_econv_set_replacement().
 *
 * Returns arg as given.  Raises UndefinedConversionError ("replacement
 * character setup failed") when rb_econv_set_replacement() returns -1.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE repl = arg;
    rb_encoding *repl_enc;
    int status;

    StringValue(repl);
    repl_enc = rb_enc_get(repl);

    status = rb_econv_set_replacement(ec,
                                      (const unsigned char *)RSTRING_PTR(repl),
                                      RSTRING_LEN(repl),
                                      rb_enc_name(repl_enc));
    if (status == -1) {
        /* Open question carried over from the original: should this be rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}
傳回來源編碼,作為 Encoding
物件。
/*
 * Encoding::Converter#source_encoding
 *
 * Returns whatever econv_get_encoding() produces for the converter's
 * source_encoding field.
 */
static VALUE
econv_source_encoding(VALUE self)
{
    rb_econv_t *conv = check_econv(self);
    rb_encoding *src = conv->source_encoding;

    return econv_get_encoding(src);
}