Feature #11094 » 0001-enc-utf_8.c-pack.c-limit-UTF-8.patch
| enc/utf_8.c | ||
|---|---|---|
| /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ | ||
| #define INVALID_CODE_FE   0xfffffffe | ||
| #define INVALID_CODE_FF   0xffffffff | ||
| #define VALID_CODE_LIMIT  0x7fffffff | ||
| #endif | ||
| #define VALID_CODE_LIMIT  0x0010ffff | ||
| #define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80) | ||
| ... | ... | |
|   if      ((code & 0xffffff80) == 0) return 1; | ||
|   else if ((code & 0xfffff800) == 0) return 2; | ||
|   else if ((code & 0xffff0000) == 0) return 3; | ||
|   else if ((code & 0xffe00000) == 0) return 4; | ||
|   else if ((code & 0xfc000000) == 0) return 5; | ||
|   else if ((code & 0x80000000) == 0) return 6; | ||
|   else if (code <= VALID_CODE_LIMIT) return 4; | ||
| #ifdef USE_INVALID_CODE_SCHEME | ||
|   else if (code == INVALID_CODE_FE) return 1; | ||
|   else if (code == INVALID_CODE_FF) return 1; | ||
| ... | ... | |
|       *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); | ||
|       *p++ = UTF8_TRAILS(code, 6); | ||
|     } | ||
|     else if ((code & 0xffe00000) == 0) { | ||
|     else if (code <= VALID_CODE_LIMIT) { | ||
|       *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); | ||
|       *p++ = UTF8_TRAILS(code, 12); | ||
|       *p++ = UTF8_TRAILS(code,  6); | ||
|     } | ||
|     else if ((code & 0xfc000000) == 0) { | ||
|       *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); | ||
|       *p++ = UTF8_TRAILS(code, 18); | ||
|       *p++ = UTF8_TRAILS(code, 12); | ||
|       *p++ = UTF8_TRAILS(code,  6); | ||
|     } | ||
|     else if ((code & 0x80000000) == 0) { | ||
|       *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); | ||
|       *p++ = UTF8_TRAILS(code, 24); | ||
|       *p++ = UTF8_TRAILS(code, 18); | ||
|       *p++ = UTF8_TRAILS(code, 12); | ||
|       *p++ = UTF8_TRAILS(code,  6); | ||
|     } | ||
| #ifdef USE_INVALID_CODE_SCHEME | ||
|     else if (code == INVALID_CODE_FE) { | ||
|       *p = 0xfe; | ||
| pack.c | ||
|---|---|---|
| 	buf[2] = castchar((uv&0x3f)|0x80); | ||
| 	return 3; | ||
|     } | ||
|     if (uv <= 0x1fffff) { | ||
|     if (uv <= 0x10ffff) { | ||
| 	buf[0] = castchar(((uv>>18)&0xff)|0xf0); | ||
| 	buf[1] = castchar(((uv>>12)&0x3f)|0x80); | ||
| 	buf[2] = castchar(((uv>>6)&0x3f)|0x80); | ||
| 	buf[3] = castchar((uv&0x3f)|0x80); | ||
| 	return 4; | ||
|     } | ||
|     if (uv <= 0x3ffffff) { | ||
| 	buf[0] = castchar(((uv>>24)&0xff)|0xf8); | ||
| 	buf[1] = castchar(((uv>>18)&0x3f)|0x80); | ||
| 	buf[2] = castchar(((uv>>12)&0x3f)|0x80); | ||
| 	buf[3] = castchar(((uv>>6)&0x3f)|0x80); | ||
| 	buf[4] = castchar((uv&0x3f)|0x80); | ||
| 	return 5; | ||
|     } | ||
|     if (uv <= 0x7fffffff) { | ||
| 	buf[0] = castchar(((uv>>30)&0xff)|0xfc); | ||
| 	buf[1] = castchar(((uv>>24)&0x3f)|0x80); | ||
| 	buf[2] = castchar(((uv>>18)&0x3f)|0x80); | ||
| 	buf[3] = castchar(((uv>>12)&0x3f)|0x80); | ||
| 	buf[4] = castchar(((uv>>6)&0x3f)|0x80); | ||
| 	buf[5] = castchar((uv&0x3f)|0x80); | ||
| 	return 6; | ||
|     } | ||
|     rb_raise(rb_eRangeError, "pack(U): value out of range"); | ||
|     UNREACHABLE; | ||
| ... | ... | |
|     0x80,			/* 2 */ | ||
|     0x800,			/* 3 */ | ||
|     0x10000,			/* 4 */ | ||
|     0x200000,			/* 5 */ | ||
|     0x4000000,			/* 6 */ | ||
|     0x80000000,			/* 7 */ | ||
| }; | ||
| static unsigned long | ||
| ... | ... | |
|     if      (!(uv & 0x20)) { n = 2; uv &= 0x1f; } | ||
|     else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; } | ||
|     else if (!(uv & 0x08)) { n = 4; uv &= 0x07; } | ||
|     else if (!(uv & 0x04)) { n = 5; uv &= 0x03; } | ||
|     else if (!(uv & 0x02)) { n = 6; uv &= 0x01; } | ||
|     else { | ||
| 	*lenp = 1; | ||
| 	rb_raise(rb_eArgError, "malformed UTF-8 character"); | ||
| ... | ... | |
|     if (uv < utf8_limits[n]) { | ||
| 	rb_raise(rb_eArgError, "redundant UTF-8 sequence"); | ||
|     } | ||
|     else if (uv > 0x10ffff) { | ||
| 	rb_raise(rb_eRangeError, "unpack(U): value out of range"); | ||
|     } | ||
|     return uv; | ||
| } | ||
| test/ruby/test_integer_comb.rb | ||
|---|---|---|
|   def test_pack_utf8 | ||
|     template = "U" | ||
|     VS.reverse_each {|a| | ||
|       if a < 0 || 0x7fffffff < a | ||
|       if a < 0 || 0x10ffff < a | ||
|         assert_raise(RangeError) { [a].pack(template) } | ||
|       else | ||
|         s = [a].pack(template) | ||
| test/ruby/test_pack.rb | ||
|---|---|---|
|     assert_raise(RangeError) { [-0x40000000].pack("U") } | ||
|     assert_raise(RangeError) { [-1].pack("U") } | ||
|     assert_equal "\000", [0].pack("U") | ||
|     assert_equal "\374\277\277\277\277\277".force_encoding(Encoding::UTF_8), [0x3fffffff].pack("U") | ||
|     assert_equal "\375\200\200\200\200\200".force_encoding(Encoding::UTF_8), [0x40000000].pack("U") | ||
|     assert_equal "\375\277\277\277\277\277".force_encoding(Encoding::UTF_8), [0x7fffffff].pack("U") | ||
|     assert_equal "\364\217\277\277".force_encoding(Encoding::UTF_8), [0x10ffff].pack("U") | ||
|     assert_raise(RangeError) { [0x3fffffff].pack("U") } | ||
|     assert_raise(RangeError) { [0x40000000].pack("U") } | ||
|     assert_raise(RangeError) { [0x110000].pack("U") } | ||
|     assert_raise(RangeError) { [0x80000000].pack("U") } | ||
|     assert_raise(RangeError) { [0x100000000].pack("U") } | ||
|   end | ||
| ... | ... | |
|     assert_equal([0x80], [0x80].pack("U").unpack("U")) | ||
|     assert_equal([0x800], [0x800].pack("U").unpack("U")) | ||
|     assert_equal([0x10000], [0x10000].pack("U").unpack("U")) | ||
|     assert_equal([0x400000], [0x400000].pack("U").unpack("U")) | ||
|     assert_equal([0x10ffff], [0x10ffff].pack("U").unpack("U")) | ||
|     assert_raise(ArgumentError) { "\x80".unpack("U") } | ||
|     assert_raise(ArgumentError) { "\xff".unpack("U") } | ||
|     assert_raise(ArgumentError) { "\xfc\x00".unpack("U") } | ||
|     assert_raise(ArgumentError) { "\xc0\xc0".unpack("U") } | ||
|     assert_raise(ArgumentError) { "\xe0\x80\x80".unpack("U") } | ||
|     assert_raise(RangeError) { "\xf4\x90\x80\x80".unpack("U") } | ||
|   end | ||
|   def test_pack_unpack_u | ||