From 2435a6b1fe5acdaf2b830f1037949160e627ca09 Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@ruby-lang.org>
Date: Sat, 25 Apr 2015 13:39:17 +0900
Subject: [PATCH] enc/utf_8.c, pack.c: limit UTF-8 to U+10FFFF

* enc/utf_8.c (code_to_mbclen, code_to_mbc): reject values larger
  than the maximum Unicode code point (U+10FFFF).  [Feature #11094]

* pack.c (rb_uv_to_utf8, utf8_to_uv): ditto.
---
 enc/utf_8.c                    | 21 +++------------------
 pack.c                         | 27 ++++-----------------------
 test/ruby/test_integer_comb.rb |  2 +-
 test/ruby/test_pack.rb         | 10 ++++++----
 4 files changed, 14 insertions(+), 46 deletions(-)

diff --git a/enc/utf_8.c b/enc/utf_8.c
index b8f38e9..8ab8792 100644
--- a/enc/utf_8.c
+++ b/enc/utf_8.c
@@ -35,8 +35,8 @@
 /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
 #define INVALID_CODE_FE   0xfffffffe
 #define INVALID_CODE_FF   0xffffffff
-#define VALID_CODE_LIMIT  0x7fffffff
 #endif
+#define VALID_CODE_LIMIT  0x0010ffff
 
 #define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
 
@@ -297,9 +297,7 @@ code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
   if      ((code & 0xffffff80) == 0) return 1;
   else if ((code & 0xfffff800) == 0) return 2;
   else if ((code & 0xffff0000) == 0) return 3;
-  else if ((code & 0xffe00000) == 0) return 4;
-  else if ((code & 0xfc000000) == 0) return 5;
-  else if ((code & 0x80000000) == 0) return 6;
+  else if (code <= VALID_CODE_LIMIT) return 4;
 #ifdef USE_INVALID_CODE_SCHEME
   else if (code == INVALID_CODE_FE) return 1;
   else if (code == INVALID_CODE_FF) return 1;
@@ -328,24 +326,11 @@ code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
       *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
       *p++ = UTF8_TRAILS(code, 6);
     }
-    else if ((code & 0xffe00000) == 0) {
+    else if (code <= VALID_CODE_LIMIT) {
       *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
       *p++ = UTF8_TRAILS(code, 12);
       *p++ = UTF8_TRAILS(code,  6);
     }
-    else if ((code & 0xfc000000) == 0) {
-      *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
-      *p++ = UTF8_TRAILS(code, 18);
-      *p++ = UTF8_TRAILS(code, 12);
-      *p++ = UTF8_TRAILS(code,  6);
-    }
-    else if ((code & 0x80000000) == 0) {
-      *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
-      *p++ = UTF8_TRAILS(code, 24);
-      *p++ = UTF8_TRAILS(code, 18);
-      *p++ = UTF8_TRAILS(code, 12);
-      *p++ = UTF8_TRAILS(code,  6);
-    }
 #ifdef USE_INVALID_CODE_SCHEME
     else if (code == INVALID_CODE_FE) {
       *p = 0xfe;
diff --git a/pack.c b/pack.c
index 73974b7..e7fac81 100644
--- a/pack.c
+++ b/pack.c
@@ -1919,30 +1919,13 @@ rb_uv_to_utf8(char buf[6], unsigned long uv)
 	buf[2] = castchar((uv&0x3f)|0x80);
 	return 3;
     }
-    if (uv <= 0x1fffff) {
+    if (uv <= 0x10ffff) {
 	buf[0] = castchar(((uv>>18)&0xff)|0xf0);
 	buf[1] = castchar(((uv>>12)&0x3f)|0x80);
 	buf[2] = castchar(((uv>>6)&0x3f)|0x80);
 	buf[3] = castchar((uv&0x3f)|0x80);
 	return 4;
     }
-    if (uv <= 0x3ffffff) {
-	buf[0] = castchar(((uv>>24)&0xff)|0xf8);
-	buf[1] = castchar(((uv>>18)&0x3f)|0x80);
-	buf[2] = castchar(((uv>>12)&0x3f)|0x80);
-	buf[3] = castchar(((uv>>6)&0x3f)|0x80);
-	buf[4] = castchar((uv&0x3f)|0x80);
-	return 5;
-    }
-    if (uv <= 0x7fffffff) {
-	buf[0] = castchar(((uv>>30)&0xff)|0xfc);
-	buf[1] = castchar(((uv>>24)&0x3f)|0x80);
-	buf[2] = castchar(((uv>>18)&0x3f)|0x80);
-	buf[3] = castchar(((uv>>12)&0x3f)|0x80);
-	buf[4] = castchar(((uv>>6)&0x3f)|0x80);
-	buf[5] = castchar((uv&0x3f)|0x80);
-	return 6;
-    }
     rb_raise(rb_eRangeError, "pack(U): value out of range");
 
     UNREACHABLE;
@@ -1953,9 +1936,6 @@ static const unsigned long utf8_limits[] = {
     0x80,			/* 2 */
     0x800,			/* 3 */
     0x10000,			/* 4 */
-    0x200000,			/* 5 */
-    0x4000000,			/* 6 */
-    0x80000000,			/* 7 */
 };
 
 static unsigned long
@@ -1977,8 +1957,6 @@ utf8_to_uv(const char *p, long *lenp)
     if      (!(uv & 0x20)) { n = 2; uv &= 0x1f; }
     else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; }
     else if (!(uv & 0x08)) { n = 4; uv &= 0x07; }
-    else if (!(uv & 0x04)) { n = 5; uv &= 0x03; }
-    else if (!(uv & 0x02)) { n = 6; uv &= 0x01; }
     else {
 	*lenp = 1;
 	rb_raise(rb_eArgError, "malformed UTF-8 character");
@@ -2005,6 +1983,9 @@ utf8_to_uv(const char *p, long *lenp)
     if (uv < utf8_limits[n]) {
 	rb_raise(rb_eArgError, "redundant UTF-8 sequence");
     }
+    else if (uv > 0x10ffff) {
+	rb_raise(rb_eRangeError, "unpack(U): value out of range");
+    }
     return uv;
 }
 
diff --git a/test/ruby/test_integer_comb.rb b/test/ruby/test_integer_comb.rb
index 9018518..1c732b4 100644
--- a/test/ruby/test_integer_comb.rb
+++ b/test/ruby/test_integer_comb.rb
@@ -619,7 +619,7 @@ class TestIntegerComb < Test::Unit::TestCase
   def test_pack_utf8
     template = "U"
     VS.reverse_each {|a|
-      if a < 0 || 0x7fffffff < a
+      if a < 0 || 0x10ffff < a
         assert_raise(RangeError) { [a].pack(template) }
       else
         s = [a].pack(template)
diff --git a/test/ruby/test_pack.rb b/test/ruby/test_pack.rb
index 5d2b656..ace4f48 100644
--- a/test/ruby/test_pack.rb
+++ b/test/ruby/test_pack.rb
@@ -160,9 +160,10 @@ class TestPack < Test::Unit::TestCase
     assert_raise(RangeError) { [-0x40000000].pack("U") }
     assert_raise(RangeError) { [-1].pack("U") }
     assert_equal "\000", [0].pack("U")
-    assert_equal "\374\277\277\277\277\277".force_encoding(Encoding::UTF_8), [0x3fffffff].pack("U")
-    assert_equal "\375\200\200\200\200\200".force_encoding(Encoding::UTF_8), [0x40000000].pack("U")
-    assert_equal "\375\277\277\277\277\277".force_encoding(Encoding::UTF_8), [0x7fffffff].pack("U")
+    assert_equal "\364\217\277\277".force_encoding(Encoding::UTF_8), [0x10ffff].pack("U")
+    assert_raise(RangeError) { [0x3fffffff].pack("U") }
+    assert_raise(RangeError) { [0x40000000].pack("U") }
+    assert_raise(RangeError) { [0x110000].pack("U") }
     assert_raise(RangeError) { [0x80000000].pack("U") }
     assert_raise(RangeError) { [0x100000000].pack("U") }
   end
@@ -492,13 +493,14 @@ class TestPack < Test::Unit::TestCase
     assert_equal([0x80], [0x80].pack("U").unpack("U"))
     assert_equal([0x800], [0x800].pack("U").unpack("U"))
     assert_equal([0x10000], [0x10000].pack("U").unpack("U"))
-    assert_equal([0x400000], [0x400000].pack("U").unpack("U"))
+    assert_equal([0x10ffff], [0x10ffff].pack("U").unpack("U"))
 
     assert_raise(ArgumentError) { "\x80".unpack("U") }
     assert_raise(ArgumentError) { "\xff".unpack("U") }
     assert_raise(ArgumentError) { "\xfc\x00".unpack("U") }
     assert_raise(ArgumentError) { "\xc0\xc0".unpack("U") }
     assert_raise(ArgumentError) { "\xe0\x80\x80".unpack("U") }
+    assert_raise(RangeError) { "\xf4\x90\x80\x80".unpack("U") }
   end
 
   def test_pack_unpack_u
-- 
2.3.6