=begin
Hi,
At Tue, 27 Jul 2010 22:21:31 +0900,
Heesob Park wrote in [ruby-core:31512]:
I noticed that String#inspect produces \x{XXXX} for encodings other than Unicode.
Is there any possibility that \x{XXXX} could be accepted as an escape sequence in string literals?
irb(main):004:0> a = "\xC7\xD1\xB1\xDB"
This is in binary representation.
irb(main):010:0> a[1]
=> "\x{B1DB}"
But this is in codepoint representation.
I'm afraid it may confuse users.
diff --git a/parse.y b/parse.y
index ba52135..ec13fb6 100644
--- a/parse.y
+++ b/parse.y
@@ -5456,8 +5456,8 @@ parser_tok_hex(struct parser_params *parser, size_t *numlen)
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
static int
-parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
+parser_tokadd_multibyte(struct parser_params *parser, rb_encoding **encp, int enctype,
{
/*
* If string_literal is true, then we allow multiple codepoints
@@ -5466,22 +5466,28 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
* codepoint without adding it
*/
- int codepoint;
- size_t numlen;
- int codepoint, unicode_p = enctype == 'u', mblen;
- size_t numlen, maxlen;
- char errmsg[64];
- const char *encname = unicode_p ? "Unicode" : (*encp)->name;
- if (regexp_literal) { tokadd('\\'); tokadd('u'); }
-
if (regexp_literal) { tokadd('\\'); tokadd(enctype); }
if (peek('{')) { /* handle \u{...} form */
-
maxlen = unicode_p ? 6 : 4;
do {
if (regexp_literal) { tokadd(*lex_p); }
nextc();
-
snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname);
-
yyerror(errmsg);
return 0;
}
-
mblen = ONIGENC_CODE_TO_MBCLEN(unicode_p ? UTF8_ENC() : *encp, codepoint);
-
if (!MBCLEN_CHARFOUND_P(mblen)) {
-
snprintf(errmsg, sizeof(errmsg), "invalid %s codepoint", encname);
-
yyerror(errmsg);
return 0;
}
lex_p += numlen;
@@ -5489,7 +5495,7 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
tokcopy((int)numlen);
}
else if (codepoint >= 0x80) {
@@ -5506,16 +5512,18 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
nextc();
}
else { /* handle \uxxxx form */
- maxlen = unicode_p ? 4 : 2;
- codepoint = scan_hex(lex_p, maxlen, &numlen);
- if (numlen < maxlen) {
-
snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname);
-
yyerror(errmsg);
return 0;
}
- lex_p += numlen;
if (regexp_literal) {
- else if (codepoint >= 0x80) {
-
else if (codepoint >= 0x80 && unicode_p) {
*encp = UTF8_ENC();
if (string_literal) tokaddmbc(codepoint, *encp);
}
@@ -5570,6 +5578,9 @@ parser_read_escape(struct parser_params *parser, int flags,
return c;
case 'x': /* hex constant */
-
if (peek('{')) {
-
-
}
c = tok_hex(&numlen);
if (numlen == 0) return 0;
return c;
@@ -5825,13 +5836,14 @@ parser_tokadd_string(struct parser_params *parser,
break;
case 'u':
-
case 'x':
if ((func & STR_FUNC_EXPAND) == 0) {
tokadd('\\');
break;
}
-
parser_tokadd_multibyte(parser, &enc, c, 1,
-
func & STR_FUNC_SYMBOL,
-
func & STR_FUNC_REGEXP);
if (has_nonascii && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
@@ -6855,9 +6867,9 @@ parser_yylex(struct parser_params *parser)
goto ternary;
}
else if (c == '\\') {
-
if (peek('u')) {
-
nextc();
-
c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
-
c = nextc();
-
if (c == 'u' || c == 'x') {
-
c = parser_tokadd_multibyte(parser, &enc, c, 0, 0, 0);
if (0x80 <= c) {
tokaddmbc(c, enc);
}
@@ -6866,6 +6878,7 @@ parser_yylex(struct parser_params *parser)
}
}
else {
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index 07cda75..28996ab 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -159,6 +159,9 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("EUC-JP", eval(e(%{"\x20"})).encoding)
assert_encoding("EUC-JP", eval(e(%{"\n"})).encoding)
assert_encoding("EUC-JP", eval(e(%{"\x80"})).encoding)
- str = eval(e(%{"\x{a1a1}"}))
- assert_encoding("EUC-JP", str.encoding)
- assert_equal(0xa1a1, str.ord)
end
def test_utf8_literal
--
Nobu Nakada
=end