Project

General

Profile

Actions

Feature #3619

closed

\x{XXXX} as an escape sequence of string

Added by phasis68 (Heesob Park) over 14 years ago. Updated over 12 years ago.

Status:
Rejected
Assignee:
-
Target version:
[ruby-core:31512]

Description

=begin
I noticed that String#inspect produces \x{XXXX} for encodings other than Unicode.

Is there any possibility of accepting \x{XXXX} as an escape sequence in string literals?

irb(main):004:0> a = "\xC7\xD1\xB1\xDB"
=> "\xC7ѱ\xDB"
irb(main):005:0> a.encoding
=> #<Encoding:UTF-8>
irb(main):006:0> a.force_encoding('EUC-KR')
=> "\x{C7D1}\x{B1DB}"
irb(main):007:0> a.encode('UTF-8')
=> "한글"
irb(main):008:0> a
=> "\x{C7D1}\x{B1DB}"
irb(main):009:0> a[0]
=> "\x{C7D1}"
irb(main):010:0> a[1]
=> "\x{B1DB}"
irb(main):011:0> b = "\x{B1DB}"
SyntaxError: (irb):11: invalid hex escape
b = "\x{B1DB}"
^
from /usr/local/bin/irb:12:in `<main>'
=end

Actions #1

Updated by nobu (Nobuyoshi Nakada) over 14 years ago

=begin
Hi,

At Tue, 27 Jul 2010 22:21:31 +0900,
Heesob Park wrote in [ruby-core:31512]:

I noticed that String#inspect produces \x{XXXX} for encodings other than Unicode.

Is there any possibility of accepting \x{XXXX} as an escape sequence in string literals?

irb(main):004:0> a = "\xC7\xD1\xB1\xDB"

This is in binary representation.

irb(main):010:0> a[1]
=> "\x{B1DB}"

But this is in codepoint representation.

I'm afraid it may confuse users.


diff --git a/parse.y b/parse.y
index ba52135..ec13fb6 100644
--- a/parse.y
+++ b/parse.y
@@ -5456,8 +5456,8 @@ parser_tok_hex(struct parser_params *parser, size_t *numlen)
#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))

static int
-parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,

  •               int string_literal, int symbol_literal, int regexp_literal)
    

+parser_tokadd_multibyte(struct parser_params *parser, rb_encoding **encp, int enctype,

  •  	int string_literal, int symbol_literal, int regexp_literal)
    

{
/*
* If string_literal is true, then we allow multiple codepoints
@@ -5466,22 +5466,28 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
* codepoint without adding it
*/

  • int codepoint;
  • size_t numlen;
  • int codepoint, unicode_p = enctype == 'u', mblen;
  • size_t numlen, maxlen;
  • char errmsg[64];
  • const char *encname = unicode_p ? "Unicode" : (*encp)->name;
  • if (regexp_literal) { tokadd('\\'); tokadd('u'); }
  • if (regexp_literal) { tokadd('\\'); tokadd(enctype); }

    if (peek('{')) { /* handle \u{...} form */

  • maxlen = unicode_p ? 6 : 4;
    do {
    if (regexp_literal) { tokadd(*lex_p); }
    nextc();

  •  codepoint = scan_hex(lex_p, 6, &numlen);
    
  •  codepoint = scan_hex(lex_p, maxlen, &numlen);
     if (numlen == 0)  {
    
  •  yyerror("invalid Unicode escape");
    
  •  snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname);
    
  •  yyerror(errmsg);
     return 0;
     }
    
  •  if (codepoint > 0x10ffff) {
    
  •  yyerror("invalid Unicode codepoint (too large)");
    
  •  mblen = ONIGENC_CODE_TO_MBCLEN(unicode_p ? UTF8_ENC() : *encp, codepoint);
    
  •  if (!MBCLEN_CHARFOUND_P(mblen)) {
    
  •  snprintf(errmsg, sizeof(errmsg), "invalid %s codepoint", encname);
    
  •  yyerror(errmsg);
     return 0;
     }
     lex_p += numlen;
    

@@ -5489,7 +5495,7 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
tokcopy((int)numlen);
}
else if (codepoint >= 0x80) {

  •  *encp = UTF8_ENC();
    
  •  if (unicode_p) *encp = UTF8_ENC();
     if (string_literal) tokaddmbc(codepoint, *encp);
     }
     else if (string_literal) {
    

@@ -5506,16 +5512,18 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
nextc();
}
else { /* handle \uxxxx form */

  • codepoint = scan_hex(lex_p, 4, &numlen);
  • if (numlen < 4) {
  •  yyerror("invalid Unicode escape");
    
  • maxlen = unicode_p ? 4 : 2;
  • codepoint = scan_hex(lex_p, maxlen, &numlen);
  • if (numlen < maxlen) {
  •  snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname);
    
  •  yyerror(errmsg);
     return 0;
    
    }
  • lex_p += 4;
  • lex_p += numlen;
    if (regexp_literal) {
  •        tokcopy(4);
    
  •        tokcopy(numlen);
       }
    
  • else if (codepoint >= 0x80) {
  • else if (codepoint >= 0x80 && unicode_p) {
    *encp = UTF8_ENC();
    if (string_literal) tokaddmbc(codepoint, *encp);
    }
    @@ -5570,6 +5578,9 @@ parser_read_escape(struct parser_params *parser, int flags,
    return c;

    case 'x':	/* hex constant */
    
  • if (peek('{')) {

  • }
    c = tok_hex(&numlen);
    if (numlen == 0) return 0;
    return c;
    @@ -5825,13 +5836,14 @@ parser_tokadd_string(struct parser_params *parser,
    break;

      case 'u':
    
  •    case 'x':
     if ((func & STR_FUNC_EXPAND) == 0) {
         tokadd('\\');
         break;
     }
    
  •  parser_tokadd_utf8(parser, &enc, 1,
    
  •  		   func & STR_FUNC_SYMBOL,
    
  •                               func & STR_FUNC_REGEXP);
    
  •  parser_tokadd_multibyte(parser, &enc, c, 1,
    
  •  			func & STR_FUNC_SYMBOL,
    
  •  			func & STR_FUNC_REGEXP);
     if (has_nonascii && enc != *encp) {
         mixed_escape(beg, enc, *encp);
     }
    

@@ -6855,9 +6867,9 @@ parser_yylex(struct parser_params *parser)
goto ternary;
}
else if (c == '\\') {

  •        if (peek('u')) {
    
  •            nextc();
    
  •            c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
    
  •  c = nextc();
    
  •        if (c == 'u' || c == 'x') {
    
  •            c = parser_tokadd_multibyte(parser, &enc, c, 0, 0, 0);
               if (0x80 <= c) {
                   tokaddmbc(c, enc);
               }
    

@@ -6866,6 +6878,7 @@ parser_yylex(struct parser_params *parser)
}
}
else {

  •  pushback(c);
               c = read_escape(0, &enc);
               tokadd(c);
           }
    

diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index 07cda75..28996ab 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -159,6 +159,9 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("EUC-JP", eval(e(%{"\x20"})).encoding)
assert_encoding("EUC-JP", eval(e(%{"\n"})).encoding)
assert_encoding("EUC-JP", eval(e(%{"\x80"})).encoding)

  • str = eval(e(%{"\x{a1a1}"}))
  • assert_encoding("EUC-JP", str.encoding)
  • assert_equal(0xa1a1, str.ord)
    end
def test_utf8_literal

--
Nobu Nakada

=end

Actions #2

Updated by naruse (Yui NARUSE) over 14 years ago

=begin
I originally intended to prevent String#inspect from being used as String#dump.
(of course the main intention is to show its codepoint)

Anyway, I'm still wondering, because it causes confusion about the codepoint's encoding.
=end

Actions #3

Updated by shyouhei (Shyouhei Urabe) over 14 years ago

=begin

Anyway, I'm still wondering, because it causes confusion about the codepoint's encoding.

Any possibilities other than __ENCODING__?
=end

Updated by naruse (Yui NARUSE) over 12 years ago

  • Description updated (diff)
  • Status changed from Open to Rejected
Actions

Also available in: Atom PDF

Like0
Like0Like0Like0Like0