Project

General

Profile

Bug #15908 » 0001-Enable-BOM-detection-with-non-UTF-encodings.patch

nobu (Nobuyoshi Nakada), 06/08/2019 12:43 PM

View differences:

io.c
5454 5454
}
5455 5455

  
5456 5456
static const char bom_prefix[] = "bom|";
5457
static const char utf_prefix[] = "utf-";
5458 5457
enum {bom_prefix_len = (int)sizeof(bom_prefix) - 1};
5459
enum {utf_prefix_len = (int)sizeof(utf_prefix) - 1};
5460 5458

  
5461 5459
static int
5462 5460
io_encname_bom_p(const char *name, long len)
......
5693 5691
    if ((fmode & FMODE_SETENC_BY_BOM) || io_encname_bom_p(estr, len)) {
5694 5692
	estr += bom_prefix_len;
5695 5693
	len -= bom_prefix_len;
5696
	if (!STRNCASECMP(estr, utf_prefix, utf_prefix_len)) {
5697
	    fmode |= FMODE_SETENC_BY_BOM;
5698
	}
5699
	else {
5700
	    rb_enc_warn(estr_enc, "BOM with non-UTF encoding %s is nonsense", estr);
5701
	    fmode &= ~FMODE_SETENC_BY_BOM;
5702
	}
5694
        fmode |= FMODE_SETENC_BY_BOM;
5703 5695
    }
5704 5696
    if (len == 0 || len > ENCODING_MAXNAMELEN) {
5705 5697
	idx = -1;
test/ruby/test_io_m17n.rb
2084 2084
    with_tmpdir {
2085 2085
      text = "\uFEFFa"
2086 2086
      stripped = "a"
2087
      %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name|
2087
      %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.product(%w[UTF-8 CP932]) do |name, enc|
2088 2088
        path = '%s-bom.txt' % name
2089 2089
        content = text.encode(name)
2090 2090
        generate_file(path, content)
2091
        result = File.read(path, mode: 'rb:BOM|UTF-8')
2092
        assert_equal(content[1].force_encoding("ascii-8bit"),
2093
                     result.force_encoding("ascii-8bit"))
2094
        result = File.read(path, mode: 'rb:BOM|UTF-8:UTF-8')
2095
        assert_equal(Encoding::UTF_8, result.encoding)
2096
        assert_equal(stripped, result)
2091
        result = File.read(path, mode: "rb:BOM|#{enc}")
2092
        assert_equal(Encoding.find(name), result.encoding, name)
2093
        assert_equal(stripped.encode(name), result, name)
2094
        result = File.read(path, mode: "rb:BOM|#{enc}:UTF-8")
2095
        assert_equal(Encoding::UTF_8, result.encoding, name)
2096
        assert_equal(stripped, result, name)
2097 2097
      end
2098 2098

  
2099 2099
      bug3407 = '[ruby-core:30641]'
2100 2100
      path = 'UTF-8-bom.txt'
2101 2101
      result = File.read(path, encoding: 'BOM|UTF-8')
2102
      assert_equal("a", result.force_encoding("ascii-8bit"), bug3407)
2102
      assert_equal("a", result, bug3407)
2103 2103

  
2104 2104
      bug8323 = '[ruby-core:54563] [Bug #8323]'
2105 2105
      expected = "a\xff".force_encoding("utf-8")
2106 2106
      open(path, 'ab') {|f| f.write("\xff")}
2107
      result = File.read(path, encoding: 'BOM|UTF-8')
2107
      result = File.read(path, encoding: 'BOM|CP932')
2108 2108
      assert_not_predicate(result, :valid_encoding?, bug8323)
2109 2109
      assert_equal(expected, result, bug8323)
2110
      result = File.read(path, encoding: 'BOM|UTF-8:UTF-8')
2110
      result = File.read(path, encoding: 'BOM|CP932:UTF-8')
2111 2111
      assert_not_predicate(result, :valid_encoding?, bug8323)
2112 2112
      assert_equal(expected, result, bug8323)
2113 2113

  
......
2138 2138
  def test_bom_non_utf
2139 2139
    enc = nil
2140 2140

  
2141
    assert_warn(/BOM/) {
2141
    assert_warn('') {
2142 2142
      open(__FILE__, "r:bom|us-ascii") {|f| enc = f.external_encoding}
2143 2143
    }
2144 2144
    assert_equal(Encoding::US_ASCII, enc)
2145 2145

  
2146 2146
    enc = nil
2147
    assert_warn(/BOM/) {
2147
    assert_warn('') {
2148 2148
      open(__FILE__, "r", encoding: "bom|us-ascii") {|f| enc = f.external_encoding}
2149 2149
    }
2150 2150
    assert_equal(Encoding::US_ASCII, enc)
2151 2151

  
2152 2152
    enc = nil
2153
    assert_warn(/BOM/) {
2153
    assert_warn('') {
2154 2154
      open(IO::NULL, "w:bom|us-ascii") {|f| enc = f.external_encoding}
2155 2155
    }
2156 2156
    assert_equal(Encoding::US_ASCII, enc)
2157 2157

  
2158 2158
    enc = nil
2159
    assert_warn(/BOM/) {
2159
    assert_warn('') {
2160 2160
      open(IO::NULL, "w", encoding: "bom|us-ascii") {|f| enc = f.external_encoding}
2161 2161
    }
2162 2162
    assert_equal(Encoding::US_ASCII, enc)
2163
-