Index: lib/csv.rb
===================================================================
--- lib/csv.rb	(revision 29650)
+++ lib/csv.rb	(working copy)
@@ -154,11 +154,11 @@
 #   CSV(csv = "")   { |csv_str| csv_str << %w{my data here} }  # to a String
 #   CSV($stderr)    { |csv_err| csv_err << %w{my data here} }  # to $stderr
 #   CSV($stdin)     { |csv_in|  csv_in.each { |row| p row } }  # from $stdin
-# 
+#
 # == Advanced Usage
-# 
+#
 # === Wrap an IO Object
-# 
+#
 #   csv = CSV.new(io, options)
 #   # ... read (with gets() or each()) from and write (with <<) to csv here ...
 #
@@ -836,7 +836,7 @@
     #
     # This method assumes you want the Table.headers(), unless you explicitly
     # pass <tt>:write_headers => false</tt>.
-    # 
+    #
     def to_csv(options = Hash.new)
       wh = options.fetch(:write_headers, true)
       @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
@@ -965,6 +965,7 @@
   # <b><tt>:row_sep</tt></b>::            <tt>:auto</tt>
   # <b><tt>:quote_char</tt></b>::         <tt>'"'</tt>
   # <b><tt>:field_size_limit</tt></b>::   +nil+
+  # <b><tt>:io_read_limit</tt></b>::      <tt>2048</tt>
   # <b><tt>:converters</tt></b>::         +nil+
   # <b><tt>:unconverted_fields</tt></b>:: +nil+
   # <b><tt>:headers</tt></b>::            +false+
@@ -977,6 +978,7 @@
                       row_sep:            :auto,
                       quote_char:         '"',
                       field_size_limit:   nil,
+                      io_read_limit:      2048,
                       converters:         nil,
                       unconverted_fields: nil,
                       headers:            false,
@@ -1586,6 +1588,7 @@
 
     # track our own lineno since IO gets confused about line-ends is CSV fields
     @lineno = 0
+    @data_buf = nil
   end
 
   #
@@ -1681,6 +1684,7 @@
   def rewind
     @headers = nil
     @lineno  = 0
+    @data_buf = nil  # drop input already read from @io but not yet parsed
 
     @io.rewind
   end
@@ -1798,11 +1802,6 @@
   # The data source must be open for reading.
   #
   def shift
-    #########################################################################
-    ### This method is purposefully kept a bit long as simple conditional ###
-    ### checks are faster than numerous (expensive) method calls.         ###
-    #########################################################################
-
     # handle headers not based on document content
     if header_row? and @return_headers and
        [Array, String].include? @use_headers.class
@@ -1813,124 +1812,37 @@
       end
     end
 
-    #
-    # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
-    # because of \r and/or \n characters embedded in quoted fields
-    #
-    in_extended_col = false
-    csv             = Array.new
+    @lineno += 1
+    csv = parse_csv_row
+    return unless csv
 
-    loop do
-      # add another read to the line
-      unless parse = @io.gets(@row_sep)
-        return nil
+    if csv == [nil]
+      if @skip_blanks
+        return shift
+      elsif @unconverted_fields
+        return add_unconverted_fields(Array.new, Array.new)
+      elsif @use_headers
+        return self.class::Row.new(Array.new, Array.new)
+      else
+        return Array.new
       end
+    end
 
-      parse.sub!(@parsers[:line_end], "")
+    # save unconverted fields, if needed...
+    unconverted = csv.dup if @unconverted_fields
 
-      if csv.empty?
-        #
-        # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
-        # CSV's <tt>[nil]</tt>
-        #
-        if parse.empty?
-          @lineno += 1
-          if @skip_blanks
-            next
-          elsif @unconverted_fields
-            return add_unconverted_fields(Array.new, Array.new)
-          elsif @use_headers
-            return self.class::Row.new(Array.new, Array.new)
-          else
-            return Array.new
-          end
-        end
-      end
+    # convert fields, if needed...
+    csv = convert_fields(csv) unless @use_headers or @converters.empty?
+    # parse out header rows and handle CSV::Row conversions...
+    csv = parse_headers(csv)  if     @use_headers
 
-      parts =  parse.split(@col_sep, -1)
-      if parts.empty?
-        if in_extended_col
-          csv[-1] << @col_sep   # will be replaced with a @row_sep after the parts.each loop
-        else
-          csv << nil
-        end
-      end
+    # inject unconverted fields and accessor, if requested...
+    if @unconverted_fields and not csv.respond_to? :unconverted_fields
+      add_unconverted_fields(csv, unconverted)
+    end
 
-      # This loop is the hot path of csv parsing. Some things may be non-dry
-      # for a reason. Make sure to benchmark when refactoring.
-      parts.each do |part|
-        if in_extended_col
-          # If we are continuing a previous column
-          if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
-            # extended column ends
-            csv.last << part[0..-2]
-            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
-            csv.last.gsub!(@quote_char * 2, @quote_char)
-            in_extended_col = false
-          else
-            csv.last << part
-            csv.last << @col_sep
-          end
-        elsif part[0] == @quote_char
-          # If we are staring a new quoted column
-          if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
-            # start an extended column
-            csv             << part[1..-1]
-            csv.last        << @col_sep
-            in_extended_col =  true
-          else
-            # regular quoted column
-            csv << part[1..-2]
-            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
-            csv.last.gsub!(@quote_char * 2, @quote_char)
-          end
-        elsif part =~ @parsers[:quote_or_nl]
-          # Unquoted field with bad characters.
-          if part =~ @parsers[:nl_or_lf]
-            raise MalformedCSVError, "Unquoted fields do not allow " +
-                                     "\\r or \\n (line #{lineno + 1})."
-          else
-            raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
-          end
-        else
-          # Regular ole unquoted field.
-          csv << (part.empty? ? nil : part)
-        end
-      end
-
-      # Replace tacked on @col_sep with @row_sep if we are still in an extended
-      # column.
-      csv[-1][-1] = @row_sep if in_extended_col
-
-      if in_extended_col
-        # if we're at eof?(), a quoted field wasn't closed...
-        if @io.eof?
-          raise MalformedCSVError,
-                "Unclosed quoted field on line #{lineno + 1}."
-        elsif @field_size_limit and csv.last.size >= @field_size_limit
-          raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
-        end
-        # otherwise, we need to loop and pull some more data to complete the row
-      else
-        @lineno += 1
-
-        # save fields unconverted fields, if needed...
-        unconverted = csv.dup if @unconverted_fields
-
-        # convert fields, if needed...
-        csv = convert_fields(csv) unless @use_headers or @converters.empty?
-        # parse out header rows and handle CSV::Row conversions...
-        csv = parse_headers(csv)  if     @use_headers
-
-        # inject unconverted fields and accessor, if requested...
-        if @unconverted_fields and not csv.respond_to? :unconverted_fields
-          add_unconverted_fields(csv, unconverted)
-        end
-
-        # return the results
-        break csv
-      end
-    end
+    # return the results
+    csv
   end
   alias_method :gets,     :shift
   alias_method :readline, :shift
@@ -1976,7 +1888,122 @@
 
   private
 
+  def parse_csv_row  # builds one row as an Array of fields; nil if io_get_unquoted has nothing
+    buf = io_get_unquoted
+    return unless buf
+    line = []  # fields collected so far for this row
+
+    loop do
+      case buf
+      when @quote_char  # buf is exactly the quote character: a quoted field starts here
+        line << io_get_quoted
+        buf = io_get_unquoted  # whatever follows the closing quote
+
+        case buf
+        when nil, @row_sep  # nothing more to read, or the row ends right after the field
+          return line
+        when @col_sep  # only a separator is left: add a trailing nil field and finish
+          return line << nil
+        end
+
+        break unless buf.slice!(0, @col_sep.size) == @col_sep  # anything else after a quoted field is an error
+      when @row_sep  # buf is only the row separator: record a nil field and finish
+        return line << nil
+      else
+        newline = buf.chomp! @row_sep  # non-nil only when buf ended with @row_sep
+
+        if buf.count(@nl_lf) > 0  # a \r or \n is still present after the chomp
+          raise MalformedCSVError, "Unquoted fields do not allow " +
+                                   "\\r or \\n (line #{@lineno})."
+        end
+
+        buf.split(@col_sep, -1).each{ |c| line << (c.empty? ? nil : c) }  # -1 keeps trailing empties; empty fields become nil
+
+        if newline
+          return line
+        elsif line.last == @quote_char  # last piece is a bare quote: loop again with it as buf
+          buf = line.pop
+        elsif data_buf_eof?
+          return line
+        else
+          break
+        end
+      end
+    end
+
+    raise MalformedCSVError, "Illegal quoting on line #{@lineno}."  # reached via the breaks above
+  end
+
+  # Reads @io up to the closing quote of the current quoted field and returns
+  # the field's content with doubled quote characters unescaped. The text read
+  # after the closing quote is left in @data_buf for io_get_unquoted.
   #
+  # Raises MalformedCSVError if no closing quote is found, including when @io
+  # is already at EOF (gets returns nil) or @field_size_limit cuts the read.
+  def io_get_quoted
+    buf = @io.gets(@quote_char, @field_size_limit)
+    # buf is nil if @io was already at EOF: fall through to the error below
+    while buf && buf.chomp!(@quote_char)
+      @data_buf = @io.gets(@quote_char, @io_read_limit)
+      return buf unless @data_buf == @quote_char
+
+      break if @io.eof?
+      buf << @quote_char + @io.gets(@quote_char, @field_size_limit)
+    end
+
+    raise MalformedCSVError, "Illegal quoting on line #{@lineno}."
+  end
+
+  # Returns the next chunk of input that lies outside a quoted field, pulling
+  # more data from @io (up to @io_read_limit per read) whenever necessary.
+  #
+  # A chunk is handed back once it holds a complete @row_sep, stops right at
+  # a @quote_char, or @io is at EOF; data after the first @row_sep stays in
+  # @data_buf. Raises MalformedCSVError when the buffered data grows beyond
+  # @field_size_limit. Returns nil when there is nothing left to read.
+  #
+  # Multibyte characters cut by a limited read are completed by align_data_buf.
+  def io_get_unquoted
+    @data_buf ||= @io.gets(@quote_char, @io_read_limit)
+    return unless @data_buf
+
+    loop do
+      if (sep_at = @data_buf.index(@row_sep))
+        row_end = sep_at + @row_sep.size
+        break if row_end == @data_buf.size
+        return @data_buf.slice!(0, row_end)
+      end
+
+      break if @io.eof? || @data_buf.end_with?(@quote_char)
+
+      if @field_size_limit && @data_buf.size > @field_size_limit
+        raise MalformedCSVError, "Field size exceeded on line #{@lineno}."
+      end
+
+      next if align_data_buf
+      @data_buf += @io.gets(@quote_char, @io_read_limit)
+    end
+
+    chunk, @data_buf = @data_buf, nil
+    chunk
+  end
+
+  # Pulls single bytes from @io (at most 10 per call) into @data_buf until it is
+  # valid in its encoding or @io hits EOF. Returns nil if nothing had to be read.
+  def align_data_buf
+    aligned = proc { @data_buf.valid_encoding? || @io.eof? }
+    return if aligned.call
+    10.times do
+      break true if aligned.call
+      @data_buf += @io.read(1).force_encoding(raw_encoding)
+    end
+  end
+
+  def data_buf_eof?  # true once @io is at EOF and nothing is held in @data_buf
+    @io.eof? ? !@data_buf : false
+  end
+
+  #
   # Stores the indicated separators for later use.
   #
   # If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
@@ -2075,21 +2102,8 @@
     # store the parser behaviors
     @skip_blanks      = options.delete(:skip_blanks)
     @field_size_limit = options.delete(:field_size_limit)
-
-    # prebuild Regexps for faster parsing
-    esc_row_sep = escape_re(@row_sep)
-    esc_quote   = escape_re(@quote_char)
-    @parsers = {
-      # for detecting parse errors
-      quote_or_nl:    encode_re("[", esc_quote, "\r\n]"),
-      nl_or_lf:       encode_re("[\r\n]"),
-      stray_quote:    encode_re( "[^", esc_quote, "]", esc_quote,
-                                 "[^", esc_quote, "]" ),
-      # safer than chomp!()
-      line_end:       encode_re(esc_row_sep, "\\z"),
-      # illegal unquoted characters
-      return_newline: encode_str("\r\n")
-    }
+    @io_read_limit    = options.delete(:io_read_limit)
+    @nl_lf            = encode_str("\r\n")
   end
 
   #