Index: lib/csv.rb =================================================================== --- lib/csv.rb (revision 29650) +++ lib/csv.rb (working copy) @@ -154,11 +154,11 @@ # CSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String # CSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr # CSV($stdin) { |csv_in| csv_in.each { |row| p row } } # from $stdin -# +# # == Advanced Usage -# +# # === Wrap an IO Object -# +# # csv = CSV.new(io, options) # # ... read (with gets() or each()) from and write (with <<) to csv here ... # @@ -836,7 +836,7 @@ # # This method assumes you want the Table.headers(), unless you explicitly # pass :write_headers => false. - # + # def to_csv(options = Hash.new) wh = options.fetch(:write_headers, true) @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row| @@ -965,6 +965,7 @@ # :row_sep:: :auto # :quote_char:: '"' # :field_size_limit:: +nil+ + # :io_read_limit:: 2048 # :converters:: +nil+ # :unconverted_fields:: +nil+ # :headers:: +false+ @@ -977,6 +978,7 @@ row_sep: :auto, quote_char: '"', field_size_limit: nil, + io_read_limit: 2048, converters: nil, unconverted_fields: nil, headers: false, @@ -1586,6 +1588,7 @@ # track our own lineno since IO gets confused about line-ends is CSV fields @lineno = 0 + @data_buf = nil end # @@ -1681,6 +1684,7 @@ def rewind @headers = nil @lineno = 0 + @data_buf = nil @io.rewind end @@ -1798,11 +1802,6 @@ # The data source must be open for reading. # def shift - ######################################################################### - ### This method is purposefully kept a bit long as simple conditional ### - ### checks are faster than numerous (expensive) method calls. ### - ######################################################################### - # handle headers not based on document content if header_row? and @return_headers and [Array, String].include? 
@use_headers.class @@ -1813,124 +1812,37 @@ end end - # - # it can take multiple calls to @io.gets() to get a full line, - # because of \r and/or \n characters embedded in quoted fields - # - in_extended_col = false - csv = Array.new + @lineno += 1 + csv = parse_csv_row + return unless csv - loop do - # add another read to the line - unless parse = @io.gets(@row_sep) - return nil + if csv == [nil] + if @skip_blanks + return shift + elsif @unconverted_fields + return add_unconverted_fields(Array.new, Array.new) + elsif @use_headers + return self.class::Row.new(Array.new, Array.new) + else + return Array.new end + end - parse.sub!(@parsers[:line_end], "") + # save unconverted fields, if needed... + unconverted = csv.dup if @unconverted_fields - if csv.empty? - # - # I believe a blank line should be an Array.new, not Ruby 1.8 - # CSV's [nil] - # - if parse.empty? - @lineno += 1 - if @skip_blanks - next - elsif @unconverted_fields - return add_unconverted_fields(Array.new, Array.new) - elsif @use_headers - return self.class::Row.new(Array.new, Array.new) - else - return Array.new - end - end - end + # convert fields, if needed... + csv = convert_fields(csv) unless @use_headers or @converters.empty? + # parse out header rows and handle CSV::Row conversions... + csv = parse_headers(csv) if @use_headers - parts = parse.split(@col_sep, -1) - if parts.empty? - if in_extended_col - csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop - else - csv << nil - end - end + # inject unconverted fields and accessor, if requested... + if @unconverted_fields and not csv.respond_to? :unconverted_fields + add_unconverted_fields(csv, unconverted) + end - # This loop is the hot path of csv parsing. Some things may be non-dry - # for a reason. Make sure to benchmark when refactoring. 
- parts.each do |part| - if in_extended_col - # If we are continuing a previous column - if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0 - # extended column ends - csv.last << part[0..-2] - raise MalformedCSVError if csv.last =~ @parsers[:stray_quote] - csv.last.gsub!(@quote_char * 2, @quote_char) - in_extended_col = false - else - csv.last << part - csv.last << @col_sep - end - elsif part[0] == @quote_char - # If we are staring a new quoted column - if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0 - # start an extended column - csv << part[1..-1] - csv.last << @col_sep - in_extended_col = true - else - # regular quoted column - csv << part[1..-2] - raise MalformedCSVError if csv.last =~ @parsers[:stray_quote] - csv.last.gsub!(@quote_char * 2, @quote_char) - end - elsif part =~ @parsers[:quote_or_nl] - # Unquoted field with bad characters. - if part =~ @parsers[:nl_or_lf] - raise MalformedCSVError, "Unquoted fields do not allow " + - "\\r or \\n (line #{lineno + 1})." - else - raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}." - end - else - # Regular ole unquoted field. - csv << (part.empty? ? nil : part) - end - end - - # Replace tacked on @col_sep with @row_sep if we are still in an extended - # column. - csv[-1][-1] = @row_sep if in_extended_col - - if in_extended_col - # if we're at eof?(), a quoted field wasn't closed... - if @io.eof? - raise MalformedCSVError, - "Unclosed quoted field on line #{lineno + 1}." - elsif @field_size_limit and csv.last.size >= @field_size_limit - raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}." - end - # otherwise, we need to loop and pull some more data to complete the row - else - @lineno += 1 - - # save fields unconverted fields, if needed... - unconverted = csv.dup if @unconverted_fields - - # convert fields, if needed... - csv = convert_fields(csv) unless @use_headers or @converters.empty? - # parse out header rows and handle CSV::Row conversions... 
- csv = parse_headers(csv) if @use_headers - - # inject unconverted fields and accessor, if requested... - if @unconverted_fields and not csv.respond_to? :unconverted_fields - add_unconverted_fields(csv, unconverted) - end - - # return the results - break csv - end - end + # return the results + csv end alias_method :gets, :shift alias_method :readline, :shift @@ -1976,7 +1888,122 @@ private + def parse_csv_row + buf = io_get_unquoted + return unless buf + line = [] + + loop do + case buf + when @quote_char + line << io_get_quoted + buf = io_get_unquoted + + case buf + when nil, @row_sep + return line + when @col_sep + return line << nil + end + + break unless buf.slice!(0, @col_sep.size) == @col_sep + when @row_sep + return line << nil + else + newline = buf.chomp! @row_sep + + if buf.count(@nl_lf) > 0 + raise MalformedCSVError, "Unquoted fields do not allow " + + "\\r or \\n (line #{@lineno})." + end + + buf.split(@col_sep, -1).each{ |c| line << (c.empty? ? nil : c) } + + if newline + return line + elsif line.last == @quote_char + buf = line.pop + elsif data_buf_eof? + return line + else + break + end + end + end + + raise MalformedCSVError, "Illegal quoting on line #{@lineno}." + end + + # Read @io and return everything until the next unescaped quote character. + # Escaped quote characters are automatically unescaped. Raises an error if we + # hit @io.eof? or @field_size_limit without encountering an ending quote. # + # Only successfully returns if the read ended with a @quote_char. This + # prevents us from returning only part of a multibyte character. + def io_get_quoted + buf = @io.gets @quote_char, @field_size_limit + + while buf.chomp!(@quote_char) do + @data_buf = @io.gets @quote_char, @io_read_limit + return buf unless @data_buf == @quote_char + + break if @io.eof? + buf << @quote_char + @io.gets(@quote_char, @field_size_limit) + end + + raise MalformedCSVError, "Illegal quoting on line #{@lineno}." 
+ end + + # Read @io and return everything until we hit a newline or a quote character. + # Raise an error if we exceed @field_size_limit. + # + # Only successfully returns if the read contains @row_sep, ends with a + # @quote_char, or reaches @io.eof? This prevents us from returning only part + # of a multibyte character. + # + # If we have multibyte encoding then it is possible that a @quote_char will be + # truncated on @io.gets. We only check for this if we exhaust the data buffer. + def io_get_unquoted + unless @data_buf + return unless @data_buf = @io.gets(@quote_char, @io_read_limit) + end + + loop do + if newline = @data_buf.index(@row_sep) + break if newline == @data_buf.size - @row_sep.size + return @data_buf.slice!(0, newline + @row_sep.size) + end + + break if @io.eof? || @data_buf.end_with?(@quote_char) + + if @field_size_limit && @data_buf.size > @field_size_limit + raise MalformedCSVError, "Field size exceeded on line #{@lineno}." + end + + @data_buf += @io.gets(@quote_char, @io_read_limit) unless align_data_buf + end + + return_buf = @data_buf + @data_buf = nil + return_buf + end + + # Fetch up to 10 bytes into @data_buf until the string matches its encoding. + # Returns a value only if we modified the buffer. + def align_data_buf + return if @data_buf.valid_encoding? || @io.eof? + + 10.times do + break true if @data_buf.valid_encoding? || @io.eof? + @data_buf += @io.read(1).force_encoding(raw_encoding) + end + end + + def data_buf_eof? + @io.eof? && !@data_buf + end + + # # Stores the indicated separators for later use. 
# # If auto-discovery was requested for @row_sep, this method will read @@ -2075,21 +2102,8 @@ # store the parser behaviors @skip_blanks = options.delete(:skip_blanks) @field_size_limit = options.delete(:field_size_limit) - - # prebuild Regexps for faster parsing - esc_row_sep = escape_re(@row_sep) - esc_quote = escape_re(@quote_char) - @parsers = { - # for detecting parse errors - quote_or_nl: encode_re("[", esc_quote, "\r\n]"), - nl_or_lf: encode_re("[\r\n]"), - stray_quote: encode_re( "[^", esc_quote, "]", esc_quote, - "[^", esc_quote, "]" ), - # safer than chomp!() - line_end: encode_re(esc_row_sep, "\\z"), - # illegal unquoted characters - return_newline: encode_str("\r\n") - } + @io_read_limit = options.delete(:io_read_limit) + @nl_lf = encode_str("\r\n") end #