Feature #4017 » ruby_19_csv_parser_split_methods.patch
| lib/csv.rb (working copy) | ||
|---|---|---|
|
# CSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String
|
||
|
# CSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr
|
||
|
# CSV($stdin) { |csv_in| csv_in.each { |row| p row } } # from $stdin
|
||
|
#
|
||
|
#
|
||
|
# == Advanced Usage
|
||
|
#
|
||
|
#
|
||
|
# === Wrap an IO Object
|
||
|
#
|
||
|
#
|
||
|
# csv = CSV.new(io, options)
|
||
|
# # ... read (with gets() or each()) from and write (with <<) to csv here ...
|
||
|
#
|
||
| ... | ... | |
|
#
|
||
|
# This method assumes you want the Table.headers(), unless you explicitly
|
||
|
# pass <tt>:write_headers => false</tt>.
|
||
|
#
|
||
|
#
|
||
|
def to_csv(options = Hash.new)
|
||
|
wh = options.fetch(:write_headers, true)
|
||
|
@table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
|
||
| ... | ... | |
|
# <b><tt>:row_sep</tt></b>:: <tt>:auto</tt>
|
||
|
# <b><tt>:quote_char</tt></b>:: <tt>'"'</tt>
|
||
|
# <b><tt>:field_size_limit</tt></b>:: +nil+
|
||
|
# <b><tt>:io_read_limit</tt></b>:: <tt>2048</tt>
|
||
|
# <b><tt>:converters</tt></b>:: +nil+
|
||
|
# <b><tt>:unconverted_fields</tt></b>:: +nil+
|
||
|
# <b><tt>:headers</tt></b>:: +false+
|
||
| ... | ... | |
|
row_sep: :auto,
|
||
|
quote_char: '"',
|
||
|
field_size_limit: nil,
|
||
|
io_read_limit: 2048,
|
||
|
converters: nil,
|
||
|
unconverted_fields: nil,
|
||
|
headers: false,
|
||
| ... | ... | |
|
# track our own lineno since IO gets confused about line-ends in CSV fields
|
||
|
@lineno = 0
|
||
|
@data_buf = nil
|
||
|
end
|
||
|
#
|
||
| ... | ... | |
|
# Puts the parser back at the beginning of the data source: forgets any
# parsed headers and any buffered look-ahead data, zeroes the line counter,
# and rewinds the wrapped IO. The result of @io.rewind is returned.
def rewind
  @headers = @data_buf = nil
  @lineno  = 0
  @io.rewind
end
|
||
| ... | ... | |
|
# The data source must be open for reading.
|
||
|
#
|
||
|
def shift
|
||
|
#########################################################################
|
||
|
### This method is purposefully kept a bit long as simple conditional ###
|
||
|
### checks are faster than numerous (expensive) method calls. ###
|
||
|
#########################################################################
|
||
|
# handle headers not based on document content
|
||
|
if header_row? and @return_headers and
|
||
|
[Array, String].include? @use_headers.class
|
||
| ... | ... | |
|
end
|
||
|
end
|
||
|
#
|
||
|
# it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
|
||
|
# because of \r and/or \n characters embedded in quoted fields
|
||
|
#
|
||
|
in_extended_col = false
|
||
|
csv = Array.new
|
||
|
@lineno += 1
|
||
|
csv = parse_csv_row
|
||
|
return unless csv
|
||
|
loop do
|
||
|
# add another read to the line
|
||
|
unless parse = @io.gets(@row_sep)
|
||
|
return nil
|
||
|
if csv == [nil]
|
||
|
if @skip_blanks
|
||
|
return shift
|
||
|
elsif @unconverted_fields
|
||
|
return add_unconverted_fields(Array.new, Array.new)
|
||
|
elsif @use_headers
|
||
|
return self.class::Row.new(Array.new, Array.new)
|
||
|
else
|
||
|
return Array.new
|
||
|
end
|
||
|
end
|
||
|
parse.sub!(@parsers[:line_end], "")
|
||
|
# save unconverted fields, if needed...
|
||
|
unconverted = csv.dup if @unconverted_fields
|
||
|
if csv.empty?
|
||
|
#
|
||
|
# I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
|
||
|
# CSV's <tt>[nil]</tt>
|
||
|
#
|
||
|
if parse.empty?
|
||
|
@lineno += 1
|
||
|
if @skip_blanks
|
||
|
next
|
||
|
elsif @unconverted_fields
|
||
|
return add_unconverted_fields(Array.new, Array.new)
|
||
|
elsif @use_headers
|
||
|
return self.class::Row.new(Array.new, Array.new)
|
||
|
else
|
||
|
return Array.new
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
# convert fields, if needed...
|
||
|
csv = convert_fields(csv) unless @use_headers or @converters.empty?
|
||
|
# parse out header rows and handle CSV::Row conversions...
|
||
|
csv = parse_headers(csv) if @use_headers
|
||
|
parts = parse.split(@col_sep, -1)
|
||
|
if parts.empty?
|
||
|
if in_extended_col
|
||
|
csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop
|
||
|
else
|
||
|
csv << nil
|
||
|
end
|
||
|
end
|
||
|
# inject unconverted fields and accessor, if requested...
|
||
|
if @unconverted_fields and not csv.respond_to? :unconverted_fields
|
||
|
add_unconverted_fields(csv, unconverted)
|
||
|
end
|
||
|
# This loop is the hot path of csv parsing. Some things may be non-dry
|
||
|
# for a reason. Make sure to benchmark when refactoring.
|
||
|
parts.each do |part|
|
||
|
if in_extended_col
|
||
|
# If we are continuing a previous column
|
||
|
if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
|
||
|
# extended column ends
|
||
|
csv.last << part[0..-2]
|
||
|
raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
|
||
|
csv.last.gsub!(@quote_char * 2, @quote_char)
|
||
|
in_extended_col = false
|
||
|
else
|
||
|
csv.last << part
|
||
|
csv.last << @col_sep
|
||
|
end
|
||
|
elsif part[0] == @quote_char
|
||
|
# If we are starting a new quoted column
|
||
|
if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
|
||
|
# start an extended column
|
||
|
csv << part[1..-1]
|
||
|
csv.last << @col_sep
|
||
|
in_extended_col = true
|
||
|
else
|
||
|
# regular quoted column
|
||
|
csv << part[1..-2]
|
||
|
raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
|
||
|
csv.last.gsub!(@quote_char * 2, @quote_char)
|
||
|
end
|
||
|
elsif part =~ @parsers[:quote_or_nl]
|
||
|
# Unquoted field with bad characters.
|
||
|
if part =~ @parsers[:nl_or_lf]
|
||
|
raise MalformedCSVError, "Unquoted fields do not allow " +
|
||
|
"\\r or \\n (line #{lineno + 1})."
|
||
|
else
|
||
|
raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
|
||
|
end
|
||
|
else
|
||
|
# Regular ole unquoted field.
|
||
|
csv << (part.empty? ? nil : part)
|
||
|
end
|
||
|
end
|
||
|
# Replace tacked on @col_sep with @row_sep if we are still in an extended
|
||
|
# column.
|
||
|
csv[-1][-1] = @row_sep if in_extended_col
|
||
|
if in_extended_col
|
||
|
# if we're at eof?(), a quoted field wasn't closed...
|
||
|
if @io.eof?
|
||
|
raise MalformedCSVError,
|
||
|
"Unclosed quoted field on line #{lineno + 1}."
|
||
|
elsif @field_size_limit and csv.last.size >= @field_size_limit
|
||
|
raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
|
||
|
end
|
||
|
# otherwise, we need to loop and pull some more data to complete the row
|
||
|
else
|
||
|
@lineno += 1
|
||
|
# save unconverted fields, if needed...
|
||
|
unconverted = csv.dup if @unconverted_fields
|
||
|
# convert fields, if needed...
|
||
|
csv = convert_fields(csv) unless @use_headers or @converters.empty?
|
||
|
# parse out header rows and handle CSV::Row conversions...
|
||
|
csv = parse_headers(csv) if @use_headers
|
||
|
# inject unconverted fields and accessor, if requested...
|
||
|
if @unconverted_fields and not csv.respond_to? :unconverted_fields
|
||
|
add_unconverted_fields(csv, unconverted)
|
||
|
end
|
||
|
# return the results
|
||
|
break csv
|
||
|
end
|
||
|
end
|
||
|
# return the results
|
||
|
csv
|
||
|
end
|
||
|
alias_method :gets, :shift
|
||
|
alias_method :readline, :shift
|
||
| ... | ... | |
|
private
|
||
|
# Assembles one row of fields by alternating between io_get_unquoted and
# io_get_quoted reads.
#
# Returns nil when io_get_unquoted has nothing left to give, otherwise an
# Array of fields in which empty unquoted fields are represented as nil (a
# bare row separator therefore yields [nil]).
#
# Raises MalformedCSVError when an unquoted chunk still contains a \r or \n
# after the row separator has been stripped, or when quoted and unquoted
# data are combined in a way the state machine below cannot accept.
def parse_csv_row
  chunk = io_get_unquoted
  return unless chunk

  fields = []
  loop do
    if chunk == @quote_char
      # a lone quote character announces a quoted field
      fields << io_get_quoted
      chunk = io_get_unquoted
      return fields if chunk.nil? || chunk == @row_sep
      return fields << nil if chunk == @col_sep
      # anything else must begin with a column separator; strip it and let
      # the next pass deal with the remainder
      break unless chunk.slice!(0, @col_sep.size) == @col_sep
    elsif chunk == @row_sep
      return fields << nil
    else
      row_ended = chunk.chomp!(@row_sep)
      if chunk.count(@nl_lf) > 0
        raise MalformedCSVError, "Unquoted fields do not allow " +
                                 "\\r or \\n (line #{@lineno})."
      end
      chunk.split(@col_sep, -1).each do |cell|
        fields << (cell.empty? ? nil : cell)
      end
      return fields if row_ended

      if fields.last == @quote_char
        # the chunk stopped at an opening quote: feed it back through the
        # quoted branch instead of keeping it as a field
        chunk = fields.pop
      elsif data_buf_eof?
        # final row without a trailing row separator
        return fields
      else
        break
      end
    end
  end
  raise MalformedCSVError, "Illegal quoting on line #{@lineno}."
end
|
||
|
# Read @io and return everything up to the next unescaped quote character.
# Doubled (escaped) quote characters are collapsed into a single one.
# Whatever was read after the closing quote is left in @data_buf.
#
# Raises MalformedCSVError if no closing quote is found: the stream ends
# first (including the case where nothing at all follows the opening quote),
# or a read stops at @field_size_limit before reaching a quote.
#
# Only successfully returns if the read ended with a @quote_char. This
# prevents us from returning only part of a multibyte character.
def io_get_quoted
  buf = @io.gets(@quote_char, @field_size_limit)
  # gets() returns nil at end of stream, so guard before calling chomp!();
  # a missing closing quote then falls through to the MalformedCSVError below
  while buf && buf.chomp!(@quote_char)
    # peek at whatever follows the quote we just stripped
    @data_buf = @io.gets(@quote_char, @io_read_limit)
    return buf unless @data_buf == @quote_char
    # an immediately following quote is an escaped quote; if the stream ends
    # right after it the field was never closed
    break if @io.eof?
    buf << @quote_char << @io.gets(@quote_char, @field_size_limit)
  end
  raise MalformedCSVError, "Illegal quoting on line #{@lineno}."
end
|
||
|
# Hands back the next piece of unquoted input, reading from @io in chunks
# that stop at a quote character or after @io_read_limit.
#
# A chunk is returned once it contains @row_sep (only the part up to and
# including the first @row_sep is returned; the rest stays in @data_buf),
# ends with @quote_char, or the stream is exhausted. Returns nil when there
# is no buffered data and @io has nothing more to read.
#
# Raises MalformedCSVError when the buffer grows past @field_size_limit
# without meeting any of those conditions.
#
# Before each extra read align_data_buf gets a chance to repair the buffer;
# per the original note this is for multibyte encodings, where a read may
# stop part-way through a character.
def io_get_unquoted
  @data_buf ||= @io.gets(@quote_char, @io_read_limit)
  return unless @data_buf

  loop do
    if (sep_at = @data_buf.index(@row_sep))
      row_end = sep_at + @row_sep.size
      # separator is the very end of the buffer: hand over the whole thing
      break if row_end == @data_buf.size
      return @data_buf.slice!(0, row_end)
    end

    break if @io.eof? || @data_buf.end_with?(@quote_char)

    if @field_size_limit && @data_buf.size > @field_size_limit
      raise MalformedCSVError, "Field size exceeded on line #{@lineno}."
    end

    # when align_data_buf touched the buffer, re-examine it before reading on
    next if align_data_buf
    @data_buf += @io.gets(@quote_char, @io_read_limit)
  end

  pending, @data_buf = @data_buf, nil
  pending
end
|
||
|
# Pulls single bytes from @io onto @data_buf, at most 10 of them, stopping
# as soon as the buffer is valid in its encoding or @io runs dry.
#
# Returns nil when the buffer was already valid (or @io was already at eof)
# and nothing was attempted; otherwise returns a truthy value, so callers can
# tell that the buffer was modified.
def align_data_buf
  settled = lambda { @data_buf.valid_encoding? || @io.eof? }
  return if settled.call

  10.times do
    break true if settled.call
    extra_byte = @io.read(1).force_encoding(raw_encoding)
    @data_buf += extra_byte
  end
end
|
||
|
# True only when @io reports eof and no buffered data is waiting in
# @data_buf.
def data_buf_eof?
  stream_done = @io.eof?
  stream_done && !@data_buf
end
|
||
|
#
|
||
|
# Stores the indicated separators for later use.
|
||
|
#
|
||
|
# If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
|
||
| ... | ... | |
|
# store the parser behaviors
|
||
|
@skip_blanks = options.delete(:skip_blanks)
|
||
|
@field_size_limit = options.delete(:field_size_limit)
|
||
|
# prebuild Regexps for faster parsing
|
||
|
esc_row_sep = escape_re(@row_sep)
|
||
|
esc_quote = escape_re(@quote_char)
|
||
|
@parsers = {
|
||
|
# for detecting parse errors
|
||
|
quote_or_nl: encode_re("[", esc_quote, "\r\n]"),
|
||
|
nl_or_lf: encode_re("[\r\n]"),
|
||
|
stray_quote: encode_re( "[^", esc_quote, "]", esc_quote,
|
||
|
"[^", esc_quote, "]" ),
|
||
|
# safer than chomp!()
|
||
|
line_end: encode_re(esc_row_sep, "\\z"),
|
||
|
# illegal unquoted characters
|
||
|
return_newline: encode_str("\r\n")
|
||
|
}
|
||
|
@io_read_limit = options.delete(:io_read_limit)
|
||
|
@nl_lf = encode_str("\r\n")
|
||
|
end
|
||
|
#
|
||