Bug #1787 » lib_uri_common-r2.patch
lib/uri/common.rb (working copy) | ||
---|---|---|
#
|
||
# Author:: Akira Yamada <akira@ruby-lang.org>
|
||
# Revision:: $Id$
|
||
# License::
|
||
# License::
|
||
# You can redistribute it and/or modify it under the same terms as Ruby.
|
||
#
|
||
... | ... | |
# alpha = lowalpha | upalpha
|
||
ALPHA = "a-zA-Z"
|
||
# digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
|
||
# "8" | "9"
|
||
DIGIT = "0-9"
|
||
# alphanum = alpha | digit
|
||
ALNUM = "#{ALPHA}\\d"
|
||
ALNUM = "#{ALPHA}#{DIGIT}"
|
||
# hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
|
||
# "a" | "b" | "c" | "d" | "e" | "f"
|
||
HEX = "a-fA-F\\d"
|
||
HEX = "#{DIGIT}a-fA-F"
|
||
# escaped = "%" hex hex
|
||
ESCAPED = "%[#{HEX}]{2}"
|
||
# mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
|
||
... | ... | |
UNRESERVED = "-_.!~*'()#{ALNUM}"
|
||
# reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||
# "$" | ","
|
||
# reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||
# reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
||
# "$" | "," | "[" | "]" (RFC 2732)
|
||
RESERVED = ";/?:@&=+$,\\[\\]"
|
||
... | ... | |
def split(uri)
|
||
case uri
|
||
when ''
|
||
# null uri
|
||
# null uri
|
||
when @regexp[:ABS_URI]
|
||
scheme, opaque, userinfo, host, port,
|
||
registry, path, query, fragment = $~[1..-1]
|
||
scheme, opaque, userinfo, host, port,
|
||
registry, path, query, fragment = $~[1..-1]
|
||
# URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
||
# absoluteURI = scheme ":" ( hier_part | opaque_part )
|
||
# hier_part = ( net_path | abs_path ) [ "?" query ]
|
||
# opaque_part = uric_no_slash *uric
|
||
# URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
||
# abs_path = "/" path_segments
|
||
# net_path = "//" authority [ abs_path ]
|
||
# absoluteURI = scheme ":" ( hier_part | opaque_part )
|
||
# hier_part = ( net_path | abs_path ) [ "?" query ]
|
||
# opaque_part = uric_no_slash *uric
|
||
# abs_path = "/" path_segments
|
||
# net_path = "//" authority [ abs_path ]
|
||
# authority = server | reg_name
|
||
# server = [ [ userinfo "@" ] hostport ]
|
||
if !scheme
|
||
raise InvalidURIError,
|
||
"bad URI(absolute but no scheme): #{uri}"
|
||
end
|
||
if !opaque && (!path && (!host && !registry))
|
||
raise InvalidURIError,
|
||
"bad URI(absolute but no path): #{uri}"
|
||
end
|
||
# authority = server | reg_name
|
||
# server = [ [ userinfo "@" ] hostport ]
|
||
if !scheme
|
||
raise InvalidURIError,
|
||
"bad URI(absolute but no scheme): #{uri}"
|
||
end
|
||
if !opaque && (!path && (!host && !registry))
|
||
raise InvalidURIError,
|
||
"bad URI(absolute but no path): #{uri}"
|
||
end
|
||
when @regexp[:REL_URI]
|
||
scheme = nil
|
||
opaque = nil
|
||
scheme = nil
|
||
opaque = nil
|
||
userinfo, host, port, registry,
|
||
rel_segment, abs_path, query, fragment = $~[1..-1]
|
||
if rel_segment && abs_path
|
||
path = rel_segment + abs_path
|
||
elsif rel_segment
|
||
path = rel_segment
|
||
elsif abs_path
|
||
path = abs_path
|
||
end
|
||
# URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
||
# relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
|
||
userinfo, host, port, registry,
|
||
rel_segment, abs_path, query, fragment = $~[1..-1]
|
||
if rel_segment && abs_path
|
||
path = rel_segment + abs_path
|
||
elsif rel_segment
|
||
path = rel_segment
|
||
elsif abs_path
|
||
path = abs_path
|
||
end
|
||
# URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
||
# relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
|
||
# net_path = "//" authority [ abs_path ]
|
||
# abs_path = "/" path_segments
|
||
# rel_path = rel_segment [ abs_path ]
|
||
# net_path = "//" authority [ abs_path ]
|
||
# abs_path = "/" path_segments
|
||
# rel_path = rel_segment [ abs_path ]
|
||
# authority = server | reg_name
|
||
# server = [ [ userinfo "@" ] hostport ]
|
||
# authority = server | reg_name
|
||
# server = [ [ userinfo "@" ] hostport ]
|
||
else
|
||
raise InvalidURIError, "bad URI(is not URI?): #{uri}"
|
||
raise InvalidURIError, "bad URI(is not URI?): #{uri}"
|
||
end
|
||
path = '' if !path && !opaque # (see RFC2396 Section 5.2)
|
||
ret = [
|
||
scheme,
|
||
userinfo, host, port, # X
|
||
registry, # X
|
||
path, # Y
|
||
opaque, # Y
|
||
query,
|
||
fragment
|
||
scheme,
|
||
userinfo, host, port, # X
|
||
registry, # X
|
||
path, # Y
|
||
opaque, # Y
|
||
query,
|
||
fragment
|
||
]
|
||
return ret
|
||
end
|
||
def parse(uri)
|
||
scheme, userinfo, host, port,
|
||
registry, path, opaque, query, fragment = self.split(uri)
|
||
scheme, userinfo, host, port,
|
||
registry, path, opaque, query, fragment = self.split(uri)
|
||
if scheme && URI.scheme_list.include?(scheme.upcase)
|
||
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
|
||
registry, path, opaque, query,
|
||
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
|
||
registry, path, opaque, query,
|
||
fragment, self)
|
||
else
|
||
Generic.new(scheme, userinfo, host, port,
|
||
registry, path, opaque, query,
|
||
fragment, self)
|
||
Generic.new(scheme, userinfo, host, port,
|
||
registry, path, opaque, query,
|
||
fragment, self)
|
||
end
|
||
end
|
||
def join(*str)
|
||
u = self.parse(str[0])
|
||
str[1 .. -1].each do |x|
|
||
u = u.merge(x)
|
||
u = u.merge(x)
|
||
end
|
||
u
|
||
end
|
||
def extract(str, schemes = nil, &block)
|
||
if block_given?
|
||
str.scan(make_regexp(schemes)) { yield $& }
|
||
nil
|
||
str.scan(make_regexp(schemes)) { yield $& }
|
||
nil
|
||
else
|
||
result = []
|
||
str.scan(make_regexp(schemes)) { result.push $& }
|
||
result
|
||
result = []
|
||
str.scan(make_regexp(schemes)) { result.push $& }
|
||
result
|
||
end
|
||
end
|
||
def make_regexp(schemes = nil)
|
||
unless schemes
|
||
@regexp[:ABS_URI_REF]
|
||
@regexp[:ABS_URI_REF]
|
||
else
|
||
/(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
|
||
/(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
|
||
end
|
||
end
|
||
... | ... | |
# hostname = *( domainlabel "." ) toplabel [ "." ]
|
||
unless hostname
|
||
ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?"
|
||
ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?"
|
||
end
|
||
# RFC 2373, APPENDIX B:
|
||
... | ... | |
# allowed too. Here is a replacement.
|
||
#
|
||
# IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
|
||
ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
|
||
v4digit = "#{PATTERN::DIGIT}{1,3}"
|
||
ipv4addr = "(?:#{v4digit}\\.){3}#{v4digit}"
|
||
ret[:IPV4ADDR] = ipv4addr
|
||
# hex4 = 1*4HEXDIG
|
||
hex4 = "[#{PATTERN::HEX}]{1,4}"
|
||
# lastpart = hex4 | IPv4address
|
||
... | ... | |
# host = hostname | IPv4address | IPv6reference (RFC 2732)
|
||
ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
|
||
# port = *digit
|
||
port = '\d*'
|
||
ret[:PORT] = port = "[#{PATTERN::DIGIT}]*"
|
||
# hostport = host [ ":" port ]
|
||
ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"
|
||
hostport = "#{host}(?::#{port})?"
|
||
# userinfo = *( unreserved | escaped |
|
||
# ";" | ":" | "&" | "=" | "+" | "$" | "," )
|
||
... | ... | |
ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"
|
||
# scheme = alpha *( alpha | digit | "+" | "-" | "." )
|
||
ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}\\d]*"
|
||
ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}#{PATTERN::DIGIT}]*"
|
||
# abs_path = "/" path_segments
|
||
ret[:ABS_PATH] = abs_path = "/#{path_segments}"
|
||
... | ... | |
ret[:URI_REF] = uri_ref = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"
|
||
ret[:X_ABS_URI] = "
|
||
(#{scheme}): (?# 1: scheme)
|
||
(#{scheme}): (?# 1: scheme)
|
||
(?:
|
||
(#{opaque_part}) (?# 2: opaque)
|
||
(#{opaque_part}) (?# 2: opaque)
|
||
|
|
||
(?:(?:
|
||
//(?:
|
||
(?:(?:(#{userinfo})@)? (?# 3: userinfo)
|
||
(?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
|
||
(?:(?:(#{userinfo})@)? (?# 3: userinfo)
|
||
(?:(#{host}) (?# 4: host)
|
||
(?::([#{PATTERN::DIGIT}]*))?))? (?# 5: port)
|
||
|
|
||
(#{reg_name}) (?# 6: registry)
|
||
(#{reg_name}) (?# 6: registry)
|
||
)
|
||
|
|
||
(?!//)) (?# XXX: '//' is the mark for hostport)
|
||
(#{abs_path})? (?# 7: path)
|
||
)(?:\\?(#{query}))? (?# 8: query)
|
||
(?!//)) (?# XXX: '//' is the mark for hostport)
|
||
(#{abs_path})? (?# 7: path)
|
||
)(?:\\?(#{query}))? (?# 8: query)
|
||
)
|
||
(?:\\#(#{fragment}))? (?# 9: fragment)
|
||
(?:\\#(#{fragment}))? (?# 9: fragment)
|
||
"
|
||
ret[:X_REL_URI] = "
|
||
... | ... | |
(?:
|
||
//
|
||
(?:
|
||
(?:(#{userinfo})@)? (?# 1: userinfo)
|
||
(#{host})?(?::(\\d*))? (?# 2: host, 3: port)
|
||
(?:(#{userinfo})@)? (?# 1: userinfo)
|
||
(#{host})? (?# 2: host)
|
||
(?::([#{PATTERN::DIGIT}]*))? (?# 3: port)
|
||
|
|
||
(#{reg_name}) (?# 4: registry)
|
||
(#{reg_name}) (?# 4: registry)
|
||
)
|
||
)
|
||
|
|
||
(#{rel_segment}) (?# 5: rel_segment)
|
||
(#{rel_segment}) (?# 5: rel_segment)
|
||
)?
|
||
(#{abs_path})? (?# 6: abs_path)
|
||
(?:\\?(#{query}))? (?# 7: query)
|
||
(?:\\#(#{fragment}))? (?# 8: fragment)
|
||
(#{abs_path})? (?# 6: abs_path)
|
||
(?:\\?(#{query}))? (?# 7: query)
|
||
(?:\\#(#{fragment}))? (?# 8: fragment)
|
||
"
|
||
ret
|
||
... | ... | |
end
|
||
end
|
||
else
|
||
raise ArgumentError,
|
||
raise ArgumentError,
|
||
"expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
|
||
end
|
||
tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
|
||
... | ... | |
def self.scheme_list
|
||
@@schemes
|
||
end
|
||
|
||
#
|
||
# Base class for all URI exceptions.
|
||
#
|
||
... | ... | |
# * Opaque
|
||
# * Query
|
||
# * Fragment
|
||
#
|
||
#
|
||
# == Usage
|
||
#
|
||
# require 'uri'
|
||
... | ... | |
# == Description
|
||
#
|
||
# Creates an instance of one of URI's subclasses from the string.
|
||
#
|
||
#
|
||
# == Raises
|
||
#
|
||
# URI::InvalidURIError
|
||
... | ... | |
# uri = URI.parse("http://www.ruby-lang.org/")
|
||
# p uri
|
||
# # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
|
||
# p uri.scheme
|
||
# # => "http"
|
||
# p uri.host
|
||
# # => "www.ruby-lang.org"
|
||
#
|
||
# p uri.scheme
|
||
# # => "http"
|
||
# p uri.host
|
||
# # => "www.ruby-lang.org"
|
||
#
|
||
def self.parse(uri)
|
||
DEFAULT_PARSER.parse(uri)
|
||
end
|
||
... | ... | |
#
|
||
# == Args
|
||
#
|
||
# +str+::
|
||
# +str+::
|
||
# String to extract URIs from.
|
||
# +schemes+::
|
||
# Limit URI matching to specific schemes.
|
||
... | ... | |
#
|
||
# == Args
|
||
#
|
||
# +match_schemes+::
|
||
# +match_schemes+::
|
||
# Array of schemes. If given, the resulting regexp matches URIs
|
||
# whose scheme is one of the match_schemes.
|
||
#
|
||
#
|
||
# == Description
|
||
# Returns a Regexp object which matches URI-like strings.
|
||
# The Regexp object returned by this method includes an arbitrary
|
||
# number of capture groups (parentheses). Never rely on their number.
|
||
#
|
||
#
|
||
# == Usage
|
||
#
|
||
# require 'uri'
|
||
#
|
||
# # extract first URI from html_string
|
||
# html_string.slice(URI.regexp)
|
||
#
|
||
#
|
||
# # remove ftp URIs
|
||
# html_string.sub(URI.regexp(['ftp']), '')
|
||
#
|
||
#
|
||
# # You should not rely on the number of parentheses
|
||
# html_string.scan(URI.regexp) do |*matches|
|
||
# p $&
|