[#406 state:resolved] Updated URI package in extlib to 1.56, the late…

…st in CPAN.
openmelody · Oct 22, 2010 · 58af58c · 58af58c
1 parent 6e30949
commit 58af58c
Show file tree

Hide file tree

Showing 18 changed files with 585 additions and 70 deletions.
diff --git a/extlib/URI.pm b/extlib/URI.pm
@@ -2,9 +2,9 @@ package URI;
 
 use strict;
 use vars qw($VERSION);
-$VERSION = "1.36";
+$VERSION = "1.56";
 
-use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME);
+use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME $DEFAULT_QUERY_FORM_DELIMITER);
 
 my %implements;  # mapping from scheme to implementor class
 
@@ -22,12 +22,16 @@ use Carp ();
 use URI::Escape ();
 
 use overload ('""'     => sub { ${$_[0]} },
-	      '=='     => sub { overload::StrVal($_[0]) eq
-                                overload::StrVal($_[1])
-                              },
+              '=='     => sub { _obj_eq(@_) },
+              '!='     => sub { !_obj_eq(@_) },
               fallback => 1,
              );
 
+# Check if two objects are the same object
+sub _obj_eq {
+    return overload::StrVal($_[0]) eq overload::StrVal($_[1]);
+}
+
 sub new
 {
     my($class, $uri, $scheme) = @_;
@@ -74,14 +78,22 @@ sub _init
     my $class = shift;
     my($str, $scheme) = @_;
     # find all funny characters and encode the bytes.
-    $str =~ s*([^$uric\#])* URI::Escape::escape_char($1) *ego;
+    $str = $class->_uric_escape($str);
     $str = "$scheme:$str" unless $str =~ /^$scheme_re:/o ||
                                  $class->_no_scheme_ok;
     my $self = bless \$str, $class;
     $self;
 }
 
 
+sub _uric_escape
+{
+    my($class, $str) = @_;
+    $str =~ s*([^$uric\#])* URI::Escape::escape_char($1) *ego;
+    return $str;
+}
+
+
 sub implementor
 {
     my($scheme, $impclass) = @_;
@@ -107,7 +119,7 @@ sub implementor
     # preloaded (with 'use') implementation
     $ic = "URI::$scheme";  # default location
 
-    # turn scheme into a valid perl identifier by a simple tranformation...
+    # turn scheme into a valid perl identifier by a simple transformation...
     $ic =~ s/\+/_P/g;
     $ic =~ s/\./_O/g;
     $ic =~ s/\-/_/g;
@@ -244,6 +256,34 @@ sub as_string
 }
 
 
+sub as_iri
+{
+    my $self = shift;
+    my $str = $$self;
+    if ($str =~ s/%([89a-fA-F][0-9a-fA-F])/chr(hex($1))/eg) {
+	# All this crap because the more obvious:
+	#
+	#   Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift })
+	#
+	# doesn't work before Encode 2.39.  Wait for a standard release
+	# to bundle that version.
+
+	require Encode;
+	my $enc = Encode::find_encoding("UTF-8");
+	my $u = "";
+	while (length $str) {
+	    $u .= $enc->decode($str, Encode::FB_QUIET());
+	    if (length $str) {
+		# escape next char
+		$u .= URI::Escape::escape_char(substr($str, 0, 1, ""));
+	    }
+	}
+	$str = $u;
+    }
+    return $str;
+}
+
+
 sub canonical
 {
     # Make sure scheme is lowercased, that we don't escape unreserved chars,
@@ -281,6 +321,8 @@ sub eq {
 sub abs { $_[0]; }
 sub rel { $_[0]; }
 
+sub secure { 0 }
+
 # help out Storable
 sub STORABLE_freeze {
        my($self, $cloning) = @_;
@@ -481,11 +523,17 @@ as an escaped string.
 
 =item $uri->as_string
 
-Returns a URI object to a plain string.  URI objects are
+Returns a URI object to a plain ASCII string.  URI objects are
 also converted to plain strings automatically by overloading.  This
 means that $uri objects can be used as plain strings in most Perl
 constructs.
 
+=item $uri->as_iri
+
+Returns a Unicode string representing the URI.  Escaped UTF-8 sequences
+representing non-ASCII characters are turned into their corresponding Unicode
+code point.
+
 =item $uri->canonical
 
 Returns a normalized version of the URI.  The rules
@@ -523,6 +571,11 @@ Returns a relative URI reference if it is possible to
 make one that denotes the same resource relative to $base_uri.
 If not, then $uri is simply returned.
 
+=item $uri->secure
+
+Returns a TRUE value if the URI is considered to point to a resource on
+a secure channel, such as an SSL or TLS encrypted one.
+
 =back
 
 =head1 GENERIC METHODS
@@ -586,10 +639,16 @@ the $uri.
 
 =item $uri->query_form( $key1 => $val1, $key2 => $val2, ... )
 
+=item $uri->query_form( $key1 => $val1, $key2 => $val2, ..., $delim )
+
 =item $uri->query_form( \@key_value_pairs )
 
+=item $uri->query_form( \@key_value_pairs, $delim )
+
 =item $uri->query_form( \%hash )
 
+=item $uri->query_form( \%hash, $delim )
+
 Sets and returns query components that use the
 I<application/x-www-form-urlencoded> format.  Key/value pairs are
 separated by "&", and the key is separated from the value by a "="
@@ -614,6 +673,13 @@ All the following statements have the same effect:
     $uri->query_form([ foo => [1, 2] ]);
     $uri->query_form({ foo => [1, 2] });
 
+The $delim parameter can be passed as ";" to force the key/value pairs
+to be delimited by ";" instead of "&" in the query string.  This
+practice is often recommended for URLs embedded in HTML or XML
+documents as this avoids the trouble of escaping the "&" character.
+You might also set the $URI::DEFAULT_QUERY_FORM_DELIMITER variable to
+";" for the same global effect.
+
 The C<URI::QueryParam> module can be loaded to add further methods to
 manipulate the form of a URI.  See L<URI::QueryParam> for details.
 
@@ -663,6 +729,15 @@ Sets and returns the unescaped hostname.
 If the $new_host string ends with a colon and a number, then this
 number also sets the port.
 
+For IPv6 addresses the brackets around the raw address is removed in the return
+value from $uri->host.  When setting the host attribute to an IPv6 address you
+can use a raw address or one enclosed in brackets.  The address needs to be
+enclosed in brackets if you want to pass in a new port value as well.
+
+=item $uri->ihost
+
+Returns the host in Unicode form.  Any IDNA A-labels are turned into U-labels.
+
 =item $uri->port
 
 =item $uri->port( $new_port )
@@ -683,6 +758,10 @@ unit.  The returned value includes a port, even if it matches the
 default port.  The host part and the port part are separated by a
 colon: ":".
 
+For IPv6 addresses the bracketing is preserved; thus
+URI->new("http://[::1]/")->host_port returns "[::1]:80".  Contrast this with
+$uri->host which will remove the brackets.
+
 =item $uri->default_port
 
 Returns the default port of the URI scheme to which $uri
@@ -794,9 +873,13 @@ C<URI> objects belonging to the mailto scheme support the common
 methods and the generic query methods.  In addition, they support the
 following mailto-specific methods: $uri->to, $uri->headers.
 
+Note that the "foo@example.com" part of a mailto is I<not> the
+C<userinfo> and C<host> but instead the C<path>.  This allows a
+mailto URI to contain multiple comma separated email addresses.
+
 =item B<mms>:
 
-The I<mms> URL specification can be found at L<http://sdp.ppona.com/>
+The I<mms> URL specification can be found at L<http://sdp.ppona.com/>.
 C<URI> objects belonging to the mms scheme support the common,
 generic, and server methods, with the exception of userinfo and
 query-related sub-components.
@@ -844,7 +927,7 @@ instead of TCP.  The syntax is the same as rtsp.
 
 =item B<rsync>:
 
-Information about rsync is available from http://rsync.samba.org.
+Information about rsync is available from L<http://rsync.samba.org/>.
 C<URI> objects belonging to the rsync scheme support the common,
 generic and server methods.  In addition, they provide methods to
 access the userinfo sub-components: $uri->user and $uri->password.
@@ -881,7 +964,7 @@ common, generic and server methods.
 
 =item B<ssh>:
 
-Information about ssh is available at http://www.openssh.com/.
+Information about ssh is available at L<http://www.openssh.com/>.
 C<URI> objects belonging to the ssh scheme support the common,
 generic and server methods. In addition, they provide methods to
 access the userinfo sub-components: $uri->user and $uri->password.
@@ -896,7 +979,7 @@ and the Namespace-Specific String respectively.
 The Namespace Identifier basically works like the Scheme identifier of
 URIs, and further divides the URN namespace.  Namespace Identifier
 assignments are maintained at
-<http://www.iana.org/assignments/urn-namespaces>.
+L<http://www.iana.org/assignments/urn-namespaces>.
 
 Letter case is not significant for the Namespace Identifier.  It is
 always returned in lower case by the $uri->nid method.  The $uri->_nid
@@ -958,6 +1041,11 @@ examples:
   URI->new("../../../foo")->abs("http://host/a/b")
       ==> "http://host/foo"
 
+=item $URI::DEFAULT_QUERY_FORM_DELIMITER
+
+This value can be set to ";" to have the query form C<key=value> pairs
+delimited by ";" instead of "&" which is the default.
+
 =back
 
 =head1 BUGS
@@ -989,15 +1077,15 @@ L<URI::Split>, L<URI::Heuristic>
 RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax",
 Berners-Lee, Fielding, Masinter, August 1998.
 
-http://www.iana.org/assignments/uri-schemes
+L<http://www.iana.org/assignments/uri-schemes>
 
-http://www.iana.org/assignments/urn-namespaces
+L<http://www.iana.org/assignments/urn-namespaces>
 
-http://www.w3.org/Addressing/
+L<http://www.w3.org/Addressing/>
 
 =head1 COPYRIGHT
 
-Copyright 1995-2004,2008 Gisle Aas.
+Copyright 1995-2009 Gisle Aas.
 
 Copyright 1995 Martijn Koster.
 

diff --git a/extlib/URI/Escape.pm b/extlib/URI/Escape.pm
@@ -15,26 +15,27 @@ URI::Escape - Escape and unescape unsafe characters
 =head1 DESCRIPTION
 
 This module provides functions to escape and unescape URI strings as
-defined by RFC 2396 (and updated by RFC 2732).
-A URI consists of a restricted set of characters,
-denoted as C<uric> in RFC 2396.  The restricted set of characters
-consists of digits, letters, and a few graphic symbols chosen from
-those common to most of the character encodings and input facilities
-available to Internet users:
+defined by RFC 3986.
 
-  "A" .. "Z", "a" .. "z", "0" .. "9",
-  ";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]",   # reserved
-  "-", "_", ".", "!", "~", "*", "'", "(", ")"
+A URI consists of a restricted set of characters.  The restricted set
+of characters consists of digits, letters, and a few graphic symbols
+chosen from those common to most of the character encodings and input
+facilities available to Internet users.  They are made up of the
+"unreserved" and "reserved" character sets as defined in RFC 3986.
+
+   unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+   reserved      = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+                   "!" / "$" / "&" / "'" / "(" / ")"
+                 / "*" / "+" / "," / ";" / "="
 
 In addition, any byte (octet) can be represented in a URI by an escape
 sequence: a triplet consisting of the character "%" followed by two
 hexadecimal digits.  A byte can also be represented directly by a
-character, using the US-ASCII character for that octet (iff the
-character is part of C<uric>).
+character, using the US-ASCII character for that octet.
 
-Some of the C<uric> characters are I<reserved> for use as delimiters
-or as part of certain URI components.  These must be escaped if they are
-to be treated as ordinary data.  Read RFC 2396 for further details.
+Some of the characters are I<reserved> for use as delimiters or as
+part of certain URI components.  These must be escaped if they are to
+be treated as ordinary data.  Read RFC 3986 for further details.
 
 The functions provided (and exported by default) from this module are:
 
@@ -61,10 +62,10 @@ character class (between [ ]).  E.g.:
   "^A-Za-z"                     # everything not a letter
 
 The default set of characters to be escaped is all those which are
-I<not> part of the C<uric> character class shown above as well as the
-reserved characters.  I.e. the default is:
+I<not> part of the C<unreserved> character class shown above as well
+as the reserved characters.  I.e. the default is:
 
-  "^A-Za-z0-9\-_.!~*'()"
+    "^A-Za-z0-9\-\._~"
 
 =item uri_escape_utf8( $string )
 
@@ -88,12 +89,12 @@ will be the same as:
 
 but will even work for perl-5.6 for chars in the 128 .. 255 range.
 
-Note: Javascript has a function called escape() that produce the
+Note: JavaScript has a function called escape() that produces the
 sequence "%uXXXX" for chars in the 256 .. 65535 range.  This function
 has really nothing to do with URI escaping but some folks got confused
 since it "does the right thing" in the 0 .. 255 range.  Because of
 this you sometimes see "URIs" with these kind of escapes.  The
-JavaScript encodeURI() function is similar to uri_escape_utf8().
+JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
 
 =item uri_unescape($string,...)
 
@@ -145,7 +146,7 @@ require Exporter;
 @ISA = qw(Exporter);
 @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
 @EXPORT_OK = qw(%escapes);
-$VERSION = "3.29";
+$VERSION = "3.30";
 
 use Carp ();
 
@@ -154,7 +155,12 @@ for (0..255) {
     $escapes{chr($_)} = sprintf("%%%02X", $_);
 }
 
-my %subst;  # compiled patternes
+my %subst;  # compiled patterns
+
+my %Unsafe = (
+    RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
+    RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
+);
 
 sub uri_escape
 {
@@ -169,8 +175,7 @@ sub uri_escape
 	}
 	&{$subst{$patn}}($text);
     } else {
-	# Default unsafe characters.  RFC 2732 ^(uric - reserved)
-	$text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge;
+	$text =~ s/($Unsafe{RFC3986})/$escapes{$1} || _fail_hi($1)/ge;
     }
     $text;
 }