Skip to content

Commit

Permalink
[#406 state:resolved] Updated URI package in extlib to 1.56, the late…
Browse files Browse the repository at this point in the history
…st in CPAN.
  • Loading branch information
tima committed Oct 22, 2010
1 parent 6e30949 commit 58af58c
Show file tree
Hide file tree
Showing 18 changed files with 585 additions and 70 deletions.
120 changes: 104 additions & 16 deletions extlib/URI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ package URI;

use strict;
use vars qw($VERSION);
$VERSION = "1.36";
$VERSION = "1.56";

use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME);
use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME $DEFAULT_QUERY_FORM_DELIMITER);

my %implements; # mapping from scheme to implementor class

Expand All @@ -22,12 +22,16 @@ use Carp ();
use URI::Escape ();

use overload ('""' => sub { ${$_[0]} },
'==' => sub { overload::StrVal($_[0]) eq
overload::StrVal($_[1])
},
'==' => sub { _obj_eq(@_) },
'!=' => sub { !_obj_eq(@_) },
fallback => 1,
);

# Check if two objects are the same object
sub _obj_eq {
return overload::StrVal($_[0]) eq overload::StrVal($_[1]);
}

sub new
{
my($class, $uri, $scheme) = @_;
Expand Down Expand Up @@ -74,14 +78,22 @@ sub _init
my $class = shift;
my($str, $scheme) = @_;
# find all funny characters and encode the bytes.
$str =~ s*([^$uric\#])* URI::Escape::escape_char($1) *ego;
$str = $class->_uric_escape($str);
$str = "$scheme:$str" unless $str =~ /^$scheme_re:/o ||
$class->_no_scheme_ok;
my $self = bless \$str, $class;
$self;
}


sub _uric_escape
{
my($class, $str) = @_;
$str =~ s*([^$uric\#])* URI::Escape::escape_char($1) *ego;
return $str;
}


sub implementor
{
my($scheme, $impclass) = @_;
Expand All @@ -107,7 +119,7 @@ sub implementor
# preloaded (with 'use') implementation
$ic = "URI::$scheme"; # default location

# turn scheme into a valid perl identifier by a simple tranformation...
# turn scheme into a valid perl identifier by a simple transformation...
$ic =~ s/\+/_P/g;
$ic =~ s/\./_O/g;
$ic =~ s/\-/_/g;
Expand Down Expand Up @@ -244,6 +256,34 @@ sub as_string
}


sub as_iri
{
my $self = shift;
my $str = $$self;
if ($str =~ s/%([89a-fA-F][0-9a-fA-F])/chr(hex($1))/eg) {
# All this crap because the more obvious:
#
# Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift })
#
# doesn't work before Encode 2.39. Wait for a standard release
# to bundle that version.

require Encode;
my $enc = Encode::find_encoding("UTF-8");
my $u = "";
while (length $str) {
$u .= $enc->decode($str, Encode::FB_QUIET());
if (length $str) {
# escape next char
$u .= URI::Escape::escape_char(substr($str, 0, 1, ""));
}
}
$str = $u;
}
return $str;
}


sub canonical
{
# Make sure scheme is lowercased, that we don't escape unreserved chars,
Expand Down Expand Up @@ -281,6 +321,8 @@ sub eq {
sub abs { $_[0]; }
sub rel { $_[0]; }

sub secure { 0 }

# help out Storable
sub STORABLE_freeze {
my($self, $cloning) = @_;
Expand Down Expand Up @@ -481,11 +523,17 @@ as an escaped string.
=item $uri->as_string
Returns a URI object to a plain string. URI objects are
Returns a URI object to a plain ASCII string. URI objects are
also converted to plain strings automatically by overloading. This
means that $uri objects can be used as plain strings in most Perl
constructs.
=item $uri->as_iri
Returns a Unicode string representing the URI. Escaped UTF-8 sequences
representing non-ASCII characters are turned into their corresponding Unicode
code point.
=item $uri->canonical
Returns a normalized version of the URI. The rules
Expand Down Expand Up @@ -523,6 +571,11 @@ Returns a relative URI reference if it is possible to
make one that denotes the same resource relative to $base_uri.
If not, then $uri is simply returned.
=item $uri->secure
Returns a TRUE value if the URI is considered to point to a resource on
a secure channel, such as an SSL or TLS encrypted one.
=back
=head1 GENERIC METHODS
Expand Down Expand Up @@ -586,10 +639,16 @@ the $uri.
=item $uri->query_form( $key1 => $val1, $key2 => $val2, ... )
=item $uri->query_form( $key1 => $val1, $key2 => $val2, ..., $delim )
=item $uri->query_form( \@key_value_pairs )
=item $uri->query_form( \@key_value_pairs, $delim )
=item $uri->query_form( \%hash )
=item $uri->query_form( \%hash, $delim )
Sets and returns query components that use the
I<application/x-www-form-urlencoded> format. Key/value pairs are
separated by "&", and the key is separated from the value by a "="
Expand All @@ -614,6 +673,13 @@ All the following statements have the same effect:
$uri->query_form([ foo => [1, 2] ]);
$uri->query_form({ foo => [1, 2] });
The $delim parameter can be passed as ";" to force the key/value pairs
to be delimited by ";" instead of "&" in the query string. This
practice is often recommended for URLs embedded in HTML or XML
documents as this avoids the trouble of escaping the "&" character.
You might also set the $URI::DEFAULT_QUERY_FORM_DELIMITER variable to
";" for the same global effect.
The C<URI::QueryParam> module can be loaded to add further methods to
manipulate the form of a URI. See L<URI::QueryParam> for details.
Expand Down Expand Up @@ -663,6 +729,15 @@ Sets and returns the unescaped hostname.
If the $new_host string ends with a colon and a number, then this
number also sets the port.
For IPv6 addresses the brackets around the raw address is removed in the return
value from $uri->host. When setting the host attribute to an IPv6 address you
can use a raw address or one enclosed in brackets. The address needs to be
enclosed in brackets if you want to pass in a new port value as well.
=item $uri->ihost
Returns the host in Unicode form. Any IDNA A-labels are turned into U-labels.
=item $uri->port
=item $uri->port( $new_port )
Expand All @@ -683,6 +758,10 @@ unit. The returned value includes a port, even if it matches the
default port. The host part and the port part are separated by a
colon: ":".
For IPv6 addresses the bracketing is preserved; thus
URI->new("http://[::1]/")->host_port returns "[::1]:80". Contrast this with
$uri->host which will remove the brackets.
=item $uri->default_port
Returns the default port of the URI scheme to which $uri
Expand Down Expand Up @@ -794,9 +873,13 @@ C<URI> objects belonging to the mailto scheme support the common
methods and the generic query methods. In addition, they support the
following mailto-specific methods: $uri->to, $uri->headers.
Note that the "foo@example.com" part of a mailto is I<not> the
C<userinfo> and C<host> but instead the C<path>. This allows a
mailto URI to contain multiple comma separated email addresses.
=item B<mms>:
The I<mms> URL specification can be found at L<http://sdp.ppona.com/>
The I<mms> URL specification can be found at L<http://sdp.ppona.com/>.
C<URI> objects belonging to the mms scheme support the common,
generic, and server methods, with the exception of userinfo and
query-related sub-components.
Expand Down Expand Up @@ -844,7 +927,7 @@ instead of TCP. The syntax is the same as rtsp.
=item B<rsync>:
Information about rsync is available from http://rsync.samba.org.
Information about rsync is available from L<http://rsync.samba.org/>.
C<URI> objects belonging to the rsync scheme support the common,
generic and server methods. In addition, they provide methods to
access the userinfo sub-components: $uri->user and $uri->password.
Expand Down Expand Up @@ -881,7 +964,7 @@ common, generic and server methods.
=item B<ssh>:
Information about ssh is available at http://www.openssh.com/.
Information about ssh is available at L<http://www.openssh.com/>.
C<URI> objects belonging to the ssh scheme support the common,
generic and server methods. In addition, they provide methods to
access the userinfo sub-components: $uri->user and $uri->password.
Expand All @@ -896,7 +979,7 @@ and the Namespace-Specific String respectively.
The Namespace Identifier basically works like the Scheme identifier of
URIs, and further divides the URN namespace. Namespace Identifier
assignments are maintained at
<http://www.iana.org/assignments/urn-namespaces>.
L<http://www.iana.org/assignments/urn-namespaces>.
Letter case is not significant for the Namespace Identifier. It is
always returned in lower case by the $uri->nid method. The $uri->_nid
Expand Down Expand Up @@ -958,6 +1041,11 @@ examples:
URI->new("../../../foo")->abs("http://host/a/b")
==> "http://host/foo"
=item $URI::DEFAULT_QUERY_FORM_DELIMITER
This value can be set to ";" to have the query form C<key=value> pairs
delimited by ";" instead of "&" which is the default.
=back
=head1 BUGS
Expand Down Expand Up @@ -989,15 +1077,15 @@ L<URI::Split>, L<URI::Heuristic>
RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax",
Berners-Lee, Fielding, Masinter, August 1998.
http://www.iana.org/assignments/uri-schemes
L<http://www.iana.org/assignments/uri-schemes>
http://www.iana.org/assignments/urn-namespaces
L<http://www.iana.org/assignments/urn-namespaces>
http://www.w3.org/Addressing/
L<http://www.w3.org/Addressing/>
=head1 COPYRIGHT
Copyright 1995-2004,2008 Gisle Aas.
Copyright 1995-2009 Gisle Aas.
Copyright 1995 Martijn Koster.
Expand Down
51 changes: 28 additions & 23 deletions extlib/URI/Escape.pm
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,27 @@ URI::Escape - Escape and unescape unsafe characters
=head1 DESCRIPTION
This module provides functions to escape and unescape URI strings as
defined by RFC 2396 (and updated by RFC 2732).
A URI consists of a restricted set of characters,
denoted as C<uric> in RFC 2396. The restricted set of characters
consists of digits, letters, and a few graphic symbols chosen from
those common to most of the character encodings and input facilities
available to Internet users:
defined by RFC 3986.
"A" .. "Z", "a" .. "z", "0" .. "9",
";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]", # reserved
"-", "_", ".", "!", "~", "*", "'", "(", ")"
A URI consists of a restricted set of characters. The restricted set
of characters consists of digits, letters, and a few graphic symbols
chosen from those common to most of the character encodings and input
facilities available to Internet users. They are made up of the
"unreserved" and "reserved" character sets as defined in RFC 3986.
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@"
"!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="
In addition, any byte (octet) can be represented in a URI by an escape
sequence: a triplet consisting of the character "%" followed by two
hexadecimal digits. A byte can also be represented directly by a
character, using the US-ASCII character for that octet (iff the
character is part of C<uric>).
character, using the US-ASCII character for that octet.
Some of the C<uric> characters are I<reserved> for use as delimiters
or as part of certain URI components. These must be escaped if they are
to be treated as ordinary data. Read RFC 2396 for further details.
Some of the characters are I<reserved> for use as delimiters or as
part of certain URI components. These must be escaped if they are to
be treated as ordinary data. Read RFC 3986 for further details.
The functions provided (and exported by default) from this module are:
Expand All @@ -61,10 +62,10 @@ character class (between [ ]). E.g.:
"^A-Za-z" # everything not a letter
The default set of characters to be escaped is all those which are
I<not> part of the C<uric> character class shown above as well as the
reserved characters. I.e. the default is:
I<not> part of the C<unreserved> character class shown above as well
as the reserved characters. I.e. the default is:
"^A-Za-z0-9\-_.!~*'()"
"^A-Za-z0-9\-\._~"
=item uri_escape_utf8( $string )
Expand All @@ -88,12 +89,12 @@ will be the same as:
but will even work for perl-5.6 for chars in the 128 .. 255 range.
Note: Javascript has a function called escape() that produce the
Note: JavaScript has a function called escape() that produces the
sequence "%uXXXX" for chars in the 256 .. 65535 range. This function
has really nothing to do with URI escaping but some folks got confused
since it "does the right thing" in the 0 .. 255 range. Because of
this you sometimes see "URIs" with these kind of escapes. The
JavaScript encodeURI() function is similar to uri_escape_utf8().
JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
=item uri_unescape($string,...)
Expand Down Expand Up @@ -145,7 +146,7 @@ require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
@EXPORT_OK = qw(%escapes);
$VERSION = "3.29";
$VERSION = "3.30";

use Carp ();

Expand All @@ -154,7 +155,12 @@ for (0..255) {
$escapes{chr($_)} = sprintf("%%%02X", $_);
}

my %subst; # compiled patternes
my %subst; # compiled patterns

my %Unsafe = (
RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
);

sub uri_escape
{
Expand All @@ -169,8 +175,7 @@ sub uri_escape
}
&{$subst{$patn}}($text);
} else {
# Default unsafe characters. RFC 2732 ^(uric - reserved)
$text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge;
$text =~ s/($Unsafe{RFC3986})/$escapes{$1} || _fail_hi($1)/ge;
}
$text;
}
Expand Down
Loading

0 comments on commit 58af58c

Please sign in to comment.