=encoding utf8 =head1 NAME IRI - Internationalized Resource Identifiers =head1 VERSION This document describes IRI version 0.013 =head1 SYNOPSIS use IRI; my $i = IRI->new(value => 'https://example.org:80/index#frag'); say $i->scheme; # 'https' say $i->path; # '/index' my $base = IRI->new(value => "http://www.hestebedg\x{e5}rd.dk/"); my $i = IRI->new(value => '#frag', base => $base); say $i->abs; # 'http://www.hestebedgÄrd.dk/#frag' # Defer parsing of the IRI until necessary my $i = IRI->new(value => "http://www.hestebedg\x{e5}rd.dk/", lazy => 1); say $i->path; # path is parsed here =head1 DESCRIPTION The IRI module provides an object representation for Internationalized Resource Identifiers (IRIs) as defined by L and supports their parsing, serializing, and base resolution. =head1 ATTRIBUTES =over 4 =item C<< lazy >> A boolean value indicating whether the IRI should be parsed (and validated) during object construction (false), or parsed only when an IRI component is accessed (true). If no components are ever needed (e.g. an IRI is constructed with a C<< value >> and C<< value >> is the only accessor ever called), no parsing will take place. =back =head1 METHODS =over 4 =item C<< as_string >> Returns the absolute IRI string resolved against the base IRI, if present; the relative IRI string otherwise. =item C<< abs >> Returns the absolute IRI string (resolved against the base IRI if present). =item C<< scheme >> =item C<< host >> =item C<< port >> =item C<< user >> =item C<< path >> =item C<< fragment >> =item C<< query >> Returns the respective component of the parsed IRI. =cut { package IRI; use v5.10.1; use warnings; our $VERSION = '0.013'; use Moo; use MooX::HandlesVia; use Types::Standard qw(Str InstanceOf HashRef Bool); use Scalar::Util qw(blessed); # class_type 'URI'; # coerce 'IRI' => from 'Str' => via { IRI->new( value => $_ ) }; # coerce 'IRI' => from 'URI' => via { IRI->new( value => $_->as_string ) }; has 'lazy' => (is => 'ro', isa => Bool, default => 0); has '_initialized' => (is => 'rw', isa => Bool, default => 0, init_arg => undef); has 'base' => (is => 'ro', isa => InstanceOf['IRI'], predicate => 'has_base', coerce => sub { my $base = shift; if (blessed($base)) { if ($base->isa('IRI')) { return $base; } elsif ($base->isa('URI')) { return IRI->new( value => $base->as_string ); } } else { return IRI->new($base); } }); has 'value' => (is => 'ro', isa => Str, default => ''); has 'components' => (is => 'ro', writer => '_set_components'); has 'abs' => (is => 'ro', lazy => 1, builder => '_abs'); has 'resolved_components' => ( is => 'ro', isa => HashRef, lazy => 1, builder => '_resolved_components', predicate => 1, handles_via => 'Hash', handles => { authority => [ accessor => 'authority' ], scheme => [ accessor => 'scheme' ], host => [ accessor => 'host' ], port => [ accessor => 'port' ], user => [ accessor => 'user' ], path => [ accessor => 'path' ], fragment => [ accessor => 'fragment' ], query => [ accessor => 'query' ], }, ); around BUILDARGS => sub { my $orig = shift; my $class = shift; if (scalar(@_) == 1) { return $class->$orig(value => shift); } return $class->$orig(@_); }; sub BUILD { my $self = shift; if ($self->has_resolved_components) { $self->_set_components($self->resolved_components); $self->_initialized(1); } else { unless ($self->lazy) { my $comp = $self->_parse_components($self->value); } } } before [qw(components as_string abs resolved_components scheme host port user path fragment query)] => sub { my $self = shift; if (not $self->_initialized) { # warn "Lazily initializing IRI"; my $comp = $self->_parse_components($self->value); } }; # These regexes are (mostly) from the syntax grammar in RFC 3987 my $HEXDIG = qr<[0-9A-F]>o; my $ALPHA = qr<[A-Za-z]>o; my $subdelims = qr<[!\$&'()*+,;=]>xo; my $gendelims = qr<[":/?#@] | \[ | \]>xo; my $reserved = qr<${gendelims} | ${subdelims}>o; my $unreserved = qr<${ALPHA} | [0-9] | [-._~]>xo; my $pctencoded = qr<%[0-9A-Fa-f]{2}>o; my $decoctet = qr< [0-9] # 0-9 | [1-9][0-9] # 10-99 | 1 [0-9]{2} # 100-199 | 2 [0-4] [0-9] # 200-249 | 25 [0-5] # 250-255 >xo; my $IPv4address = qr< # IPv4address ${decoctet}[.]${decoctet}[.]${decoctet}[.]${decoctet} >xo; my $h16 = qr<${HEXDIG}{1,4}>o; my $ls32 = qr< ( ${h16} : ${h16} ) | ${IPv4address} >xo; my $IPv6address = qr< # IPv6address ( ( ${h16} : ){6} ${ls32}) | ( :: ( ${h16} : ){5} ${ls32}) | (( ${h16} )? :: ( ${h16} : ){4} ${ls32}) | (( ( ${h16} : ){0,1} ${h16} )? :: ( ${h16} : ){3} ${ls32}) | (( ( ${h16} : ){0,2} ${h16} )? :: ( ${h16} : ){2} ${ls32}) | (( ( ${h16} : ){0,3} ${h16} )? :: ${h16} : ${ls32}) | (( ( ${h16} : ){0,4} ${h16} )? :: ${ls32}) | (( ( ${h16} : ){0,5} ${h16} )? :: ${h16}) | (( ( ${h16} : ){0,6} ${h16} )? ::) >xo; my $IPvFuture = qrxo; my $IPliteral = qr<\[ # IPliteral (${IPv6address} | ${IPvFuture}) \] >xo; my $port = qr<(?[0-9]*)>o; my $scheme = qr<(?${ALPHA} ( ${ALPHA} | [0-9] | [+] | [-] | [.] )*)>xo; my $iprivate = qr<[\x{E000}-\x{F8FF}] | [\x{F0000}-\x{FFFFD}] | [\x{100000}-\x{10FFFD}]>xo; my $ucschar = qr< [\x{a0}-\x{d7ff}] | [\x{f900}-\x{fdcf}] | [\x{fdf0}-\x{ffef}] | [\x{10000}-\x{1FFFD}] | [\x{20000}-\x{2FFFD}] | [\x{30000}-\x{3FFFD}] | [\x{40000}-\x{4FFFD}] | [\x{50000}-\x{5FFFD}] | [\x{60000}-\x{6FFFD}] | [\x{70000}-\x{7FFFD}] | [\x{80000}-\x{8FFFD}] | [\x{90000}-\x{9FFFD}] | [\x{A0000}-\x{AFFFD}] | [\x{B0000}-\x{BFFFD}] | [\x{C0000}-\x{CFFFD}] | [\x{D0000}-\x{DFFFD}] | [\x{E1000}-\x{EFFFD}] >xo; my $iunreserved = qr<${ALPHA}|[0-9]|[-._~]|${ucschar}>o; my $ipchar = qr<(${iunreserved})|(${pctencoded})|(${subdelims})|:|@>o; my $ifragment = qr<(?(${ipchar}|/|[?])*)>o; my $iquery = qr<(?(${ipchar}|${iprivate}|/|[?])*)>o; my $isegmentnznc = qr<(${iunreserved}|${pctencoded}|${subdelims}|@)+ # non-zero-length segment without any colon ":" >xo; my $isegmentnz = qr<${ipchar}+>o; my $isegment = qr<${ipchar}*>o; my $ipathempty = qr<>o; my $ipathrootless = qr<(?${isegmentnz}(/${isegment})*)>o; my $ipathnoscheme = qr<(?${isegmentnznc}(/${isegment})*)>o; my $ipathabsolute = qr<(?/(${isegmentnz}(/${isegment})*)?)>o; my $ipathabempty = qr<(?(/${isegment})*)>o; my $ipath = qr< ${ipathabempty} # begins with "/" or is empty | ${ipathabsolute} # begins with "/" but not "//" | ${ipathnoscheme} # begins with a non-colon segment | ${ipathrootless} # begins with a segment | ${ipathempty} # zero characters >xo; my $iregname = qr<(${iunreserved}|${pctencoded}|${subdelims})*>o; my $ihost = qr<(?${IPliteral}|${IPv4address}|${iregname})>o; my $iuserinfo = qr<(?(${iunreserved}|${pctencoded}|${subdelims}|:)*)>o; my $iauthority = qr<(?(${iuserinfo}@)?${ihost}(:${port})?)>o; my $irelativepart = qr< (//${iauthority}${ipathabempty}) | ${ipathabsolute} | ${ipathnoscheme} | ${ipathempty} >xo; my $irelativeref = qr<${irelativepart}([?]${iquery})?(#${ifragment})?>o; my $ihierpart = qr<(//${iauthority}${ipathabempty})|(${ipathabsolute})|(${ipathrootless})|(${ipathempty})>o; my $absoluteIRI = qr<${scheme}:${ihierpart}([?]${iquery})?>o; my $IRI = qr<${scheme}:${ihierpart}([?]${iquery})?(#${ifragment})?>o; my $IRIreference = qr<${IRI}|${irelativeref}>o; sub _parse_components { my $self = shift; my $v = shift; my $c; if ($v =~ /\A(${IRIreference})\Z/mso and length($1) == length($v)) { %$c = %+; } else { use Data::Dumper; die "Not a valid IRI? " . Dumper($v); } $c->{path} //= ''; $self->_set_components($c); $self->_initialized(1); } sub _merge { my $self = shift; my $base = shift; my $bc = $base->components; my $c = $self->components; my $base_has_authority = ($bc->{user} or $bc->{port} or defined($bc->{host})); if ($base_has_authority and not($bc->{path})) { return "/" . $c->{path}; } else { my $bp = $bc->{path}; my @pathParts = split('/', $bp, -1); # -1 limit means $path='/' splits into ('', '') pop(@pathParts); push(@pathParts, $c->{path}); my $path = join('/', @pathParts); return $path; } } sub _remove_dot_segments { my $self = shift; my $input = shift; my @output; while (length($input)) { if ($input =~ m<^[.][.]/>) { substr($input, 0, 3) = ''; } elsif ($input =~ m<^[.]/>) { substr($input, 0, 2) = ''; } elsif ($input =~ m<^/[.]/>) { substr($input, 0, 3) = '/'; } elsif ($input eq '/.') { $input = '/'; } elsif ($input =~ m<^/[.][.]/>) { substr($input, 0, 4) = '/'; pop(@output); } elsif ($input eq '/..') { $input = '/'; pop(@output); } elsif ($input eq '.') { $input = ''; } elsif ($input eq '..') { $input = ''; } else { my $leadingSlash = ($input =~ m<^/>); if ($leadingSlash) { substr($input, 0, 1) = ''; } my ($part, @parts) = split('/', $input, -1); $part //= ''; if (scalar(@parts)) { unshift(@parts, ''); } $input = join('/', @parts); if ($leadingSlash) { $part = "/$part"; } push(@output, $part); } } my $newPath = join('', @output); return $newPath; } sub _resolved_components { my $self = shift; my $value = $self->value; if ($self->has_base and not($self->components->{scheme})) { # Resolve IRI relative to the base IRI my $base = $self->base; my $v = $self->value; my $bv = $base->value; # warn "resolving IRI <$v> relative to the base IRI <$bv>"; my %components = %{ $self->components }; my %base = %{ $base->components }; my %target; if ($components{scheme}) { foreach my $k (qw(scheme user port host path query)) { if (exists $components{$k}) { $target{$k} = $components{$k}; } } } else { if ($components{user} or $components{port} or defined($components{host})) { foreach my $k (qw(scheme user port host query)) { if (exists $components{$k}) { $target{$k} = $components{$k}; } } my $path = $components{path}; $target{path} = $self->_remove_dot_segments($path); } else { if ($components{path} eq '') { $target{path} = $base{path}; if ($components{query}) { $target{query} = $components{query}; } else { if ($base{query}) { $target{query} = $base{query}; } } } else { if ($components{path} =~ m<^/>) { my $path = $components{path}; $target{path} = $self->_remove_dot_segments($path); } else { my $path = $self->_merge($base); $target{path} = $self->_remove_dot_segments($path); } if (defined($components{query})) { $target{query} = $components{query}; } } if ($base{user} or $base{port} or defined($base{host})) { foreach my $k (qw(user port host)) { if (exists $base{$k}) { $target{$k} = $base{$k}; } } } } if (defined($base{scheme})) { $target{scheme} = $base{scheme}; } } if (defined($components{fragment})) { $target{fragment} = $components{fragment}; } return \%target; } return $self->components; } sub _abs { my $self = shift; my $value = $self->_string_from_components( $self->resolved_components ); return $value; } =item C<< rel ( $base ) >> Returns a new relative IRI object which, when resolved against the C<< $base >> IRI, is equal to this IRI. =cut sub rel { # based on code in URI my $self = shift; my $base = shift; my $rel = IRI->new(value => $self->abs); if (($base->scheme // '') ne ($rel->scheme // '')) { return IRI->new(value => $rel->abs); } my $scheme = $rel->scheme; my $auth = $rel->authority; my $path = $rel->path; if (!defined($scheme) and !defined($auth)) { return $rel; } my $bscheme = $base->scheme; my $bauth = $base->authority; my $bpath = $base->path; for ($bscheme, $bauth, $auth) { $_ = '' unless defined($_); } if ($scheme eq $bscheme) { $rel->scheme(undef); } unless ($scheme eq $bscheme and $auth eq $bauth) { return IRI->new(value => $rel->_abs); } for ($path, $bpath) { $_ = "/$_" unless m{^/}; } # Make it relative by eliminating: # the scheme, $rel->scheme(undef); # ... and authority $rel->host(undef); $rel->port(undef); $rel->user(undef); my @rparts = split('/', $path); my @bparts = split('/', $bpath); shift(@rparts); shift(@bparts); if (scalar(@rparts) and (scalar(@bparts) and $rparts[0] ne $bparts[0])) { # use an absolute path, because $rel differs from $base at the very beginning } else { # This loop is based on code from Nicolai Langfeldt . # First we calculate common initial path components length ($li). my $li = 1; while (1) { my $i = index($path, '/', $li); last if $i < 0 || $i != index($bpath, '/', $li) || substr($path,$li,$i-$li) ne substr($bpath,$li,$i-$li); $li=$i+1; } # then we nuke it from both paths substr($path, 0,$li) = ''; substr($bpath,0,$li) = ''; if ($path eq $bpath) { $rel->path(''); if (defined($rel->query) and defined($base->query)) { if ($rel->query eq $base->query) { $rel->query(undef); } else { # } } elsif (defined($rel->query)) { # } elsif (defined($base->query)) { $rel->path($path); } else { # } } else { # Add one "../" for each path component left in the base path $path = ('../' x $bpath =~ tr|/|/|) . $path; $path = "./" if $path eq ''; $rel->path($path); } } return IRI->new(value => $rel->_abs); } sub as_string { my $self = shift; if ($self->has_base || $self->has_resolved_components) { return $self->abs; } else { return $self->value; } } sub _string_from_components { my $self = shift; my $components = shift; my $iri = ""; if (my $s = $components->{scheme}) { $iri .= "${s}:"; } if ($components->{user} or $components->{port} or defined($components->{host})) { # has authority $iri .= "//"; if (my $u = $components->{user}) { $iri .= sprintf('%s@', $u); } if (defined(my $h = $components->{host})) { $iri .= $h // ''; } if (my $p = $components->{port}) { $iri .= ":$p"; } } if (defined(my $p = $components->{path})) { $iri .= $p; } if (defined(my $q = $components->{query})) { $iri .= '?' . $q; } if (defined(my $f = $components->{fragment})) { $iri .= '#' . $f; } return $iri; } sub _encode { my $str = shift; $str =~ s~([%])~'%' . sprintf('%02x', ord($1))~ge; # gen-delims $str =~ s~([/:?#@]|\[|\])~'%' . sprintf('%02x', ord($1))~ge; # gen-delims $str =~ s~([$!&'()*+,;=])~'%' . sprintf('%02x', ord($1))~ge; # sub-delims return $str; } sub _unencode { my $str = shift; if (defined($str)) { $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; } return $str; } =item C<< query_form >> Returns a HASH of key-value mappings for the unencoded, parsed query form data. =cut sub query_form { my $self = shift; my $q = $self->query // return; my @pairs = split(/&/, $q); return map { _unencode($_) } map { split(/=/, $_) } @pairs; } =item C<< set_query_param ( $key => $value ) >> sets the respective query form value and returns a new L object. =cut sub set_query_param { my $self = shift; my $q = $self->query // return; my %map = map { _unencode($_) } map { split(/=/, $_) } split(/&/, $q); while (my ($k, $v) = splice(@_, 0, 2)) { $map{$k} = $v; } my %c = %{ $self->components }; my @pairs = map { join('=', (_encode($_), _encode($map{$_}))) } keys %map; warn Dumper(\@pairs); $c{query} = join('&', @pairs); my $v = $self->_string_from_components(\%c); return $self->new( value => $v ); } } 1; __END__ =back =head1 SEE ALSO L =head1 AUTHOR Gregory Todd Williams C<< >> =head1 COPYRIGHT Copyright (c) 2014--2024 Gregory Todd Williams. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut