#!/home/grinnz/projects/cpandoc-browser/perls/5.42.0/bin/perl -w # ---------------------------------------------------------------------- # urifind - find URIs in a document and dump them to STDOUT. # Copyright (C) 2003 darren chamberlain # ---------------------------------------------------------------------- use strict; our $VERSION = 20160806; use File::Basename qw(basename); use Getopt::Long qw(GetOptions); use IO::File; use URI::Find; use URI::Find::Schemeless; # What to do, and how my $help = 0; my $version = 0; my $sort = 0; my $reverse = 0; my $unique = 0; my $prefix = 0; my $noprefix = 0; my @pats = (); my @schemes = (); my $dump = 0; my $schemeless = 0; Getopt::Long::Configure(qw{no_ignore_case bundling}); GetOptions( 's!' => \$sort, 'u!' => \$unique, 'p!' => \$prefix, 'n!' => \$noprefix, 'r!' => \$reverse, 'h!' => \$help, 'v!' => \$version, 'd!' => sub { $dump = 1 }, 'D!' => sub { $dump = 2 }, 'P=s@' => \@pats, 'S=s@' => \@schemes, 'schemeless!' => \$schemeless, ); if ($help || $version) { my $prog = basename($0); if ($help) { print < 1) { my $prog = basename $0; die "Can't specify -p and -n at the same time; try $prog -h\n"; } # Print filename with matches? -p / -n # If there is more than one file, then show filenames by # default, unless explicitly asked not to (-n) if (@ARGV > 1) { $prefix = 1 unless $noprefix; } else { $prefix = 0 unless $prefix; } # Add schemes to the list of regexen if (@schemes) { unshift @pats => sprintf '^(\b%s\b):' => join '\b|\b' => @schemes; } # If we are dumping (-d, -D), then dump. Exit if -D. if ($dump) { print STDERR "\$scheme = '" . (defined $pats[0] ? $pats[0] : '') . "'\n"; print STDERR "\@pats = ('" . join("', '", @pats) . "')\n"; exit if $dump == 2; } # Find the URIs for my $argv (@ARGV) { my ($name, $fh, $data); $argv = \*STDIN if ($argv eq '-'); if (ref $argv eq 'GLOB') { local $/; $data = <$argv>; $name = '' } else { local $/; $fh = IO::File->new($argv) or die "Can't open $argv: $!"; $data = <$fh>; $name = $argv; } my $class = $schemeless ? "URI::Find::Schemeless" : "URI::Find"; my $finder = $class->new(sub { push @uris => [ $name, $_[0] ] }); $finder->find(\$data); } # Apply patterns, in @pats for my $pat (@pats) { @uris = grep { $_->[1] =~ /$pat/ } @uris; } # Remove redundant links if ($unique) { my %unique; @uris = grep { ++$unique{$_->[1]} == 1 } @uris; } # Sort links, possibly in reverse if ($sort || $reverse) { if ($reverse) { @uris = sort { $b->[1] cmp $a->[1] } @uris; } else { @uris = sort { $a->[1] cmp $b->[1] } @uris; } } # Flatten the arrayrefs if ($prefix) { @uris = map { join ': ' => @$_ } @uris; } else { @uris = map { $_->[1] } @uris; } print map { "$_\n" } @uris; exit 0; __END__ =head1 NAME urifind - find URIs in a document and dump them to STDOUT. =head1 SYNOPSIS $ urifind file =head1 DESCRIPTION F is a simple script that finds URIs in one or more files (using C), and outputs them to to STDOUT. That's it. To find all the URIs in F, use: $ urifind file1 To find the URIs in multiple files, simply list them as arguments: $ urifind file1 file2 file3 F will read from C if no files are given or if a filename of C<-> is specified: $ wget http://www.boston.com/ -O - | urifind When multiple files are listed, F prefixes each found URI with the file from which it came: $ urifind file1 file2 file1: http://www.boston.com/index.html file2: http://use.perl.org/ This can be turned on for single files with the C<-p> ("prefix") switch: $urifind -p file3 file1: http://fsck.com/rt/ It can also be turned off for multiple files with the C<-n> ("no prefix") switch: $ urifind -n file1 file2 http://www.boston.com/index.html http://use.perl.org/ By default, URIs will be displayed in the order found; to sort them ascii-betically, use the C<-s> ("sort") option. To reverse sort them, use the C<-r> ("reverse") flag (C<-r> implies C<-s>). $ urifind -s file1 file2 http://use.perl.org/ http://www.boston.com/index.html mailto:webmaster@boston.com $ urifind -r file1 file2 mailto:webmaster@boston.com http://www.boston.com/index.html http://use.perl.org/ Finally, F supports limiting the returned URIs by scheme or by arbitrary pattern, using the C<-S> option (for schemes) and the C<-P> option. Both C<-S> and C<-P> can be specified multiple times: $ urifind -S mailto file1 mailto:webmaster@boston.com $ urifind -S mailto -S http file1 mailto:webmaster@boston.com http://www.boston.com/index.html C<-P> takes an arbitrary Perl regex. It might need to be protected from the shell: $ urifind -P 's?html?' file1 http://www.boston.com/index.html $ urifind -P '\.org\b' -S http file4 http://www.gnu.org/software/wget/wget.html Add a C<-d> to have F dump the refexen generated from C<-S> and C<-P> to C. C<-D> does the same but exits immediately: $ urifind -P '\.org\b' -S http -D $scheme = '^(\bhttp\b):' @pats = ('^(\bhttp\b):', '\.org\b') To remove duplicates from the results, use the C<-u> ("unique") switch. =head1 OPTION SUMMARY =over 4 =item -s Sort results. =item -r Reverse sort results (implies -s). =item -u Return unique results only. =item -n Don't include filename in output. =item -p Include filename in output (0 by default, but 1 if multiple files are included on the command line). =item -P $re Print only lines matching regex '$re' (may be specified multiple times). =item -S $scheme Only this scheme (may be specified multiple times). =item -h Help summary. =item -v Display version and exit. =item -d Dump compiled regexes for C<-S> and C<-P> to C. =item -D Same as C<-d>, but exit after dumping. =back =head1 AUTHOR darren chamberlain Edarren@cpan.orgE =head1 COPYRIGHT (C) 2003 darren chamberlain This library is free software; you may distribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L