#--------------------------------------------------------------------- package IO::HTML; # # Copyright 2020 Christopher J. Madsen # # Author: Christopher J. Madsen # Created: 14 Jan 2012 # # This program is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the # GNU General Public License or the Artistic License for more details. # # ABSTRACT: Open an HTML file with automatic charset detection #--------------------------------------------------------------------- use 5.008; use strict; use warnings; use Carp 'croak'; use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding use Exporter 5.57 'import'; our $VERSION = '1.004'; # This file is part of IO-HTML 1.004 (September 26, 2020) our $bytes_to_check ||= 1024; our $default_encoding ||= 'cp1252'; our @EXPORT = qw(html_file); our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile sniff_encoding); our %EXPORT_TAGS = ( rw => [qw( html_file html_file_and_encoding html_outfile )], all => [ @EXPORT, @EXPORT_OK ], ); #===================================================================== sub html_file { (&html_file_and_encoding)[0]; # return just the filehandle } # end html_file # Note: I made html_file and html_file_and_encoding separate functions # (instead of making html_file context-sensitive) because I wanted to # use html_file in function calls (i.e. list context) without having # to write "scalar html_file" all the time. sub html_file_and_encoding { my ($filename, $options) = @_; $options ||= {}; open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!"; my ($encoding, $bom) = sniff_encoding($in, $filename, $options); if (not defined $encoding) { croak "No default encoding specified" unless defined($encoding = $default_encoding); $encoding = find_encoding($encoding) if $options->{encoding}; } # end if we didn't find an encoding binmode $in, sprintf(":encoding(%s):crlf", $options->{encoding} ? $encoding->name : $encoding); return ($in, $encoding, $bom); } # end html_file_and_encoding #--------------------------------------------------------------------- sub html_outfile { my ($filename, $encoding, $bom) = @_; if (not defined $encoding) { croak "No default encoding specified" unless defined($encoding = $default_encoding); } # end if we didn't find an encoding elsif (ref $encoding) { $encoding = $encoding->name; } open(my $out, ">:encoding($encoding)", $filename) or croak "Failed to open $filename: $!"; print $out "\x{FeFF}" if $bom; return $out; } # end html_outfile #--------------------------------------------------------------------- sub sniff_encoding { my ($in, $filename, $options) = @_; $filename = 'file' unless defined $filename; $options ||= {}; my $pos = tell $in; croak "Could not seek $filename: $!" if $pos < 0; croak "Could not read $filename: $!" unless defined read $in, my($buf), $bytes_to_check; seek $in, $pos, 0 or croak "Could not seek $filename: $!"; # Check for BOM: my $bom; my $encoding = do { if ($buf =~ /^\xFe\xFF/) { $bom = 2; 'UTF-16BE'; } elsif ($buf =~ /^\xFF\xFe/) { $bom = 2; 'UTF-16LE'; } elsif ($buf =~ /^\xEF\xBB\xBF/) { $bom = 3; 'utf-8-strict'; } else { find_charset_in($buf, $options); # check for } }; # end $encoding if ($bom) { seek $in, $bom, 1 or croak "Could not seek $filename: $!"; $bom = 1; } elsif (not defined $encoding) { # try decoding as UTF-8 my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET); if ($buf =~ /^(?: # nothing left over | [\xC2-\xDF] # incomplete 2-byte char | [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char )\z/x and $test =~ /[^\x00-\x7F]/) { $encoding = 'utf-8-strict'; } # end if valid UTF-8 with at least one multi-byte character: } # end if testing for UTF-8 if (defined $encoding and $options->{encoding} and not ref $encoding) { $encoding = find_encoding($encoding); } # end if $encoding is a string and we want an object return wantarray ? ($encoding, $bom) : $encoding; } # end sniff_encoding #===================================================================== # Based on HTML5 8.2.2.2 Determining the character encoding: # Get attribute from current position of $_ sub _get_attribute { m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or / return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc; my ($name, $value) = (lc $1, ''); if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc) { if (/\G"/gc) { # Double-quoted attribute value /\G([^"]*)("?)/gc; return unless $2; # Incomplete attribute (missing closing quote) $value = lc $1; } elsif (/\G'/gc) { # Single-quoted attribute value /\G([^']*)('?)/gc; return unless $2; # Incomplete attribute (missing closing quote) $value = lc $1; } else { # Unquoted attribute value /\G([^\x09\x0A\x0C\x0D >]*)/gc; $value = lc $1; } } # end if attribute has value return wantarray ? ($name, $value) : 1; } # end _get_attribute # Examine a meta value for a charset: sub _get_charset_from_meta { for (shift) { while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) { return $1 if (/\G"([^"]*)"/gc or /\G'([^']*)'/gc or /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc); } } # end for value return undef; } # end _get_charset_from_meta #--------------------------------------------------------------------- sub find_charset_in { for (shift) { my $options = shift || {}; # search only the first $bytes_to_check bytes (default 1024) my $stop = length > $bytes_to_check ? $bytes_to_check : length; my $expect_pragma = (defined $options->{need_pragma} ? $options->{need_pragma} : 1); pos() = 0; while (pos() < $stop) { if (/\G