#!/usr/bin/perl package Spreadsheet::Read; =head1 NAME Spreadsheet::Read - Read the data from a spreadsheet =head1 SYNOPSIS use Spreadsheet::Read; my $book = ReadData ("test.csv", sep => ";"); my $book = ReadData ("test.sxc"); my $book = ReadData ("test.ods"); my $book = ReadData ("test.xls"); my $book = ReadData ("test.xlsx"); my $book = ReadData ("test.xlsm"); my $book = ReadData ("test.gnumeric"); my $book = ReadData ($fh, parser => "xls"); Spreadsheet::Read::add ($book, "sheet.csv"); my $sheet = $book->[1]; # first datasheet my $cell = $book->[1]{A3}; # content of field A3 of sheet 1 my $cell = $book->[1]{cell}[1][3]; # same, unformatted # OO API my $book = Spreadsheet::Read->new ("file.csv"); my $sheet = $book->sheet (1); my $cell = $sheet->cell ("A3"); my $cell = $sheet->cell (1, 3); $book->add ("test.xls"); =cut use 5.008001; use strict; use warnings; our $VERSION = "0.90"; sub Version { $VERSION } use Carp; use Exporter; our @ISA = qw( Exporter ); our @EXPORT = qw( ReadData cell2cr cr2cell ); our @EXPORT_OK = qw( parses rows cellrow row add ); use Encode qw( decode ); use File::Temp qw( ); use Data::Dumper; my @parsers = ( [ csv => "Text::CSV_XS", "0.71" ], [ csv => "Text::CSV_PP", "1.17" ], [ csv => "Text::CSV", "1.17" ], [ ods => "Spreadsheet::ParseODS", "0.26" ], [ ods => "Spreadsheet::ReadSXC", "0.26" ], [ sxc => "Spreadsheet::ParseODS", "0.26" ], [ sxc => "Spreadsheet::ReadSXC", "0.26" ], [ sxc => "Spreadsheet::ReadSXC__BAD", "0.26" ], # For testing [ xls => "Spreadsheet::ParseExcel", "0.34" ], [ xlsx => "Spreadsheet::ParseXLSX", "0.24" ], [ xlsm => "Spreadsheet::ParseXLSX", "0.24" ], [ xlsx => "Spreadsheet::XLSX", "0.13" ], [ xlsx => "Excel::ValueReader::XLSX", "1.13" ], # [ prl => "Spreadsheet::Perl", "" ], [ sc => "Spreadsheet::Read", "0.01" ], [ gnumeric => "Spreadsheet::ReadGnumeric", "0.2" ], [ zzz1 => "Z10::Just::For::Testing", "1.23" ], [ zzz2 => "Z20::Just::For::Testing", "" ], [ zzz3 => "Z30::Just::For::Testing", "1.00" ], # Helper modules [ ios => "IO::Scalar", "" ], [ dmp => "Data::Peek", "" ], ); my %can = ( supports => { map { $_->[1] => $_->[2] } @parsers }); foreach my $p (@parsers) { my $format = $p->[0]; $can{$format} and next; $can{$format} = ""; my $preset = $ENV{"SPREADSHEET_READ_\U$format"} or next; my $min_version = $can{supports}{$preset}; unless ($min_version) { # Catch weirdness like $SPREADSHEET_READ_XLSX = "DBD::Oracle" $can{$format} = "!$preset is not supported for the $format format"; next; } if (eval "local \$_; require $preset" and not $@) { # forcing a parser should still check the version my $ok; my $has = $preset->VERSION; $has =~ s/_[0-9]+$//; # Remove beta-part if ($min_version =~ m/^v([0-9.]+)/) { # clumsy versions my @min = split m/\./ => $1; $has =~ s/^v//; my @has = split m/\./ => $has; $ok = (($has[0] * 1000 + $has[1]) * 1000 + $has[2]) >= (($min[0] * 1000 + $min[1]) * 1000 + $min[2]); } else { # normal versions $ok = $has >= $min_version; } $ok or $preset = "!$preset"; } else { $preset = "!$preset"; } $can{$format} = $preset; } delete $can{supports}; foreach my $p (@parsers) { my ($flag, $mod, $vsn) = @$p; $can{$flag} and next; eval "require $mod; \$vsn and ${mod}->VERSION (\$vsn); \$can{\$flag} = '$mod'" and next; $p->[0] = "! Cannot use $mod version $vsn: $@"; $can{$flag} = $@ =~ m/need to install|can(?:not|'t) locate/i ? 0 # Not found : ""; # Too old } $can{sc} = __PACKAGE__; # SquirrelCalc is built-in sub _def_gas { defined $Spreadsheet::ParseExcel::VERSION && $Spreadsheet::ParseExcel::VERSION < 0.61 and *Spreadsheet::ParseExcel::Workbook::get_active_sheet = sub { undef; }; defined $Spreadsheet::ParseODS::VERSION && $Spreadsheet::ParseODS::VERSION < 0.25 and *Spreadsheet::ParseODS::Workbook::get_active_sheet = sub { undef; }; defined $Excel::ValueReader::XLSX::VERSION && $Excel::ValueReader::XLSX::VERSION < 9.99 and *Excel::ValueReader::XLSX::get_active_sheet = sub { undef; }; } # _def_gas my $debug = 0; my %def_opts = ( rc => 1, cells => 1, attr => 0, clip => undef, # $opt{cells}; strip => 0, pivot => 0, dtfmt => "yyyy-mm-dd", # Format 14 debug => 0, passwd => undef, parser => undef, sep => undef, quote => undef, label => undef, merge => 0, ); my @def_attr = ( type => "text", fgcolor => undef, bgcolor => undef, font => undef, size => undef, format => undef, halign => "left", valign => "top", bold => 0, italic => 0, uline => 0, wrap => 0, merged => 0, hidden => 0, locked => 0, enc => "utf-8", # $ENV{LC_ALL} // $ENV{LANG} // ... formula => undef, ); # Helper functions sub _dump { my ($label, $ref) = @_; if ($can{dmp}) { print STDERR Data::Peek::DDumper ({ $label => $ref }); } else { print STDERR Data::Dumper->Dump ([$ref], [$label]); } my @c = caller (1); print STDERR "<<- $c[1]:$c[2]|$c[3]\n"; } # _dump sub _parser { my $type = shift or return ""; if ($type =~ m/::/ and my @p = grep { $_->[1] eq $type } @parsers) { my $format = $p[0][0]; $ENV{"SPREADSHEET_READ_\U$format"} = $type; eval "local \$_; require $type"; $@ and croak ("Forced backend $type for $format fails to load:\n$@"); $can{$format} = $type; $type = $format; } $type = lc $type; my $ods = $can{ods} ? "ods" : "sxc"; # Aliases and fullnames $type eq "excel" and return "xls"; $type eq "excel2007" and return "xlsx"; $type eq "xlsm" and return "xlsx"; $type eq "oo" and return $ods; # $type eq "sxc" and return $ods; $type eq "openoffice" and return $ods; $type eq "libreoffice" and return $ods; $type eq "perl" and return "prl"; $type eq "scalc" and return "sc"; $type eq "squirrelcalc" and return "sc"; return exists $can{$type} ? $type : ""; } # _parser sub new { my $class = shift; my $r = ReadData (@_); unless ($r) { @_ and return; # new with arguments failed to open resource $r = [{ parsers => [], error => undef, sheets => 0, sheet => { }, }]; } bless $r => $class; } # new sub parsers { ref $_[0] eq __PACKAGE__ and shift; my @c; for (sort { $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] } grep { $_->[0] !~ m{^(?:dmp|ios|!.*)$} } @parsers) { my ($typ, $mod, $min) = @$_; eval "local \$_; require $mod"; my $vsn = $@ ? "-" : eval { $mod->VERSION }; push @c => { ext => $typ, mod => $mod, min => $min, vsn => $vsn, def => $can{$typ} eq $mod ? "*" : "", }; } @c; } # parsers # Spreadsheet::Read::parses ("csv") or die "Cannot parse CSV" sub parses { ref $_[0] eq __PACKAGE__ and shift; my $type = shift or return sort grep { !m/^(?:dmp|ios)/ && $can{$_} !~ m{^!} } keys %can; $type = _parser ($type) or return 0; if ($can{$type} =~ m/^!\s*(.*)/) { $@ = $1; return 0; } return $can{$type} || 0; } # parses sub sheets { my $ctrl = shift->[0]; wantarray or return $ctrl->{sheets}; my $s = $ctrl->{sheet} or return (); # No labels defined sort { $s->{$a} <=> $s->{$b} } keys %$s; } # sheets # col2label (4) => "D" sub col2label { ref $_[0] eq __PACKAGE__ and shift; my $c = shift; defined $c && $c > 0 or return ""; my $cell = ""; while ($c) { use integer; substr $cell, 0, 0, chr (--$c % 26 + ord "A"); $c /= 26; } $cell; } # col2label # cr2cell (4, 18) => "D18" # No prototype to allow 'cr2cell (@rowcol)' sub cr2cell { ref $_[0] eq __PACKAGE__ and shift; my ($c, $r) = @_; defined $c && defined $r && $c > 0 && $r > 0 or return ""; col2label ($c) . $r; } # cr2cell # cell2cr ("D18") => (4, 18) sub cell2cr { ref $_[0] eq __PACKAGE__ and shift; my ($cc, $r) = (uc ($_[0]||"") =~ m/^([A-Z]+)([0-9]+)$/) or return (0, 0); my $c = 0; while ($cc =~ s/^([A-Z])//) { $c = 26 * $c + 1 + ord ($1) - ord ("A"); } ($c, $r); } # cell2cr # my @row = cellrow ($book->[1], 1); # my @row = $book->cellrow (1, 1); sub cellrow { my $sheet = ref $_[0] eq __PACKAGE__ ? (shift)->[shift] : shift or return; ref $sheet eq "HASH" && exists $sheet->{cell} or return; exists $sheet->{maxcol} && exists $sheet->{maxrow} or return; my $row = shift or return; $row > 0 && $row <= $sheet->{maxrow} or return; my $s = $sheet->{cell}; map { $s->[$_][$row] } 1..$sheet->{maxcol}; } # cellrow # my @row = row ($book->[1], 1); # my @row = $book->row (1, 1); sub row { my $sheet = ref $_[0] eq __PACKAGE__ ? (shift)->[shift] : shift or return; ref $sheet eq "HASH" && exists $sheet->{cell} or return; exists $sheet->{maxcol} && exists $sheet->{maxrow} or return; my $row = shift or return; $row > 0 && $row <= $sheet->{maxrow} or return; map { $sheet->{cr2cell ($_, $row)} } 1..$sheet->{maxcol}; } # row # Convert {cell}'s [column][row] to a [row][column] list # my @rows = rows ($book->[1]); sub rows { my $sheet = ref $_[0] eq __PACKAGE__ ? (shift)->[shift] : shift or return; ref $sheet eq "HASH" && exists $sheet->{cell} or return; exists $sheet->{maxcol} && exists $sheet->{maxrow} or return; my $s = $sheet->{cell}; map { my $r = $_; [ map { $s->[$_][$r] } 1..$sheet->{maxcol} ]; } 1..$sheet->{maxrow}; } # rows sub sheet { my ($book, $sheet) = @_; $book && $sheet or return; my $class = "Spreadsheet::Read::Sheet"; $sheet =~ m/^[0-9]+$/ && $sheet >= 1 && $sheet <= $book->[0]{sheets} and return bless $book->[$sheet] => $class; exists $book->[0]{sheet}{$sheet} and return bless $book->[$book->[0]{sheet}{$sheet}] => $class; foreach my $idx (1 .. $book->[0]{sheets}) { $book->[$idx]{label} eq $sheet and return bless $book->[$idx] => $class; } return; } # sheet # If option "clip" is set, remove the trailing rows and # columns in each sheet that contain no visible data sub _clipsheets { my ($opt, $ref) = @_; unless ($ref->[0]{sheets}) { $ref->{sheet} ||= {}; return $ref; } my ($rc, $cl) = ($opt->{rc}, $opt->{cells}); my ($oc, $os, $oa) = ($opt->{clip}, $opt->{strip}, $opt->{attr}); # Strip leading/trailing spaces if ($os || $oc) { foreach my $sheet (1 .. $ref->[0]{sheets}) { $ref->[$sheet]{indx} = $sheet; my $ss = $ref->[$sheet]; $ss->{maxrow} && $ss->{maxcol} or next; my ($mc, $mr) = (0, 0); foreach my $row (1 .. $ss->{maxrow}) { foreach my $col (1 .. $ss->{maxcol}) { if ($rc) { defined $ss->{cell}[$col][$row] or next; $os & 2 and $ss->{cell}[$col][$row] =~ s/\s+$//; $os & 1 and $ss->{cell}[$col][$row] =~ s/^\s+//; if (length $ss->{cell}[$col][$row]) { $col > $mc and $mc = $col; $row > $mr and $mr = $row; } } if ($cl) { my $cell = cr2cell ($col, $row); defined $ss->{$cell} or next; $os & 2 and $ss->{$cell} =~ s/\s+$//; $os & 1 and $ss->{$cell} =~ s/^\s+//; if (length $ss->{$cell}) { $col > $mc and $mc = $col; $row > $mr and $mr = $row; } } } } $oc && ($mc < $ss->{maxcol} || $mr < $ss->{maxrow}) or next; # Remove trailing empty columns foreach my $col (($mc + 1) .. $ss->{maxcol}) { $rc and undef $ss->{cell}[$col]; $oa and undef $ss->{attr}[$col]; $cl or next; my $c = col2label ($col); delete $ss->{"$c$_"} for 1 .. $ss->{maxrow}; } # Remove trailing empty rows foreach my $row (($mr + 1) .. $ss->{maxrow}) { foreach my $col (1 .. $mc) { $cl and delete $ss->{cr2cell ($col, $row)}; $rc and undef $ss->{cell} [$col][$row]; $oa and undef $ss->{attr} [$col][$row]; } } ($ss->{maxrow}, $ss->{maxcol}) = ($mr, $mc); } } if ($opt->{pivot}) { foreach my $sheet (1 .. $ref->[0]{sheets}) { my $ss = $ref->[$sheet]; $ss->{maxrow} || $ss->{maxcol} or next; my $mx = $ss->{maxrow} > $ss->{maxcol} ? $ss->{maxrow} : $ss->{maxcol}; foreach my $row (2 .. $mx) { foreach my $col (1 .. ($row - 1)) { $opt->{rc} and ($ss->{cell}[$col][$row], $ss->{cell}[$row][$col]) = ($ss->{cell}[$row][$col], $ss->{cell}[$col][$row]); $opt->{cells} and ($ss->{cr2cell ($col, $row)}, $ss->{cr2cell ($row, $col)}) = ($ss->{cr2cell ($row, $col)}, $ss->{cr2cell ($col, $row)}); } } ($ss->{maxcol}, $ss->{maxrow}) = ($ss->{maxrow}, $ss->{maxcol}); } } $ref; } # _clipsheets # Convert a single color (index) to a color sub _xls_color { my $clr = shift; defined $clr or return undef; $clr eq "#000000" and return undef; $clr =~ m/^#[0-9a-fA-F]+$/ and return lc $clr; $clr == 0 || $clr == 32767 and return undef; # Default fg color return "#" . lc Spreadsheet::ParseExcel->ColorIdxToRGB ($clr); } # _xls_color # Convert a fill [ $pattern, $front_color, $back_color ] to a single background sub _xls_fill { my ($p, $fg, $bg) = @_; defined $p or return undef; $p == 32767 and return undef; # Default fg color $p == 0 && !defined $bg and return undef; # No fill bg color $p == 1 and return _xls_color ($fg); $bg < 8 || $bg > 63 and return undef; # see Workbook.pm#106 return _xls_color ($bg); } # _xls_fill sub _missing_parser { my ($type, $suggest) = (shift, ""); foreach my $p (@parsers) { $p->[0] eq lc $type or next; $suggest = "\nPlease install $p->[1]"; } "No parser for $type found$suggest\n"; } # _missing_parser sub _txt_is_xml { # Return true if $txt is gzipped or contains XML. If we are also passed # $ns_uri_of_interest, $txt must contain it in the first 1000 or so # characters (but we try to search quickly rather than precisely). my ($txt, $ns_uri_of_interest) = @_; ref $txt and return; # Can't tell (unless we assume the stream is seekable). if ($txt =~ m/\A\037\213/) { # Literal gzipped string (/usr/share/misc/magic). [this is a hack that # works only because Gnumeric is the only format that uses gzip. -- # rgr, 13-Jan-2023] return 1; } if ($txt =~ m/\A<\?xml/) { $ns_uri_of_interest or return 1; $ns_uri_of_interest =~ s/([^\w\d])/\\$1/g; my $prefix = length ($txt) > 10000 ? substr $txt, 0, 1000 : $txt; return $prefix =~ m/xmlns:\w+=.$ns_uri_of_interest/; } $txt =~ m/\n/ and return; # safeguard for older perl versions open my $in, "<", $txt or return; read $in, my $block, 1000 or return; return _txt_is_xml ($block, $ns_uri_of_interest); } # _txt_is_xml sub ReadData { my $txt = shift or return; my %opt; if (@_) { if (ref $_[0] eq "HASH") { %opt = %{shift @_} } elsif (@_ % 2 == 0) { %opt = @_ } } # Aliasses exists $opt{transpose} && !exists $opt{pivot} and $opt{pivot} = delete $opt{transpose}; exists $opt{trim} && !exists $opt{strip} and $opt{strip} = delete $opt{trim}; exists $opt{rc} or $opt{rc} = $def_opts{rc}; exists $opt{cells} or $opt{cells} = $def_opts{cells}; exists $opt{attr} or $opt{attr} = $def_opts{attr}; exists $opt{clip} or $opt{clip} = $opt{cells}; exists $opt{strip} or $opt{strip} = $def_opts{strip}; exists $opt{dtfmt} or $opt{dtfmt} = $def_opts{dtfmt}; exists $opt{merge} or $opt{merge} = $def_opts{merge}; # $debug = $opt{debug} || 0; $debug = defined $opt{debug} ? $opt{debug} : $def_opts{debug}; $debug > 4 and _dump (Options => \%opt); my %parser_opts = map { $_ => $opt{$_} } grep { !exists $def_opts{$_} } keys %opt; my $_parser = _parser ($opt{parser}); my $io_ref = ref ($txt) =~ m/GLOB|IO/ ? $txt : undef; my $io_fil = $io_ref ? 0 : $txt =~ m/\0/ ? 0 : do { no warnings "newline"; -f $txt }; my $io_txt = $io_ref || $io_fil ? 0 : 1; $io_fil && ! -s $txt and do { $@ = "$txt is empty"; return }; $io_ref && eof $txt and do { $@ = "Empty stream"; return }; if ($opt{parser} ? $_parser eq "csv" : ($io_fil && $txt =~ m/\.(csv)$/i)) { $can{csv} or croak _missing_parser ("CSV"); my $label = defined $opt{label} ? $opt{label} : $io_fil ? $txt : "IO"; $debug and print STDERR "Opening CSV $label using $can{csv}-", $can{csv}->VERSION, "\n"; my @data = ( { type => "csv", parser => $can{csv}, version => $can{csv}->VERSION, parsers => [ { type => "csv", parser => $can{csv}, version => $can{csv}->VERSION, }], error => undef, quote => '"', sepchar => ',', sheets => 1, sheet => { $label => 1 }, }, { parser => 0, label => $label, maxrow => 0, maxcol => 0, cell => [], attr => [], merged => [], active => 1, hidden => 0, }, ); my ($sep, $quo, $in) = (",", '"'); defined $opt{sep} and $sep = $opt{sep}; defined $opt{quote} and $quo = $opt{quote}; $debug > 8 and _dump (debug => { data => \@data, txt => $txt, io_ref => $io_ref, io_fil => $io_fil }); if ($io_fil) { unless (defined $opt{quote} && defined $opt{sep}) { open $in, "<", $txt or return; my $l1 = <$in>; $quo = defined $opt{quote} ? $opt{quote} : '"'; $sep = # If explicitly set, use it defined $opt{sep} ? $opt{sep} : # otherwise start auto-detect with quoted strings $l1 =~ m/["0-9];["0-9;]/ ? ";" : $l1 =~ m/["0-9],["0-9,]/ ? "," : $l1 =~ m/["0-9]\t["0-9,]/ ? "\t" : $l1 =~ m/["0-9]\|["0-9,]/ ? "|" : # If neither, then for unquoted strings $l1 =~ m/\w;[\w;]/ ? ";" : $l1 =~ m/\w,[\w,]/ ? "," : $l1 =~ m/\w\t[\w,]/ ? "\t" : $l1 =~ m/\w\|[\w,]/ ? "|" : "," ; close $in; } open $in, "<", $txt or return; } elsif ($io_ref) { $in = $txt; } elsif (ref $txt eq "SCALAR") { open $in, "<", $txt or croak "Cannot open input: $!"; } elsif ($txt =~ m/[\r\n,;]/) { open $in, "<", \$txt or croak "Cannot open input: $!"; } else { warn "Input type ", ref $txt, " might not be supported. Please file a ticket\n"; $in = $txt; # Now pray ... } $debug > 1 and print STDERR "CSV sep_char '$sep', quote_char '$quo'\n"; my $csv = $can{csv}->new ({ %parser_opts, sep_char => ($data[0]{sepchar} = $sep), quote_char => ($data[0]{quote} = $quo), keep_meta_info => 1, binary => 1, auto_diag => 1, }) or croak "Cannot create a csv ('$sep', '$quo') parser!"; while (my $row = $csv->getline ($in)) { my @row = @$row or last; my $r = ++$data[1]{maxrow}; @row > $data[1]{maxcol} and $data[1]{maxcol} = @row; foreach my $c (0 .. $#row) { my $val = $row[$c]; my $cell = cr2cell ($c + 1, $r); $opt{rc} and $data[1]{cell}[$c + 1][$r] = $val; $opt{cells} and $data[1]{$cell} = $val; $opt{attr} and $data[1]{attr}[$c + 1][$r] = { @def_attr }; } } $csv->eof () or $data[0]{error} = [ $csv->error_diag ]; close $in; for (@{$data[1]{cell}}) { defined or $_ = []; } return _clipsheets \%opt, [ @data ]; } if ($io_txt) { # && $_parser !~ m/^xlsx?$/) { if ( # /etc/magic: Microsoft Office Document $txt =~ m{\A(\376\067\0\043 |\320\317\021\340\241\261\032\341 |\333\245-\0\0\0)}x # /usr/share/misc/magic || $txt =~ m{\A.{2080}Microsoft Excel 5.0 Worksheet} || $txt =~ m{\A\x09\x04\x06\x00\x00\x00\x10\x00} ) { $can{xls} or croak _missing_parser ("XLS"); my $tmpfile; if ($can{ios}) { # Do not use a temp file if IO::Scalar is available $tmpfile = \$txt; } else { $tmpfile = File::Temp->new (SUFFIX => ".xls", UNLINK => 1); binmode $tmpfile; print $tmpfile $txt; close $tmpfile; } open $io_ref, "<", $tmpfile or do { $@ = $!; return }; $io_txt = 0; $_parser = _parser ($opt{parser} = "xls"); } elsif ( # /usr/share/misc/magic $txt =~ m{\APK\003\004.{4,30}(?:\[Content_Types\]\.xml|_rels/\.rels)} ) { $can{xlsx} or croak _missing_parser ("XLSX"); my $tmpfile; if ($can{ios}) { # Do not use a temp file if IO::Scalar is available $tmpfile = \$txt; } else { $tmpfile = File::Temp->new (SUFFIX => ".xlsx", UNLINK => 1); binmode $tmpfile; print $tmpfile $txt; close $tmpfile; } open $io_ref, "<", $tmpfile or do { $@ = $!; return }; $io_txt = 0; $_parser = _parser ($opt{parser} = "xlsx"); } elsif ( # /usr/share/misc/magic $txt =~ m{\APK\003\004.{9,30}\Qmimetypeapplication/vnd.oasis.opendocument.spreadsheet} ) { $can{ods} or croak _missing_parser ("ODS"); my $tmpfile; if ($can{ios}) { # Do not use a temp file if IO::Scalar is available $tmpfile = \$txt; } else { $tmpfile = File::Temp->new (SUFFIX => ".ods", UNLINK => 1); binmode $tmpfile; print $tmpfile $txt; close $tmpfile; } open $io_ref, "<", $tmpfile or do { $@ = $!; return }; $io_txt = 0; $_parser = _parser ($opt{parser} = "ods"); } elsif (!$io_ref && $txt =~ m/\.xls[xm]?$/i) { $@ = "Cannot open $txt as file"; return; } } if ($opt{parser} ? $_parser =~ m/^(?:xlsx?)$/ : ($io_fil && $txt =~ m/\.(xls[xm]?)$/i && ($_parser = _parser ($1)))) { my $parse_type = $_parser =~ m/x$/i ? "XLSX" : "XLS"; my $parser = $can{lc $parse_type} or croak _missing_parser ($parse_type); #$debug and print STDERR __FILE__, "#", __LINE__, " | $_parser | $parser | $parse_type\n"; $debug and print STDERR "Opening $parse_type ", $io_ref ? "" : $txt, " using $parser-", $can{lc $parse_type}->VERSION, "\n"; $opt{passwd} and $parser_opts{Password} = $opt{passwd}; my $oBook = eval { $io_ref ? $parse_type eq "XLSX" ? $can{xlsx} eq "Spreadsheet::XLSX" ? $parser->new ($io_ref) # Spreadsheet::XLXS ($io) : $can{xlsx} eq "Excel::ValueReader::XLSX" ? $parser->new (xlsx => $io_ref, %parser_opts) # Excel::ValueReader::XLSX ($io / $content) : $parser->new (%parser_opts)->parse ($io_ref) # Spreadsheet::ParseXLSX ($io) : $parser->new (%parser_opts)->Parse ($io_ref) # Spreadsheet::ParseExcel ($io) : $parse_type eq "XLSX" ? $can{xlsx} eq "Spreadsheet::XLSX" ? $parser->new ($txt) # Spreadsheet::XLXS ($file / $content) : $can{xlsx} eq "Excel::ValueReader::XLSX" ? $parser->new (xlsx => $txt, %parser_opts) # Excel::ValueReader::XLSX ($file / $content) : $parser->new (%parser_opts)->parse ($txt) # Spreadsheet::ParseXLSX ($file / $content) : $parser->new (%parser_opts)->Parse ($txt); # Spreadsheet::ParseExcel ($file / $content) }; unless ($oBook) { # cleanup will fail on folders with spaces. (my $msg = $@) =~ s/ at \S+ line \d+.*//s; croak "$parse_type parser cannot parse data: $msg"; } $debug > 8 and _dump (oBook => $oBook); # WorkBook keys: # aColor _CurSheet Format SheetCount # ActiveSheet _CurSheet_ FormatStr _skip_chart # Author File NotSetCell _string_contin # BIFFVersion Flg1904 Object Version # _buffer FmtClass PkgStr Worksheet # CellHandler Font _previous_info my @data = ( { type => lc $parse_type, parser => $can{lc $parse_type}, version => $can{lc $parse_type}->VERSION, parsers => [{ type => lc $parse_type, parser => $can{lc $parse_type}, version => $can{lc $parse_type}->VERSION, }], error => undef, sheets => $oBook->{SheetCount} || 0, sheet => {}, } ); # Overrule the default date format strings my %def_fmt = ( 0x0E => lc $opt{dtfmt}, # m-d-yy 0x0F => "d-mmm-yyyy", # d-mmm-yy 0x11 => "mmm-yyyy", # mmm-yy 0x16 => "yyyy-mm-dd hh:mm", # m-d-yy h:mm ); $oBook->{FormatStr}{$_} = $def_fmt{$_} for keys %def_fmt; my $oFmt = $parse_type eq "XLSX" ? $can{xlsx} eq "Spreadsheet::XLSX" ? Spreadsheet::XLSX::Fmt2007->new : Spreadsheet::ParseExcel::FmtDefault->new : Spreadsheet::ParseExcel::FmtDefault->new; $debug > 20 and _dump ("oBook before conversion", $oBook); if ($can{xlsx} eq "Excel::ValueReader::XLSX" and !exists $oBook->{SheetCount}) { my @sheets = $oBook->sheet_names; $data[0]{sheet} = { map { $sheets[$_] => $_ + 1 } 0 .. $#sheets }; $data[0]{sheets} = scalar @sheets; foreach my $sheet_name (@sheets) { my $grid = $oBook->values ($sheet_name); my $sheet = { label => $sheet_name, minrow => 1, mincol => 1, indx => 1, merged => [], }; # Transpose to column vectors. # The A1, B5 etc items could be added here as well. my @c; foreach my $r (0 .. $#$grid) { my $row = $grid->[$r]; foreach my $c (0 .. $#$row) { # add 1 for array base 1 my $val = $grid->[$r][$c]; my $cell = cr2cell ($c + 1, $r + 1); $opt{rc} and $sheet->{cell}[$c + 1][$r + 1] = $val; $opt{cells} and $sheet->{$cell} = $val; $c[$c] = 1; } } # First entry in @t is padding so number of items # is the max index. $sheet->{maxcol} = scalar @c; # No padding of first entry in $grid so # number of items is the array length. $sheet->{maxrow} = @$grid; push @data => $sheet; } #use DP;die DDumper { oBook => $oBook, oFmt => $oFmt, data => \@data, parser_type => $parse_type }; } if ($parse_type eq "ODS" and !exists $oBook->{SheetCount}) { my $styles = delete $oBook->{_styles}; my $sheets = delete $oBook->{_sheets}; if ($sheets && ref $sheets eq "ARRAY") { $styles = ($styles || {})->{styles} || {}; $data[0]{sheets} = $oBook->{SheetCount} = scalar @{$sheets}; $oBook->{Worksheet} = []; *S::R::Sheet::get_merged_areas = sub { [] }; my $x = 0; foreach my $sh (@{$sheets}) { push @{$oBook->{Worksheet}} => bless { Name => $sh->{label}, Cells => [], MinRow => $sh->{col_min}, MaxRow => $sh->{row_max}, MinCol => $sh->{col_min}, MaxCol => $sh->{row_max}, SheetHidden => $sh->{sheet_hidden} || 0, RowHidden => $sh->{hidden_rows}, ColHidden => $sh->{hidden_cols}, _SheetNo => $x++, } => "S::R::Sheet"; # header_cols # header_rows # print_areas # sheet_hidden # tab_color *S::R::Cell::Value = sub { $_[0]{Raw} }; *S::R::Cell::is_merged = sub { 0 }; my $r = 0; foreach my $row (@{$sh->{data}}) { $#$row > $oBook->{Worksheet}[-1]{MaxCol} and $oBook->{Worksheet}[-1]{MaxCol} = $#$row; $oBook->{Worksheet}[-1]{Cells}[$r++] = [ map { bless { Code => undef, Format => $_->{format}, Formula => $_->{formula}, Hidden => undef, Merged => undef, # use || instead of // for now # even though it is undesirable Type => $_->{type} || "", Val => $_->{value} || $_->{unformatted}, Raw => $_->{unformatted} || $_->{value}, _Style => $styles->{$_->{style} || ""} || $_->{style}, # hyperlink } => "S::R::Cell" } @{$row} ]; } --$r > $oBook->{Worksheet}[-1]{MaxRow} and $oBook->{Worksheet}[-1]{MaxRow} = $r; } } } $debug and print STDERR "\t$data[0]{sheets} sheets\n"; _def_gas (); my $active_sheet = $oBook->get_active_sheet || $oBook->{ActiveSheet} || $oBook->{SelectedSheet}; my $current_sheet = 0; foreach my $oWkS (@{$oBook->{Worksheet}}) { $debug > 8 and _dump ("oWkS", $oWkS); $current_sheet++; $opt{clip} and !defined $oWkS->{Cells} and next; # Skip empty sheets my %sheet = ( parser => 0, label => $oWkS->{Name}, maxrow => 0, maxcol => 0, cell => [], attr => [], merged => [], active => 0, hidden => $oWkS->{SheetHidden} || 0, ); # $debug and $sheet{_parser} = $oWkS; defined $sheet{label} or $sheet{label} = "-- unlabeled --"; exists $oWkS->{MinRow} and $sheet{minrow} = $oWkS->{MinRow} + 1; exists $oWkS->{MaxRow} and $sheet{maxrow} = $oWkS->{MaxRow} + 1; exists $oWkS->{MinCol} and $sheet{mincol} = $oWkS->{MinCol} + 1; exists $oWkS->{MaxCol} and $sheet{maxcol} = $oWkS->{MaxCol} + 1; $sheet{merged} = [ map { $_->[0] } sort { $a->[1] cmp $b->[1] } map {[ $_, pack "NNNN", @$_ ]} map {[ map { $_ + 1 } @{$_}[1,0,3,2] ]} @{$oWkS->get_merged_areas || []}]; my $sheet_idx = 1 + @data; $debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} x $sheet{maxcol}\n"; if (defined $active_sheet) { # _SheetNo is 0-based my $sheet_no = defined $oWkS->{_SheetNo} ? $oWkS->{_SheetNo} : $current_sheet - 1; $sheet_no eq $active_sheet and $sheet{active} = 1; } # Sheet keys: # _Book FooterMargin MinCol RightMargin # BottomMargin FooterMergin MinRow RightMergin # BottomMergin HCenter Name RowHeight # Cells Header NoColor RowHidden # ColFmtNo HeaderMargin NoOrient Scale # ColHidden HeaderMergin NoPls SheetHidden # ColWidth Kind Notes _SheetNo # Copis Landscape PageFit SheetType # DefColWidth LeftMargin PageStart SheetVersion # DefRowHeight LeftMergin PaperSize TopMargin # Draft LeftToRight _Pos TopMergin # FitHeight MaxCol PrintGrid UsePage # FitWidth MaxRow PrintHeaders VCenter # Footer MergedArea Res VRes if (exists $oWkS->{MinRow}) { my $hiddenRows = $oWkS->{RowHidden} || []; my $hiddenCols = $oWkS->{ColHidden} || []; if ($opt{clip}) { my ($mr, $mc) = (-1, -1); foreach my $r ($oWkS->{MinRow} .. $sheet{maxrow}) { foreach my $c ($oWkS->{MinCol} .. $sheet{maxcol}) { my $oWkC = $oWkS->{Cells}[$r][$c] or next; defined (my $val = $oWkC->{Val}) or next; $val eq "" and next; $r > $mr and $mr = $r; $c > $mc and $mc = $c; } } ($sheet{maxrow}, $sheet{maxcol}) = ($mr + 1, $mc + 1); } foreach my $r ($oWkS->{MinRow} .. $sheet{maxrow}) { foreach my $c ($oWkS->{MinCol} .. $sheet{maxcol}) { my $oWkC = $oWkS->{Cells}[$r][$c] or next; #defined (my $val = $oWkC->{Val}) or next; my $val = $oWkC->{Val}; if (defined $val and my $enc = $oWkC->{Code}) { $enc eq "ucs2" and $val = decode ("utf-16be", $val); } my $cell = cr2cell ($c + 1, $r + 1); $opt{rc} and $sheet{cell}[$c + 1][$r + 1] = $val; # Original my $fmt; my $FmT = $oWkC->{Format}; if ($FmT) { unless (ref $FmT) { $fmt = $FmT; $FmT = {}; } } else { $FmT = {}; } foreach my $attr (qw( AlignH AlignV FmtIdx Hidden Lock Wrap )) { exists $FmT->{$attr} or $FmT->{$attr} = 0; } exists $FmT->{Fill} or $FmT->{Fill} = [ 0 ]; exists $FmT->{Font} or $FmT->{Font} = undef; unless (defined $fmt) { $fmt = $FmT->{FmtIdx} ? $oBook->{FormatStr}{$FmT->{FmtIdx}} : undef; } lc $oWkC->{Type} eq "float" and $oWkC->{Type} = "Numeric"; if ($oWkC->{Type} eq "Numeric") { # Fixed in 0.33 and up # see Spreadsheet/ParseExcel/FmtDefault.pm $FmT->{FmtIdx} == 0x0e || $FmT->{FmtIdx} == 0x0f || $FmT->{FmtIdx} == 0x10 || $FmT->{FmtIdx} == 0x11 || $FmT->{FmtIdx} == 0x16 || (defined $fmt && $fmt =~ m{^[dmy][-\\/dmy]*$}) and $oWkC->{Type} = "Date"; $FmT->{FmtIdx} == 0x09 || $FmT->{FmtIdx} == 0x0a || (defined $fmt && $fmt =~ m{^0+\.0+%$}) and $oWkC->{Type} = "Percentage"; } defined $fmt and $fmt =~ s/\\//g; $opt{cells} and # Formatted value $sheet{$cell} = defined $val ? $FmT && exists $def_fmt{$FmT->{FmtIdx}} ? $oFmt->ValFmt ($oWkC, $oBook) : $oWkC->Value : undef; if ($opt{attr}) { my $FnT = $FmT->{Font}; my $fmi = $FmT->{FmtIdx} ? $oBook->{FormatStr}{$FmT->{FmtIdx}} : undef; $fmi and $fmi =~ s/\\//g; my $merged = (defined $oWkC->{Merged} ? $oWkC->{Merged} : $oWkC->is_merged) || 0; $sheet{attr}[$c + 1][$r + 1] = { @def_attr, type => lc $oWkC->{Type}, enc => $oWkC->{Code}, merged => $merged, hidden => ($hiddenRows->[$r] || $hiddenCols->[$c] ? 1 : defined $oWkC->{Hidden} ? $oWkC->{Hidden} : $FmT->{Hidden}) || 0, locked => $FmT->{Lock} || 0, format => $fmi, halign => [ undef, qw( left center right fill justify ), undef, "equal_space" ]->[$FmT->{AlignH}], valign => [ qw( top center bottom justify equal_space )]->[$FmT->{AlignV}], wrap => $FmT->{Wrap}, font => $FnT->{Name}, size => $FnT->{Height}, bold => $FnT->{Bold}, italic => $FnT->{Italic}, uline => $FnT->{Underline}, fgcolor => _xls_color ($FnT->{Color}), bgcolor => _xls_fill (@{$FmT->{Fill}}), formula => $oWkC->{Formula}, }; #_dump "cell", $sheet{attr}[$c + 1][$r + 1]; if ($opt{merge} && $merged and my $p_cell = Spreadsheet::Read::Sheet::merged_from (\%sheet, $c + 1, $r + 1)) { warn $p_cell; $sheet{attr}[$c + 1][$r + 1]{merged} = $p_cell; if ($cell ne $p_cell) { my ($C, $R) = cell2cr ($p_cell); $sheet{cell}[$c + 1][$r + 1] = $sheet{cell}[$C][$R]; $sheet{$cell} = $sheet{$p_cell}; } } } } } } for (@{$sheet{cell}}) { defined or $_ = []; } push @data => { %sheet }; # $data[0]{sheets}++; if ($sheet{label} eq "-- unlabeled --") { $sheet{label} = ""; } else { $data[0]{sheet}{$sheet{label}} = $#data; } } return _clipsheets \%opt, [ @data ]; } if ($opt{parser} ? $_parser =~ m/^(ods)$/ : ($io_fil && $txt =~ m/(ods)$/i && ($_parser = _parser ($1))) and ($can{$_parser} || "") !~ m/sxc/i) { my $parse_type = "ODS"; my $parser = $can{lc $parse_type} or croak _missing_parser ($parse_type); #$debug and print STDERR __FILE__, "#", __LINE__, " | $_parser | $parser | $parse_type\n"; $debug and print STDERR "Opening $parse_type ", $io_ref ? "" : $txt, " using $parser-", $can{lc $parse_type}->VERSION, "\n"; $opt{passwd} and $parser_opts{Password} = $opt{passwd}; my $oBook = eval { $io_ref ? $parser->new (readonly => 1, %parser_opts)->parse ($io_ref) : $parser->new (readonly => 1, %parser_opts)->parse ($txt) }; unless ($oBook) { # cleanup will fail on folders with spaces. (my $msg = $@) =~ s/ at \S+ line \d+.*//s; croak "$parse_type parser cannot parse data: $msg"; } $debug > 8 and _dump (oBook => $oBook); my @data = ( { type => lc $parse_type, parser => $can{lc $parse_type}, version => $can{lc $parse_type}->VERSION, parsers => [{ type => lc $parse_type, parser => $can{lc $parse_type}, version => $can{lc $parse_type}->VERSION, }], error => undef, sheets => scalar $oBook->worksheets, sheet => {}, } ); # $debug and $data[0]{_parser} = $oBook; $debug and print STDERR "\t$data[0]{sheets} sheets\n"; _def_gas (); my $active_sheet = $oBook->get_active_sheet; my $current_sheet = 0; foreach my $oWkS ($oBook->worksheets) { $current_sheet++; $opt{clip} && $oWkS->row_max < $oWkS->row_min && $oWkS->col_max < $oWkS->col_min and next; # Skip empty sheets my %sheet = ( parser => 0, label => $oWkS->label, maxrow => $oWkS->row_max+1, maxcol => $oWkS->col_max+1, cell => [], attr => [], merged => [], active => 0, hidden => 0, ); # $debug and $sheet{_parser} = $oWkS; defined $sheet{label} or $sheet{label} = "-- unlabeled --"; $sheet{merged} = [ map { $_->[0] } sort { $a->[1] cmp $b->[1] } map {[ $_, pack "NNNN", @$_ ]} map {[ map { $_ + 1 } @{$_}[1,0,3,2] ]} @{$oWkS->get_merged_areas || []}]; my $sheet_idx = 1 + @data; $debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} x $sheet{maxcol}\n"; if (defined $active_sheet) { my $sheet_no = $current_sheet - 1; $sheet_no eq $active_sheet and $sheet{active} = 1; } my $hiddenRows = $oWkS->hidden_rows || []; my $hiddenCols = $oWkS->hidden_cols || []; if ($opt{clip}) { my ($mr, $mc) = (-1, -1); foreach my $r ($oWkS->row_min .. $sheet{maxrow}-1) { foreach my $c ($oWkS->col_min .. $sheet{maxcol}-1) { my $oWkC = $oWkS->get_cell($r, $c) or next; defined (my $val = $oWkC->value) or next; $val eq "" and next; $r > $mr and $mr = $r; $c > $mc and $mc = $c; } } ($sheet{maxrow}, $sheet{maxcol}) = ($mr + 1, $mc + 1); } foreach my $r ($oWkS->row_min .. $sheet{maxrow}) { foreach my $c ($oWkS->col_min .. $sheet{maxcol}) { my $oWkC = $oWkS->get_cell($r, $c) or next; my $val = $oWkC->unformatted; #if (defined $val and my $enc = $oWkC->{Code}) { # $enc eq "ucs2" and $val = decode ("utf-16be", $val); # } my $cell = cr2cell ($c + 1, $r + 1); $opt{rc} and $sheet{cell}[$c + 1][$r + 1] = $val; # Original my $fmt; my $styleName = $oWkC->style; my $FmT; if ($styleName && defined (my $s = $oBook->_styles->{$styleName})) { $fmt = $s; } defined $fmt and $fmt =~ s/\\//g; $opt{cells} and # Formatted value $sheet{$cell} = defined $val ? $oWkC->value : undef; if ($opt{attr}) { # my $FnT = $FmT ? $FmT->{font_face} : undef; my $fmi; #my $fmi = $FmT ? $FmT->{FmtIdx} # ? $oBook->{FormatStr}{$FmT->{FmtIdx}} # : undef; #$fmi and $fmi =~ s/\\//g; my $type = $oWkC->type || ''; $type eq "float" and $type = "numeric"; my $merged = $oWkC->is_merged || 0; $sheet{attr}[$c + 1][$r + 1] = { @def_attr, type => $type, # enc => $oWkC->{Code}, merged => $merged, hidden => ($hiddenRows->[$r] || $hiddenCols->[$c] ? 1 : $oWkC->is_hidden ? $oWkC->is_hidden : undef) || 0, # locked => $FmT->{Lock} || 0, format => $fmi, # halign => [ undef, qw( left center right # fill justify ), undef, # "equal_space" ]->[$FmT->{AlignH}], # valign => [ qw( top center bottom justify # equal_space )]->[$FmT->{AlignV}], # wrap => $FmT->{Wrap}, # font => $FnT->{Name}, # size => $FnT->{Height}, # bold => $FnT->{Bold}, # italic => $FnT->{Italic}, # uline => $FnT->{Underline}, # fgcolor => _xls_color ($FnT->{Color}), # bgcolor => _xls_fill (@{$FmT->{Fill}}), formula => $oWkC->formula, }; #_dump "cell", $sheet{attr}[$c + 1][$r + 1]; if ($opt{merge} && $merged and my $p_cell = Spreadsheet::Read::Sheet::merged_from(\%sheet, $c + 1, $r + 1)) { $sheet{attr}[$c + 1][$r + 1]{merged} = $p_cell; if ($cell ne $p_cell) { my ($C, $R) = cell2cr ($p_cell); $sheet{cell}[$c + 1][$r + 1] = $sheet{cell}[$C][$R]; $sheet{$cell} = $sheet{$p_cell}; } } } } } for (@{$sheet{cell}}) { defined or $_ = []; } push @data, { %sheet }; if ($sheet{label} eq "-- unlabeled --") { $sheet{label} = ""; } else { $data[0]{sheet}{$sheet{label}} = $#data; } } return _clipsheets \%opt, [ @data ]; } if ($opt{parser} ? _parser ($opt{parser}) eq "sc" : $io_fil ? $txt =~ m/\.sc$/ : $txt =~ m/^# .*SquirrelCalc/) { if ($io_ref) { local $/; my $x = <$txt>; $txt = $x; } elsif ($io_fil) { local $/; open my $sc, "<", $txt or return; $txt = <$sc>; close $sc; } $txt =~ m/\S/ or return; my $label = defined $opt{label} ? $opt{label} : "sheet"; my @data = ( { type => "sc", parser => "Spreadsheet::Read", version => $VERSION, parsers => [{ type => "sc", parser => "Spreadsheet::Read", version => $VERSION, }], error => undef, sheets => 1, sheet => { $label => 1 }, }, { parser => 0, label => $label, maxrow => 0, maxcol => 0, cell => [], attr => [], merged => [], active => 1, hidden => 0, }, ); for (split m/\s*[\r\n]\s*/, $txt) { if (m/^dimension.*of ([0-9]+) rows.*of ([0-9]+) columns/i) { @{$data[1]}{qw(maxrow maxcol)} = ($1, $2); next; } s/^r([0-9]+)c([0-9]+)\s*=\s*// or next; my ($c, $r) = map { $_ + 1 } $2, $1; if (m/.* \{(.*)}$/ or m/"(.*)"/) { my $cell = cr2cell ($c, $r); $opt{rc} and $data[1]{cell}[$c][$r] = $1; $opt{cells} and $data[1]{$cell} = $1; $opt{attr} and $data[1]{attr}[$c + 1][$r] = { @def_attr }; next; } # Now only formula's remain. Ignore for now # r67c7 = [P2L] 2*(1000*r67c5-60) } for (@{$data[1]{cell}}) { defined or $_ = []; } return _clipsheets \%opt, [ @data ]; } if ($opt{parser} ? _parser ($opt{parser}) eq "gnumeric" : _txt_is_xml ($txt, "http://www.gnumeric.org/v10.dtd")) { $can{gnumeric} or croak _missing_parser ("gnumeric"); my $gnm = $can{gnumeric}->new (%parser_opts, attr => $opt{attr}, cells => $opt{cells}, merge => $opt{merge}, rc => $opt{rc}, gzipped_p => $opt{gzipped_p}); return _clipsheets \%opt, $gnm->parse ($txt); } if ($opt{parser} ? _parser ($opt{parser}) eq "sxc" : ($txt =~ m/^<\?xml/ or -f $txt)) { $can{sxc} or croak _missing_parser ("SXC"); ref $txt && $can{sxc}->VERSION <= 0.23 and croak ("Sorry, references as input are not supported by Spreadsheet::ReadSXC before 0.23"); my $using = "using $can{sxc}-" . $can{sxc}->VERSION; my $sxc_options = { %parser_opts, OrderBySheet => 1 }; # New interface 0.20 and up my $sxc; if ($txt =~ m/\.(sxc|ods)$/i) { $debug and print STDERR "Opening \U$1\E $txt $using\n"; $debug and print STDERR __FILE__, "#", __LINE__, "\n"; $sxc = Spreadsheet::ReadSXC::read_sxc ($txt, $sxc_options) or return; } # treat all refs as a filehandle elsif (ref $txt) { $debug and print STDERR "Opening SXC filehandle\n"; $sxc = Spreadsheet::ReadSXC::read_sxc_fh ($txt, $sxc_options) or return; } elsif ($txt =~ m/\.xml$/i) { $debug and print STDERR "Opening XML $txt $using\n"; $sxc = Spreadsheet::ReadSXC::read_xml_file ($txt, $sxc_options) or return; } # need to test on pattern to prevent stat warning # on filename with newline elsif ($txt !~ m/^<\?xml/i and -f $txt) { $debug and print STDERR "Opening XML $txt $using\n"; open my $f, "<", $txt or return; local $/; $txt = <$f>; close $f; } !$sxc && $txt =~ m/^<\?xml/i and $sxc = Spreadsheet::ReadSXC::read_xml_string ($txt, $sxc_options); $debug > 8 and _dump (sxc => $sxc); if ($sxc) { my @data = ( { type => "sxc", parser => "Spreadsheet::ReadSXC", version => $Spreadsheet::ReadSXC::VERSION, parsers => [{ type => "sxc", parser => "Spreadsheet::ReadSXC", version => $Spreadsheet::ReadSXC::VERSION, }], error => undef, sheets => 0, sheet => {}, } ); my @sheets = ref $sxc eq "HASH" # < 0.20 ? map { { label => $_, data => $sxc->{$_}, } } keys %$sxc : @{$sxc}; foreach my $sheet (@sheets) { my @sheet = @{$sheet->{data} || []}; my %sheet = ( parser => 0, label => $sheet->{label}, maxrow => scalar @sheet, maxcol => 0, cell => [], attr => [], merged => [], active => 0, hidden => 0, ); my $sheet_idx = 1 + @data; $debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} rows\n"; foreach my $r (0 .. $#sheet) { my @row = @{$sheet[$r]} or next; foreach my $c (0 .. $#row) { defined (my $val = $row[$c]) or next; my $C = $c + 1; $C > $sheet{maxcol} and $sheet{maxcol} = $C; my $cell = cr2cell ($C, $r + 1); $opt{rc} and $sheet{cell}[$C][$r + 1] = $val; $opt{cells} and $sheet{$cell} = $val; $opt{attr} and $sheet{attr}[$C][$r + 1] = { @def_attr }; } } for (@{$sheet{cell}}) { defined or $_ = []; } $debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} x $sheet{maxcol}\n"; push @data, { %sheet }; $data[0]{sheets}++; $data[0]{sheet}{$sheet->{label}} = $#data; } return _clipsheets \%opt, [ @data ]; } } if (!ref $txt and $txt =~ m/\.\w+$/) { # Return (localized) system message open my $fh, "<", $txt and croak "I can open file $txt, but I do not know how to parse it\n"; $@ = $!; } return; } # ReadData sub add { my $book = shift; my $r = ReadData (@_) or return; $book && (ref $book eq "ARRAY" || ref $book eq __PACKAGE__) && $book->[0]{sheets} or return $r; my $c1 = $book->[0]; my $c2 = $r->[0]; unless ($c1->{parsers}) { $c1->{parsers}[0]{$_} = $c1->{$_} for qw( type parser version ); $book->[$_]{parser} = 0 for 1 .. $c1->{sheets}; } my ($pidx) = (grep { my $p = $c1->{parsers}[$_]; $p->{type} eq $c2->{type} && $p->{parser} eq $c2->{parser} && $p->{version} eq $c2->{version} } 0 .. $#{$c1->{parsers}}); unless (defined $pidx) { $pidx = scalar @{$c1->{parsers}}; $c1->{parsers}[$pidx]{$_} = $c2->{$_} for qw( type parser version ); } foreach my $sn (sort { $c2->{sheet}{$a} <=> $c2->{sheet}{$b} } keys %{$c2->{sheet}}) { my $s = $sn; my $v = 2; while (exists $c1->{sheet}{$s}) { $s = $sn."[".$v++."]"; } $c1->{sheet}{$s} = $c1->{sheets} + $c2->{sheet}{$sn}; $r->[$c2->{sheet}{$sn}]{parser} = $pidx; push @$book, $r->[$c2->{sheet}{$sn}]; } $c1->{sheets} += $c2->{sheets}; return $book; } # add package Spreadsheet::Read::Attribute; use Carp; use vars qw( $AUTOLOAD ); sub AUTOLOAD { my $self = shift; (my $attr = $AUTOLOAD) =~ s/.*:://; $self->{$attr}; } # AUTOLOAD package Spreadsheet::Read::Sheet; sub cell { my ($sheet, @id) = @_; @id == 2 && $id[0] =~ m/^[0-9]+$/ && $id[1] =~ m/^[0-9]+$/ and return $sheet->{cell}[$id[0]][$id[1]]; @id && $id[0] && exists $sheet->{$id[0]} and return $sheet->{$id[0]}; } # cell sub attr { my ($sheet, @id) = @_; my $class = "Spreadsheet::Read::Attribute"; @id == 2 && $id[0] =~ m/^[0-9]+$/ && $id[1] =~ m/^[0-9]+$/ and return bless $sheet->{attr}[$id[0]][$id[1]] => $class; if (@id && $id[0] && exists $sheet->{$id[0]}) { my ($c, $r) = $sheet->cell2cr ($id[0]); return bless $sheet->{attr}[$c][$r] => $class; } undef; } # attr sub maxrow { my $sheet = shift; return $sheet->{maxrow}; } # maxrow sub maxcol { my $sheet = shift; return $sheet->{maxcol}; } # maxrow sub col2label { $_[0] =~ m/::/ and shift; # class unused return Spreadsheet::Read::col2label (@_); } # col2label sub cr2cell { $_[0] =~ m/::/ and shift; # class unused return Spreadsheet::Read::cr2cell (@_); } # cr2cell sub cell2cr { $_[0] =~ m/::/ and shift; # class unused return Spreadsheet::Read::cell2cr (@_); } # cell2cr sub label { my ($sheet, $label) = @_; defined $label and $sheet->{label} = $label; return $sheet->{label}; } # label sub active { my $sheet = shift; return $sheet->{active}; } # label sub hidden { my $sheet = shift; return $sheet->{hidden}; } # label # my @row = $sheet->cellrow (1); sub cellrow { my ($sheet, $row) = @_; defined $row && $row > 0 && $row <= $sheet->{maxrow} or return; my $s = $sheet->{cell}; map { $s->[$_][$row] } 1..$sheet->{maxcol}; } # cellrow # my @row = $sheet->row (1); sub row { my ($sheet, $row) = @_; defined $row && $row > 0 && $row <= $sheet->{maxrow} or return; map { $sheet->{$sheet->cr2cell ($_, $row)} } 1..$sheet->{maxcol}; } # row # my @col = $sheet->cellcolumn (1); sub cellcolumn { my ($sheet, $col) = @_; defined $col && $col > 0 && $col <= $sheet->{maxcol} or return; my $s = $sheet->{cell}; map { $s->[$col][$_] } 1..$sheet->{maxrow}; } # cellcolumn # my @col = $sheet->column (1); sub column { my ($sheet, $col) = @_; defined $col && $col > 0 && $col <= $sheet->{maxcol} or return; map { $sheet->{$sheet->cr2cell ($col, $_)} } 1..$sheet->{maxrow}; } # column # Convert {cell}'s [column][row] to a [row][column] list # my @rows = $sheet->rows (); sub rows { my $sheet = shift; my $s = $sheet->{cell}; map { my $r = $_; [ map { $s->[$_][$r] } 1..$sheet->{maxcol} ]; } 1..$sheet->{maxrow}; } # rows sub merged_from { my ($sheet, @id, $col, $row) = @_; my $ma = $sheet->{merged} or return; if (@id == 2 && $id[0] =~ m/^[0-9]+$/ && $id[1] =~ m/^[0-9]+$/) { ($col, $row) = @id; } elsif (@id && $id[0] && exists $sheet->{$id[0]}) { ($col, $row) = cell2cr ($id[0]); } defined $row && $row > 0 && $row <= $sheet->{maxrow} or return; defined $col && $col > 0 && $col <= $sheet->{maxcol} or return; foreach my $range (@{$ma}) { my ($ctl, $rtl, $cbr, $rbr) = @{$range}; $col >= $ctl && $col <= $cbr or next; $row >= $rtl && $row <= $rbr or next; return cr2cell ($ctl, $rtl); } } # cell 1; BEGIN { $INC{"Z10/Just/For/Testing.pm"} = $0; $INC{"Z20/Just/For/Testing.pm"} = $0; $Z10::Just::For::Testing::VERSION = "1.00"; $Z20::Just::For::Testing::VERSION = undef; } package Z10::Just::For::Testing; 1; package Z20::Just::For::Testing; 1; __END__ =head1 DESCRIPTION Spreadsheet::Read tries to transparently read *any* spreadsheet and return its content in a universal manner independent of the parsing module that does the actual spreadsheet scanning. For OpenOffice and/or LibreOffice this module uses L For Microsoft Excel this module uses L, L, or L (strongly discouraged). For CSV this module uses L or L. For SquirrelCalc there is a very simplistic built-in parser =head2 Data structure The data is returned as an array reference: $book = [ # Entry 0 is the overall control hash { sheets => 2, sheet => { "Sheet 1" => 1, "Sheet 2" => 2, }, parsers => [ { type => "xls", parser => "Spreadsheet::ParseExcel", version => 0.59, }], error => undef, }, # Entry 1 is the first sheet { parser => 0, label => "Sheet 1", maxrow => 2, maxcol => 4, cell => [ undef, [ undef, 1 ], [ undef, undef, undef, undef, undef, "Nugget" ], ], attr => [], merged => [], active => 1, hidden => 0, A1 => 1, B5 => "Nugget", }, # Entry 2 is the second sheet { parser => 0, label => "Sheet 2", : : To keep as close contact to spreadsheet users, row and column 1 have index 1 too in the C element of the sheet hash, so cell "A1" is the same as C [1, 1] (column first). To switch between the two, there are helper functions available: C, C, and C. The C hash entry contains unformatted data, while the hash entries with the traditional labels contain the formatted values (if applicable). The control hash (the first entry in the returned array ref), contains some spreadsheet meta-data. The entry C is there to be able to find the sheets when accessing them by name: my %sheet2 = %{$book->[$book->[0]{sheet}{"Sheet 2"}]}; =head2 Formatted vs Unformatted The difference between formatted and unformatted cells is that the (optional) format is applied to the cell or not. This part is B implemented on the parser side. Spreadsheet::Read just makes both available if these are supported. Options provide means to disable either. If the parser does not provide formatted cells - like CSV - both values are equal. To show what this implies: use Spreadsheet::Read; my $file = "files/example.xlsx"; my $workbook = Spreadsheet::Read->new ($file); my $info = $workbook->[0]; say "Parsed $file with $info->{parser}-$info->{version}"; my $sheet = $workbook->sheet (1); say join "\t" => "Formatted:", $sheet->row (1); say join "\t" => "Unformatted:", $sheet->cellrow (1); Might return very different results depending one the underlying parser (and its version): Parsed files/example.xlsx with Spreadsheet::ParseXLSX-0.27 Formatted: 8-Aug Foo & Barr < Quux Unformatted: 39668 Foo & Barr < Quux Parsed files/example.xlsx with Spreadsheet::XLSX-0.15 Formatted: 39668 Foo & Barr < Quux Unformatted: 39668 Foo & Barr < Quux =head2 Functions and methods =head3 new my $book = Spreadsheet::Read->new (...) or die $@; All options accepted by ReadData are accepted by new. With no arguments at all, $book will be an object where sheets can be added using C my $book = Spreadsheet::Read->new (); $book->add ("file.csv"); $book->add ("file.cslx"); =head3 ReadData my $book = ReadData ($source [, option => value [, ... ]]); my $book = ReadData ("file.csv", sep => ',', quote => '"'); my $book = ReadData ("file.xls", dtfmt => "yyyy-mm-dd"); my $book = ReadData ("file.ods"); my $book = ReadData ("file.sxc"); my $book = ReadData ("content.xml"); my $book = ReadData ($content); my $book = ReadData ($content, parser => "xlsx"); my $book = ReadData ($fh, parser => "xlsx"); my $book = ReadData (\$content, parser => "xlsx"); Tries to convert the given file, string, or stream to the data structure described above. Processing Excel data from a stream or content is supported through a L temporary file or L when available. L does preserve sheet order as of version 0.20. Choosing between C<$content> and C<\\$content> (with or without passing the desired C option) may be depending on trial and terror. C does try to determine parser type on content if needed, but not all combinations are checked, and not all signatures are builtin. Currently supported options are: =over 2 =item parser X Force the data to be parsed by a specific format. Possible values are C, C (or C), C (or C), C (or C, C, C, C) C (or C), and C (or C). When parsing streams, instead of files, it is highly recommended to pass this option. Spreadsheet::Read supports several underlying parsers per spreadsheet type. It will try those from most favored to least favored. When you have a good reason to prefer a different parser, you can set that in environment variables. The other options then will not be tested for: env SPREADSHEET_READ_CSV=Text::CSV_PP ... You can also directly pass the required backend, forcing the matching type, but this excludes version checking. # Checks for minimal version BEGIN { $ENV{SPREADSHEET_READ_CSV} = "Text::CSV_PP" } my $book = ReadData ("test.csv", parser => "csv"); vs # NO check for minimal version my $book = ReadData ("test.csv", parser => "Text::CSV_PP"); =item cells X Control the generation of named cells ("C" etc). Default is true. =item rc Control the generation of the {cell}[c][r] entries. Default is true. =item attr Control the generation of the {attr}[c][r] entries. Default is false. See L below. =item clip If set, L|/ReadData> will remove all trailing rows and columns per sheet that have no data, where no data means only undefined or empty cells (after optional stripping). If a sheet has no data at all, the sheet will be skipped entirely when this attribute is true. =item trim =item strip If set, L|/ReadData> will remove trailing- and/or leading-whitespace from every field. strip leading strailing ----- ------- --------- 0 n/a n/a 1 strip n/a 2 n/a strip 3 strip strip C and C are aliases. If passed both, C is ignored because of backward compatibility. =item transpose =item pivot Swap all rows and columns. When a sheet contains data like A1 B1 C1 E1 A2 C2 D2 A3 B3 C3 D3 E3 using C or C will return the sheet data as A1 A2 A3 B1 B3 C1 C2 C3 D2 D3 E1 E3 C and C are aliases. If passed both, C is ignored because of backward compatibility. =item sep Set separator for CSV. Default is comma C<,>. =item quote Set quote character for CSV. Default is C<">. =item dtfmt Set the format for MS-Excel date fields that are set to use the default date format. The default format in Excel is "C", which is both not year 2000 safe, nor very useful. The default is now "C", which is more ISO-like. Note that date formatting in MS-Excel is not reliable at all, as it will store/replace/change the date field separator in already stored formats if you change your locale settings. So the above mentioned default can be either "C" OR "C" depending on what that specific character happened to be at the time the user saved the file. =item merge Copy content to all cells in merged areas. If supported, this will copy formatted and unformatted values from the top-left cell of a merged area to all other cells in the area. =item debug Enable some diagnostic messages to STDERR. The value determines how much diagnostics are dumped (using L). A value of C<9> and higher will dump the entire structure from the back-end parser. =item passwd Use this password to decrypt password protected spreadsheet. Currently only supports Excel. =back All other attributes/options will be passed to the underlying parser if that parser supports attributes. =head3 col2label my $col_id = col2label (col); my $col_id = $book->col2label (col); # OO C converts a C<(column)> (1 based) to the letters used in the traditional cell notation: my $id = col2label ( 4); # $id now "D" my $id = col2label (28); # $id now "AB" =head3 cr2cell my $cell = cr2cell (col, row); my $cell = $book->cr2cell (col, row); # OO C converts a C<(column, row)> pair (1 based) to the traditional cell notation: my $cell = cr2cell ( 4, 14); # $cell now "D14" my $cell = cr2cell (28, 4); # $cell now "AB4" =head3 cell2cr my ($col, $row) = cell2cr ($cell); my ($col, $row) = $book->cell2cr ($cell); # OO C converts traditional cell notation to a C<(column, row)> pair (1 based): my ($col, $row) = cell2cr ("D14"); # returns ( 4, 14) my ($col, $row) = cell2cr ("AB4"); # returns (28, 4) =head3 row my @row = row ($sheet, $row) my @row = Spreadsheet::Read::row ($book->[1], 3); my @row = $book->row ($sheet, $row); # OO Get full row of formatted values (like C<< $sheet->{A3} .. $sheet->{G3} >>) Note that the indexes in the returned list are 0-based. C is not imported by default, so either specify it in the use argument list, or call it fully qualified. See also the C method on sheets. =head3 cellrow my @row = cellrow ($sheet, $row); my @row = Spreadsheet::Read::cellrow ($book->[1], 3); my @row = $book->cellrow ($sheet, $row); # OO Get full row of unformatted values (like C<< $sheet->{cell}[1][3] .. $sheet->{cell}[7][3] >>) Note that the indexes in the returned list are 0-based. C is not imported by default, so either specify it in the use argument list, or call it fully qualified or as method call. See also the C method on sheets. =head3 rows my @rows = rows ($sheet); my @rows = Spreadsheet::Read::rows ($book->[1]); my @rows = $book->rows (1); # OO Convert C<{cell}>'s C<[column][row]> to a C<[row][column]> list. Note that the indexes in the returned list are 0-based, where the index in the C<{cell}> entry is 1-based. C is not imported by default, so either specify it in the use argument list, or call it fully qualified. =head3 parses parses ($format); Spreadsheet::Read::parses ("CSV"); $book->parses ("CSV"); # OO C returns Spreadsheet::Read's capability to parse the required format or C<0> if it does not. L|/ReadData> will pick its preferred parser for that format unless overruled. See L|/parser>. C is not imported by default, so either specify it in the use argument list, or call it fully qualified. If C<$format> is false (C, C<"">, or C<0>), C will return a sorted list of supported types. @my types = parses (""); # e.g: csv, ods, sc, sxc, xls, xlsx =head3 parsers my @p = parsers (); C returns a list of hashrefs with information about supported parsers, each giving information about the parser, its versions and if it will be used as default parser for the given type, like: { ext => "csv", # extension or type mod => "Text::CSV_XS", # parser module min => "0.71", # module required version vsn => "1.45", # module installed version def => "*", # is default for ext } As the modules are actually loaded to get their version, do only use this to analyse prerequisites. =head3 Version my $v = Version () my $v = Spreadsheet::Read::Version () my $v = Spreadsheet::Read->VERSION; my $v = $book->Version (); # OO Returns the current version of Spreadsheet::Read. C is not imported by default, so either specify it in the use argument list, or call it fully qualified. This function returns exactly the same as C<< Spreadsheet::Read->VERSION >> returns and is only kept for backward compatibility reasons. =head3 sheets my $sheets = $book->sheets; # OO my @sheets = $book->sheets; # OO In scalar context return the number of sheets in the book. In list context return the labels of the sheets in the book. This list only returns known unique labels in sorted order. Sheets could have no label or there can be more sheets with the same label (depends on the spreadsheet format and the parser used). =head3 sheet my $sheet = $book->sheet (1); # OO my $sheet = $book->sheet ("Foo"); # OO Return the numbered or named sheet out of the book. Will return C if there is no match. Will not work for sheets I with a number between 1 and the number of sheets in the book. With named sheets will first try to use the list of sheet-labels as stored in the control structure. If no match is found, it will scan the actual labels of the sheets. In that case, it will return the first matching sheet. If defined, the returned sheet will be of class C. =head3 add my $book = ReadData ("file.csv"); Spreadsheet::Read::add ($book, "file.xlsx"); my $book = Spreadsheet::Read->new ("file.csv"); $book->add ("file.xlsx"); # OO =head2 Methods on sheets =head3 maxcol my $col = $sheet->maxcol; Return the index of the last in-use column in the sheet. This index is 1-based. =head3 maxrow my $row = $sheet->maxrow; Return the index of the last in-use row in the sheet. This index is 1-based. =head3 cell my $cell = $sheet->cell ("A3"); my $cell = $sheet->cell (1, 3); Return the value for a cell. Using tags will return the formatted value, using column and row will return unformatted value. =head3 attr my $cell = $sheet->attr ("A3"); my $cell = $sheet->attr (1, 3); Return the attributes of a cell. Only valid if attributes are enabled through option C. =head3 col2label my $col_id = $sheet->col2label (col); C converts a C<(column)> (1 based) to the letters used in the traditional cell notation: my $id = $sheet->col2label ( 4); # $id now "D" my $id = $sheet->col2label (28); # $id now "AB" =head3 cr2cell my $cell = $sheet->cr2cell (col, row); C converts a C<(column, row)> pair (1 based) to the traditional cell notation: my $cell = $sheet->cr2cell ( 4, 14); # $cell now "D14" my $cell = $sheet->cr2cell (28, 4); # $cell now "AB4" =head3 cell2cr my ($col, $row) = $sheet->cell2cr ($cell); C converts traditional cell notation to a C<(column, row)> pair (1 based): my ($col, $row) = $sheet->cell2cr ("D14"); # returns ( 4, 14) my ($col, $row) = $sheet->cell2cr ("AB4"); # returns (28, 4) =head3 col my @col = $sheet->column ($col); Get full column of formatted values (like C<< $sheet->{C1} .. $sheet->{C9} >>) Note that the indexes in the returned list are 0-based. =head3 cellcolumn my @col = $sheet->cellcolumn ($col); Get full column of unformatted values (like C<< $sheet->{cell}[3][1] .. $sheet->{cell}[3][9] >>) Note that the indexes in the returned list are 0-based. =head3 row my @row = $sheet->row ($row); Get full row of formatted values (like C<< $sheet->{A3} .. $sheet->{G3} >>) Note that the indexes in the returned list are 0-based. =head3 cellrow my @row = $sheet->cellrow ($row); Get full row of unformatted values (like C<< $sheet->{cell}[1][3] .. $sheet->{cell}[7][3] >>) Note that the indexes in the returned list are 0-based. =head3 rows my @rows = $sheet->rows (); Convert C<{cell}>'s C<[column][row]> to a C<[row][column]> list. Note that the indexes in the returned list are 0-based, where the index in the C<{cell}> entry is 1-based. =head3 merged_from my $top_left = $sheet->merged_from ("C2"); my $top_left = $sheet->merged_from (3, 2); If the parser supports merged areas, this method will return the label of the top-left cell in the merged area the requested cell is part of. If the requested ID is valid and withing the sheet cell range, but not part of a merged area, it will return C<"">. If the ID is not valid or out of range, it returns C. See L for more details. =head3 label my $label = $sheet->label; $sheet->label ("New sheet label"); Set a new label to a sheet. Note that the index in the control structure will I be updated. =head3 active my $sheet_is_active = $sheet->active; Returns 1 if the selected sheet is active, otherwise returns 0. Currently only works on XLS (as of Spreadsheed::ParseExcel-0.61). CSV is always active. =head3 hidden my $sheet_is_hidden = $sheet->hidden; Returns 1 if the selected sheet is hidden, otherwise returns 0. Fully depends on the backend supporting this. CSV and SC are never hidden. =head2 Using CSV In case of CSV parsing, L|/ReadData> will use the first line of the file to auto-detect the separation character if the first argument is a file and both C and C are not passed as attributes. L (or L) is able to automatically detect and use C<\r> line endings. CSV can parse streams too, but be sure to pass C and/or C if these do not match the default C<,> and C<">. When an error is found in the CSV, it is automatically reported (to STDERR). The structure will store the error in C<< $ss->[0]{error} >> as anonymous list returned by Lerror_diag >>|https://metacpan.org/pod/Text::CSV_XS#error_diag>. See L for documentation. my $ss = ReadData ("bad.csv"); $ss->[0]{error} and say $ss->[0]{error}[1]; As CSV has no sheet labels, the default label for a CSV sheet is its filename. For CSV, this can be overruled using the I/L pair. =item Spreadsheet::Perl L offers a Pure Perl implementation of a spreadsheet engine. Users that want this format to be supported in Spreadsheet::Read are hereby motivated to offer patches. It is not high on my TODO-list. =item Spreadsheet::CSV L offers the interesting approach of seeing all supported spreadsheet formats as if it were CSV, mimicking the L interface. =item xls2csv L offers an alternative for my C, in the xls2csv tool, but this tool focuses on character encoding transparency, and requires some other modules. =back =head1 AUTHOR H.Merijn Brand =head1 COPYRIGHT AND LICENSE Copyright (C) 2005-2024 H.Merijn Brand This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut