#
# LotusParser.pm (Contains LotusParser and BinaryRead classes)
# @desc A class that can be used to read Lotus 123 - .wk4 format files.
#
# Author: Mayur
#
# $Header$
#-------------------------------------------------------------------------

=head1 NAME

F<LotusParser.pm> - A class that can be used to read 
Lotus 123 - .wk4 format files.

=head1 SYNOPSIS

  use Parse::LotusParser;

  my $wk4parser = new Parse::LotusParser();
  $wk4parser->parse("test.wk4");

  my $n_sheets = $wk4parser->get_number_of_sheets();

  for (my $i = 0; $i < $n_sheets; $i++) {
      my $n_rows = $wk4parser->get_number_of_rows($i);

      for (my $j = 0; $j < $n_rows; $j++) {
          my $n_cols = $wk4parser->get_number_of_cols($i, $j);
          my $row = $wk4parser->get_row($i, $j);

          foreach my $col (@$row) {
              print ((defined $col) ? "$col, " : "null, ");
          }
          print "\n";
      }
  }

=head1 DESCRIPTION

A class that can be used to read Lotus 123 - .wk4 format files.
This parser reads only values from the file, it ignores formatting 
information.

Visit http://www.mettalogic.uklinux.net/tim/l123/l123r4.html for 
the exact format.

=cut

#------------------------------------------------------------------------#
#                              BinaryRead                                #
#------------------------------------------------------------------------#

=head1 INTERFACE of BinaryRead

=head2 DESCRIPTION

BinaryRead is a class that can be used to read binary files.
It provides two functions to read bytes:
get_bytes(num_bytes) and get_byte().
Both the functions return ascii values of the characters read.

=head2 INTERFACE

=cut

#-------------------------------------------------------------------------

our $VERSION = "1.0 beta";

# Package name
package BinaryRead;

use strict;

#-------------------------------------------------------------------------

=item * I<new(filename)>

Constructs a new instance of BinaryRead for the given file.

=cut

sub new {
    my ($class, $filename) = @_;

    my $self = {};
    bless $self, $class;

    if (!defined $filename) {
        warn "Filename is undefined";
        return undef;
    }

    my $IN;
    if (!open($IN, $filename)) {
        warn "Unable to open file \"$filename\"";
        return undef;
    }

    # Open in Binary mode.
    binmode($IN);
    # Save the file handle
    $self->{handle} = $IN;

    return $self;
}

#-------------------------------------------------------------------------

=item * I<get_bytes(num_bytes)>

Reads num_bytes from the input file and returns a reference to the 
array of bytes read. The values are ascii equivalents of characters 
read.

=cut

sub get_bytes($$) {
    my ($self, $n) = @_;

    warn "num_bytes is undefined or zero" if (!$n);

    my $buffer;
    my $IN = $self->{handle};
    sysread($IN, $buffer, $n);
    my @bytes = map(ord($_), split(//, $buffer));

    return \@bytes;
}

#-------------------------------------------------------------------------

=item * I<get_byte()>

Reads one byte from the input file and returns the ascii 
equivalent.

=cut

sub get_byte {
    my ($self) = @_;

    my $buffer;
    my $IN = $self->{handle};
    sysread($IN, $buffer, 1);

    return ord($buffer);
}

1;

#------------------------------------------------------------------------#
#                           LotusParser                               #
#------------------------------------------------------------------------#

=head1 INTERFACE of LotusParser

=cut

#-------------------------------------------------------------------------

# Package name
package Parse::LotusParser;

use warnings qw(all);
use strict;

#-------------------------------------------------------------------------

=item * I<new()>

Creates a new instance of LotusParser.

=cut

sub new {
    my ($class) = @_;

    my $self = {};
    bless $self, $class;

    # Initialize data.
    $self->{data} = [];

    return $self;
}

#-------------------------------------------------------------------------

=item * I<parse(filename)>

Parses the input file and caches the results.

=cut

sub parse($$) {
    my ($self, $filename) = @_;
    
    my $fh = new BinaryRead($filename);
    if (!defined $fh) {
        warn "Unable to parse file \"$filename\"";
        return undef;
    }

    # Cache the file handle.
    $self->{handle} = $fh;
    # Initialize data on every parse.
    $self->{data}   = [];
    
    while (1) {
        my $cell = $self->_get_next_cell();

        last if (!defined $cell);
        next if (!exists $cell->{value});

        # Cache the cell.
        $self->{data}->[$cell->{sheet}]->[$cell->{row}]->[$cell->{col}] = 
          $cell->{value};
    }

    # If any sheet or row is undef then assign an empty array to it.
    my $n_sheets = scalar(@{$self->{data}});
    for (my $i = 0; $i < $n_sheets; $i++) {
        # Assign an empty sheet.
        $self->{data}->[$i] = [] if (!defined $self->{data}->[$i]);

        my $n_rows = scalar(@{$self->{data}->[$i]});
        for (my $j = 0; $j < $n_rows; $j++) {
            # Assign an empty row.
            $self->{data}->[$i]->[$j] = [] 
                if (!defined $self->{data}->[$i]->[$j]);

            my $n_cols = scalar(@{$self->{data}->[$i]->[$j]});
            for (my $k = 0; $k < $n_cols; $k++) {
                # Assign a null value.
                $self->{data}->[$i]->[$j]->[$k] = ""
                    if (!defined $self->{data}->[$i]->[$j]->[$k]);
            }
        }
    }
}

#-------------------------------------------------------------------------

=item * I<get_row(sheet_number, row_number)>

Returns an array reference of a row (collection of column values).

=cut

sub get_row {
    my ($self, $sheet, $row) = @_;

    return undef if ($row >= $self->get_number_of_rows($sheet));

    return $self->{data}->[$sheet]->[$row];
}

#-------------------------------------------------------------------------

=item * I<get_col(sheet_number, row_number, col_number)>

Returns a column value in a specified [sheet, row, column].

=cut

sub get_col {
    my ($self, $sheet, $row, $col) = @_;

    my $row_ref = $self->get_row($sheet, $row);
    return $row_ref->[$col] 
        if (defined $row_ref && $col < scalar(@{$row_ref}));
    return undef;
}

#-------------------------------------------------------------------------

=item * I<get_number_of_sheets()>

Returns the number of sheets in the file that is parsed.

=cut

sub get_number_of_sheets {
    my ($self) = @_;

    return scalar(@{$self->{data}});
}

#-------------------------------------------------------------------------

=item * I<get_number_of_rows(sheet_number)>

Returns the number of rows in a given sheet in the file that is parsed.

(Works only after the file has been parsed).

=cut

sub get_number_of_rows {
    my ($self, $sheet) = @_;

    return scalar(@{$self->{data}->[$sheet]}) 
        if ($sheet < $self->get_number_of_sheets());
    return 0;
}

#-------------------------------------------------------------------------

=item * I<get_number_of_cols(sheet_number, row_number)>

Returns the number of columns in a row for a sheet in the file that is parsed.

=cut

sub get_number_of_cols {
    my ($self, $sheet, $row) = @_;

    my $row_ref = $self->get_row($sheet, $row);
    return scalar(@{$row_ref})
        if (defined $row_ref);
    return 0;
}

#-------------------------------------------------------------------------

=head1 PRIVATE FUNCTIONS

=item * I<_get_next_cell()>

Returns a cell of data. Returns undef if any one of 
eof | end-of-section | unhandled records is encountered.
The cell is a map having following fields:

=over 4

=item row   - The row number of the cell.

=item sheet - Sheet number of the cell.

=item col   - The column number of the cell.

=item value - Value stored in the cell. The value can

be Text, double, or undef.

=cut

sub _get_next_cell {
    my ($self) = @_;
    
    # Get the file handle.
    my $fh = $self->{handle};

    # Read a record. A record has 4 bytes. First 2 bytes determine the type 
    # and next two bytes tell the length of the record.
    my $hdr = $fh->get_bytes(4);
    return undef if (scalar(@$hdr) != 4);
    my $code = $hdr->[0] + $hdr->[1] * 256;
    my $length = $hdr->[2] + $hdr->[3] * 256;

    # Process the codes now.
    if ($code == 0x00) {
        # Start of a section.
        my $rec = $fh->get_bytes($length); # Usually length = 26.
        if (scalar(@$rec) != $length) {
            warn "Unable to read start section record";
            return undef;
        }
        my $map = {};
        return $map;
    }
    elsif ($code == 0x01) { # length must be zero.
        # End of a section. Only one section is required.
        return undef;
    }
    elsif ($code == 0x16) {
        # Text data. Variable length (= 5 + string length + '\0').
        # A cell reference has 4 bytes: row_low, row_high, sheet, and col.
        # The fifth byte is alignment.
        my $cell = $fh->get_bytes(5);
        my $text = "";
        while (1) {
            my $byte = $fh->get_byte();
            last if ($byte == 0);
            $text .= chr($byte);
        }
        my $map = {
                   row   => $cell->[0] + $cell->[1] * 256,
                   sheet => $cell->[2],
                   col   => $cell->[3],
                   value => $text,
                  };
        return $map;
    }
    elsif ($code == 0x17) {
        # Long double data. Length = 4 + 10 = 14.
        # A cell reference has 4 bytes: row_low, row_high, sheet, and col.
        my $cell = $fh->get_bytes(4);
        my $bytes = $fh->get_bytes(10);
        my $value = _convert10bytes2double($bytes);
        my $map = {
                   row   => $cell->[0] + $cell->[1] * 256,
                   sheet => $cell->[2],
                   col   => $cell->[3],
                   value => $value,
                  };
        return $map;
    }
    elsif ($code == 0x18) {
        # Encoded double data. Length = 4 + 2 = 6.
        # A cell reference has 4 bytes: row_low, row_high, sheet, and col.
        my $cell = $fh->get_bytes(4);
        my $e_value = $fh->get_bytes(2);
        my $value = _decode($e_value->[0], $e_value->[1]);
        my $map = {
                   row   => $cell->[0] + $cell->[1] * 256,
                   sheet => $cell->[2],
                   col   => $cell->[3],
                   value => $value,
                  };
        return $map;
    }
    else {
        # Other data (skip)
        my $rec = $fh->get_bytes($length);
        if (scalar(@$rec) != $length) {
            warn "Unable to read junk record";
            return undef;
        }
        # Return an empty map.
        my $map = {};
        return $map;
    }
}

#-------------------------------------------------------------------------

=item * I<_decode(bytes0, bytes1)>

Decodes an encoded number for record type 18.

=cut

sub _decode {
    my ($bytes0, $bytes1) = @_;

    my $h = ($bytes1 << 8) | $bytes0;
    return $h / 2 if (! ($h & 1));

    my $ld = $h >> 4;
    my $h1 = $h & 0x0F;
    # Violation coding conventions for clarity.
    if ($h1 == 0x1)    {return $ld * 500000;}
    elsif ($h1 == 0x3) {return $ld * 500;}
    elsif ($h1 == 0x5) {return $ld / 20;}
    elsif ($h1 == 0x7) {return $ld / 200;}
    elsif ($h1 == 0x9) {return $ld / 2000;}
    elsif ($h1 == 0xB) {return $ld / 20000;}
    elsif ($h1 == 0xD) {return $ld / 16;}
    elsif ($h1 == 0xF) {return $ld / 64;}

    return -1;
}

#-------------------------------------------------------------------------

=item * I<_convert10bytes2double(bytes)>

Converts 10 bytes (representing a long double) to double equivalent 
for record type 17.

It uses IEEE standard for this conversion.

The formula used is: (-1)^s * 1.M * 2^(E - Bias)

where, s = highest bit (0 for positive number, and 1 for negative number.)

M is 64 bit mantissa (the highest bit of M is ignored).

E is 15 bit exponent. Bias is (2^14 - 1).

=cut

sub _convert10bytes2double {
    my ($bytes) = @_;

    # Get the sign bit.
    my $sign = (($bytes->[9] >> 8) & 1) == 0 ? 1 : 0;
    # Get the exponent.
    my $exp = ($bytes->[9] & 0x7F) * 256 + $bytes->[8] - 16383;
    
    # Compute exponent.
    my $mantissa = 0.0;
    my ($i, $j);
    my $p = 1.0;
    for ($i = 7; $i >= 0; $i--) {
	my $start = ($i == 7) ? 6 : 7;
	for ($j = $start; $j >= 0; $j--) {
	    $p *= 2.0;
	    if (($bytes->[$i] >> $j) & 1) {
              $mantissa += (1.0 / $p);
          }
	}
    }

    # Compute 2^(E - Bias).
    $p = 1.0;
    for ($i = 0; $i < $exp; $i++) {
	$p *= 2.0;
    }
    
    # The double representation.
    return ($sign * (1 + $mantissa) * $p);
}

1;

#-------------------------------------------------------------------------

__END__

=head1 SEE ALSO

L<http://www.mettalogic.uklinux.net/tim/l123/l123r4.htm>

=head1 AUTHOR

Mayur L<srivastm@rediffmail.com>

=head1 COPYRIGHT

Parse::LotusParser was originally written by Mayur.
Any Perl developer can use or modify this module, provided that he/she 
adds this copyright notice to the script/modified code.

$Revision$

$Source$

=cut