#!/usr/bin/perl
#===============================================================================
#
#         File:  gmailarchiver.pl
#
#  Description: Takes an mbox mailing list archive (i.e.
#               mailman) and copies the contents to your
#               gmail account into the specified folder.
#
#       Author:  Andy Harrison <aharrison@gmail.com>
#      Version:  $Id: gmailarchiver.pl,v 2.1 2008/11/19 18:19:57 aharrison Exp $
#     Revision:  $Revision: 2.1 $
#===============================================================================

# {{{ Modules and Variables
#

our $VERSION = sprintf "%d.%d", q$Revision: 2.1 $ =~ /(\d+)/g;

use strict;
use warnings;
$|++;

use Getopt::Long qw/:config auto_help auto_version/;
use Mail::IMAPClient;
use IO::Socket::SSL;
use Email::Folder;
use Email::Folder::Mbox;
use Date::Manip;

$SIG{'INT'} = 'diesig';



# }}}

# {{{ Commandline Options
#

my $opts = {};

# Hardcode your username and password here:
#
#$opts->{dest_user}   = '';
#$opts->{dest_pw}     = '';

$opts->{dest_server} = 'imap.gmail.com';
$opts->{dest_port}   = '993';
$opts->{debug}       = 0;
$opts->{verbose}     = 0;
$opts->{sleep}       = 0;
$opts->{wait}        = 0;

GetOptions( $opts,

                    'url=s@{,}',
                    'mbox=s@{,}',
                    'mailman|archive_page=s',
                    'dest_server|server=s',
                    'dest_port|port=i',
                    'dest_folder|folder=s',
                    'dest_user|user=s',
                    'dest_pw|pw=s',
                    'subject=s',
                    'sleep=i',
                    'wait=i',

                    'debug!',
                    'test!',
                    'verbose!',

          );

dieright("Must specify a folder name for the destination: $@\n")
    unless defined $opts->{dest_folder};

dieright("Must specify a username and password...\n")
    unless defined $opts->{dest_user} && defined $opts->{dest_pw};

#
# }}}

# {{{ main
#

# {{{ handle the mbox objects
#

# All mbox files are converted to Email::Simple objects and
# stuffed into @$mboxes which basically becomes a list of
# mbox objects.
#
my $mboxes = {};

# If --mailman <url> was specified, check that web page for
# mailman archive urls and then just add them to the $opts
# hash as if they were specified on the commandline with the
# --url option.
#
if ( defined $opts->{mailman} && $opts->{mailman} ) {
    find_archive_urls({ url => $opts->{mailman} });
} 

if ( defined $opts->{mbox} && ref $opts->{mbox} eq 'ARRAY' ) {

    # Handle --mbox arguments
    #
    for my $cur_mbox ( @{ $opts->{mbox} } ) {

        # Check and make sure that it's a readable file.
        #
        if ( ! -r $cur_mbox ) {
            warn "MBOX file " . $cur_mbox . " not readable...\n";
        }

        if ( $cur_mbox =~ m/\.gz$/ ) {

            my $extracted_file = $cur_mbox;

            $extracted_file =~ s/\.gz$//;

            gunzip( $cur_mbox, $extracted_file );

            $cur_mbox = $extracted_file;

        }

        $mboxes->{$cur_mbox}->{object} = Email::Folder->new( $cur_mbox )
            or dieright( "Error creating Mbox object...\n" );

        # Filename is stored as a separate key because .gz
        # files will end up with the filename being
        # different.
        #
        $mboxes->{$cur_mbox}->{filename} = $cur_mbox;

    }

} elsif ( defined $opts->{url} && $opts->{url} && ref $opts->{url} eq 'ARRAY' ) {

    # Handle --url commandline arguments
    #
    for my $cur_url ( @{ $opts->{url} } ) {

        my $mbox_filename = fetch_http_archive({ url => $cur_url });

        $mboxes->{$mbox_filename}->{object} = Email::Folder->new($mbox_filename)
            or dieright( "Error creating Mbox object...\n" );

        $mboxes->{$mbox_filename}->{filename} = $mbox_filename;

    }

} else {

    dieright( "No mbox files specified...\n" );

}

# If the list of mbox objects is empty, something went
# awry...
#
dieright( "No valid mbox files found...\n" )
    unless scalar keys %$mboxes > 0;

# }}}

# {{{ Create the ssl socket for connecting to gmail
#

if ( $opts->{wait} ) {

    print 'Waiting '
        . $opts->{wait}
        . " seconds before connecting to imap server...\n"
        if $opts->{verbose};

    sleep $opts->{wait};

}

my $ssl_socket = IO::Socket::SSL->new( Proto    => 'tcp',
                                       PeerAddr => $opts->{dest_server},
                                       PeerPort => $opts->{dest_port},)
    or warn "Problem creating ssl socket object: $@\n"
        . 'Error: '
        . IO::Socket::SSL::errstr()
        . "\n";


dbg( 'ssl_socket->dump_peer_certificate',
     $ssl_socket->dump_peer_certificate() ) if $opts->{debug};

# }}}

# {{{ Create the imap object for gmail
#
open( DBG, ">>debug.log" );

my $gmail = Mail::IMAPClient->new(
    User     => $opts->{dest_user},
    Password => $opts->{dest_pw},
    Clear    => 0,
    Uid      => 0,
    Fast_io  => 0,
    Debug    => 1,
    Debug_fh => *DBG,
    Socket   => $ssl_socket,
)
    or warn "Problem with new imap object: $@\n";

dbg( 'gmail',            $gmail                ) if $opts->{debug};
dbg( 'gmail->Connected', $gmail->Connected()   ) if $opts->{debug};
dbg( 'gmail->State()',   $gmail->State()       ) if $opts->{debug};
dbg( 'gmail->Status()',  $gmail->Status()      ) if $opts->{debug};

$gmail->select( $opts->{dest_folder} )
    or dieright( "Error Selecting imap folder "
                . $opts->{dest_folder}
                . ": $@\n" );

print "Selected folder: " . $opts->{dest_folder} . "\n";


# }}}

# Iterate all the mbox objects and copy them to imap mailbox
#
for my $mbox ( keys %$mboxes ) {

    # Grab the list of messages from the mbox file and start
    # iterating through them and copying them to the imap
    # folder.  @messages becomes a list of Email::Simple
    # objects.
    #
    my @messages = $mboxes->{$mbox}->{object}->messages();

    print scalar( @messages ) . " messages found in mbox file" . $mboxes->{$mbox}->{filename} . "\n";

    for my $message ( @messages ) {

        dbg( 'current_message', $message                      ) if $opts->{debug};
        dbg( 'current_message_as_string', $message->as_string ) if $opts->{debug};

        if ( $opts->{verbose} ) {

            print "\tCurrent message:\t"
                . $message->header('Subject')
                . "\n";

        } else {

            print '.';

        }

        # The date contained in the message headers must be
        # parsed and given to the
        # Mail::IMAPClient->append_string method, otherwise
        # the datestamp will not be preserved.
        #
        my $date = $message->header('Date');
        $date = UnixDate(ParseDate($date), "%d-%b-%Y %H:%M:%S %z");
        $date = "\"$date\"";

        dieright( "Gmail object vanished... check debug.log\n" )
            unless defined $gmail;

        my $error = 0;

        # Copy the current message up to the imap server
        # into the desired folder.
        #
        my $result = $gmail->append_string( $opts->{dest_folder}, $message->as_string, "", $date );

        dbg( 'result',     $result )     if $opts->{debug};
        dbg( 'ref_result', ref $result ) if $opts->{debug};

        if (defined $result && ! $result->LastError eq '' ) {
            warn "Error during append_string method: \n" . $result->LastError . "\n\n\n";
        } else {
            print "Deleting file " . $mboxes->{$mbox}->{filename} . "\n";
            unlink $mboxes->{$mbox}->{filename};
        }

        # Optionally introduce some delay.
        #
        if ( $opts->{sleep} ) {
            sleep $opts->{sleep};
        }

    }

    print "\n";

}

# Disconnect from the imap server
#
$gmail->disconnect()
    or dieright( "Error disconnecting from imap server...\n" );

# }}}

# {{{ subs
#


# {{{ fetch_http_archive
#
sub fetch_http_archive {

    my $args = shift;

    my $url = $args->{url};

    # If --url was specified, first make sure it looks like
    # a real url that we're expecting for a mailman archive.
    #
    unless ( $url =~ m/^http:/ && $url =~ m/(txt|mbox).gz$/ ) {
        dieright( "not a valid mailman archive url: $!\n" )
    }

    dieright( "Please install WWW::Mechanize Module: $@\n" )
        unless eval { require WWW::Mechanize; };

    my $mech = WWW::Mechanize->new();

    # Split up the components of the url so we can just get
    # the filename.
    #
    my $link = URI->new( $url );
    my $path = $link->path;
    my @filename = $link->path_segments( $link->path );
    my $gz_file = $filename[-1];

    # Chop off the .gz for the destination filename.
    #
    my $extracted_file = $gz_file;
    $extracted_file =~ s/\.gz$//;

    # Fetch the archive via the web
    #
    $mech->get( $url, ":content_file" => $gz_file )
        or dieright( "Unable to fetch: $url : $!\n" );


    # Decompress the .gz file
    #
    gunzip( $gz_file, $extracted_file );

    return $extracted_file;

} # }}}

# find_archive_urls {{{
#
sub find_archive_urls {

    my $args = shift;
    my $url  = $args->{url};

    die "Please install WWW::Mechanize Module: $@\n"
        unless eval { require WWW::Mechanize; };

    my $mech = WWW::Mechanize->new();

    $mech->get($url)
        or die "Unable to fetch: $url, $!\n";

    # This grabs a list of link objects from the webpage.
    #
    my @archives_obj =
        $mech->find_all_links( url_regex => qr/\.(mbox|txt)\.gz$/ );

    # Take all the link objects and turn them into regular
    # urls that will be past to the fetch_http_archive so
    # that WWW::Mechanize can get them.
    #
    for my $link (@archives_obj) {

        my $base     = $link->base()->as_string();
        my $filename = $link->url();

        push @{ $opts->{url} }, $base . $filename;

    }

    dbg( 'opts->url', $opts->{url} ) if $opts->{debug};

}    # }}}

# {{{ gunzip
#
sub gunzip {

    # Lifted from CPAN.pm CPAN::Tarzip::gunzip
    #

    dieright( "Unable to load Compress::Zlib module: $@\n" )
        unless eval { require Compress::Zlib; };

    dieright( "Unable to load FileHandle module: $@\n" )
        unless eval { require FileHandle; };

    my( $read, $write ) = @_;
    my($buffer,$fhw);

    $fhw = FileHandle->new(">$write")
	    or dieright("Could not open >$write: $!");

    my $gz = Compress::Zlib::gzopen($read, "rb")
	    or dieright("Cannot gzopen $read: $!\n");

    $fhw->print($buffer) while $gz->gzread($buffer) > 0 ;

    dieright("Error reading from $read: $!\n")
	    if $gz->gzerror != Compress::Zlib::Z_STREAM_END();

    $gz->gzclose() ;
    $fhw->close;

    unlink $read;

} # }}}

# diesig {{{
#
sub diesig {

    my $sig = @_;
    dieright( "Caught signal $sig\n" );

} # }}}

# dieright {{{
#
sub dieright {

    print @_;

    if ( defined $gmail ) {
        $gmail->disconnect();
    }

    close DBG;

    exit 1;

} # }}}

# dbg {{{
sub dbg {

    my $label = shift;

    if ( eval qq{ require Data::Dumper::Names; } ) {

        print "$label: \n" . Data::Dumper::Names::Dumper(@_) . "\n";

    } elsif ( eval qq{ require Data::Dump; } ) {

        print "Data::Dump $label = \n" . Data::Dump::dump(@_) . "\n";

    } elsif ( eval qq{ require Data::Dumper; } ) {

        no warnings qw/once/;
        $Data::Dumper::Indent   = 3;
        $Data::Dumper::Useqq    = 1;
        $Data::Dumper::Purity   = 1;
        $Data::Dumper::Varname  = $label;
        $Data::Dumper::Varname .= '_';
        use warnings;

        print Data::Dumper::Dumper(@_) . "\n";

    }

} # }}}




# }}}

__END__

# {{{ END
#
#

=pod

=head1 NAME

gmailarchiver.pl - Archive mbox mailing list archives

=head1 SCRIPT CATEGORIES

Mail

=head1 README

Use this script to copy files from an mbox mailing list archive
(i.e. Mailman) to the specified folder in your gmail account.

=head1 OSNAMES

any

=head1 PREREQUISITES

C<Mail::IMAPClient> >= 3.10

C<IO::Socket::SSL>

C<Email::Folder>

C<Email::Folder::Mbox>

C<Date::Manip>

=head1 COREQUISITES

C<WWW::Mechanize> - to fetch Mailman (mbox format) archives

=head1 SYNOPSIS

=head2 OPTIONS AND ARGUMENTS

=over 15

=item B<--url>

Url to a specific mailing list archive file.  Multiple urls are allowed.

=item B<--mbox>

Filename of an mbox file to import.  Multiple mbox files are allowed.

=item B<--mailman> ( B<--archive_page> )

Url to a webpage containing links to mbox archives.  Links of list
archives must match .(mbox|txt).gz to be retrieved.

=item B<--server>

Specify the destination imap server hostname.  Default is imap.gmail.com.

=item B<--port>

Specify the destination imap server SSL port.  Default is 993.

=item B<--user>

The username used to authenticate with the imap server.

=item B<--pw>

The password used to authenticate with the imap server.

=item B<--folder>

The destination folder on the imap server.  The folder must already exist.

=item B<--sleep>

Introduce some delay between copying individual messages to the
destination imap folder.

=item B<--wait>

Introduce some delay before authenticating to the imap server.  Useful
if you're using xargs to specify numerous archive files.

=item B<--debug>

Print lots of ugly debugging output.

=item B<--verbose>

Print some extra information during processing.

=back

=head2 EXAMPLES

=over 15

=item C<gmailarchiver.pl> C<--url> 
I<http://lists.bestpractical.com/pipermail/rt-users/2004-July.txt.gz>
C<--folder> I<rt-users>

Copy all messages from the July 2004 archive of the RT-Users mailing list.

=back

=head1 ACKNOWLEDGEMENTS

Built largely using Mail::IMAPClient by E<lt>L<MARKOV@cpan.org|mailto:mark@overmeer.net>E<gt>
and the Email::Folder modules by E<lt>L<RJBS@cpan.org|mailto:rjbs@cpan.org>E<gt>

Lifted the gunzip routine from CPAN.pm written by
Andreas Koenig E<lt>L<andreas.koenig@anima.de|mailto:andreas.koenig@anima.de>E<gt>

=cut


