#!/usr/local/bin/perl  -w

=head1 NAME

timefetch.pl -  Fetch a web page and time how long it takes

=head1 SYNOPSIS

C<timefetch.pl [-dhjrv] [-f host] http://url [...]>

C<timefetch.pl -h   (for help message)>

=head1 README

This simple script uses LWP and Time::HiRes to fetch a web page,
parse its embedded IMG, APPLET, and FRAME tags, download the
corresponding files, and time the whole operation.

Depending on the command line options you use, it can just download
a single HTML page or recursively download all embedded images and
applets, and tally a grand total download time and total download size.  
This script always downloads embedded frames in framesets. 

This script does not implement a smart browser.   It basically does
two kinds of benchmarks:  best-case scenario and absolute worst-case
scenario.  The best-case scenario is when a browser downloads just the
contents of a single HTML page, and does not download any images, 
because they are in the cache.  The worst-case scenario is when 
the browser must download each and every image, one after the other. 
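
As an illustration (with made-up numbers), for a page consisting of
a 10kb HTML file and five 20kb images, the best-case run measures
only the single 10kb fetch, while the worst-case run measures all
six fetches, roughly 110kb in total.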

Neither of these scenarios exactly matches what you actually
experience with a browser.  However, these numbers are an objective
measurement of a repeatable and well-defined process.  You can use
them for straightforward comparison of the download times of 
two different web pages, or chart the download time of a single
page during different kinds of loads and traffic patterns. 

One extra feature in this script is the "force-host" feature.  
You can force this script to munge each URL before fetching it.  
This is useful in a situation where the URLs in your HTML pages all
contain a load-balanced hostname, but you want to target a specific
member host, or compare two member hosts.  
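
For example, with a hypothetical member host C<web01.domain.com>,
running with C<-f web01.domain.com> rewrites a URL such as

   http://www.domain.com/img/logo.gif

into

   http://web01.domain.com/img/logo.gif

before fetching it.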

This script is NOT a spider.  It will only fetch the inline images
and applets contained in a single web page.  However, it will download 
component frames in a frameset down to arbitrary levels.  

=head1 EXAMPLES

C<timefetch.pl -rv http://www.domain.com>

C<timefetch.pl http://www.domain.com>

C<timefetch.pl http://www.domain.com/~john/page.html>
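
A hypothetical example of the force-host feature, where
C<web01.domain.com> stands in for one member of a load-balanced pool:

C<timefetch.pl -rv -f web01.domain.com http://www.domain.com>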

=head1 PREREQUISITES

This script requires C<LWP>, C<HTML::LinkExtor>, C<URI::URL>
and C<Time::HiRes>.  As written, it requires Perl 5.004,
but in a pinch you could modify it to run on earlier versions of Perl 5.

=head1 COPYRIGHT

Copyright (c) 1998 John Nolan. All rights reserved.
This program is free software.  You may modify and/or
distribute it under the same terms as Perl itself.
This copyright notice must remain attached to the file.

=head1 REVISION

$Id: timefetch.pl,v 1.6 1998/11/05 02:53:30 john Exp $

=head1 SCRIPT CATEGORIES

Web/Utilities

=cut
# ---------------------------------------------------------------------------


my $VERSION = 0.20;

$|++;       # Turn on autoflush
use 5.004;  # We will use "for my"


# ---------------------------------------------------------------------------
# Load modules
#
use Time::HiRes;
use HTTP::Request;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;   
use Getopt::Std;
use strict;

# ---------------------------------------------------------------------------
# Define global variables and objects up front
#
my ($Ua, $Pa, @Urls, $TotalTime, $TotalSize, $RealUrl);
my ($Debug, $Recursively, $Verbose, $Java, $ForceHost);


# This variable will remember whether we had to recurse
# in order to download frames.  It will contain a list of
# the *types* of files which we downloaded additionally
# (e.g., img, applet).
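# For instance, after a recursive (-r) fetch of a frameset page whose
# frames contain inline images, it might end up as " frame img".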
#
my $DidRecurse;

# These will hold lists of urls, along with time and size data. 
#
my %Cache;
my @SummaryCache;



# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
#
sub Usage {

	return "Usage: $0 [dhjrv] [-f host] http://url [http://url ... ]\n";
}


# ---------------------------------------------------------------------------
#
sub PrintHelpMsg {

	print Usage();

	print <<EOM;

 -r 	Recursive: download all images and calculate cumulative time
 -v 	Verbose: print out URLs as they are downloaded
 -d 	Debug: view all kinds of marginally useful output
 -h 	Help: print this help message
 -j 	Java: download java applets as well
 -f 	Force host: before doing recursive downloads, munge each URL
	and replace the host in the URL with some other host. 

	NOTE:  This program always downloads embedded frames and prints
	a cumulative total for frames and framesets, even if you did not
	specify a recursive download.   

EOM
	exit(0);
}


# ---------------------------------------------------------------------------
#
sub GetParams {

	getopts('df:hjrv') or die Usage();
	use vars qw( $opt_d $opt_r $opt_v $opt_j $opt_f $opt_h);

	$Debug       = ($opt_d ?     1  :    0 );
	$Recursively = ($opt_r ?     1  :    0 ); 
	$Verbose     = ($opt_v ?     1  :    0 );
	$Java        = ($opt_j ?     1  :    0 );
	$ForceHost   = ($opt_f ? $opt_f : undef);

	$Verbose = 1 if $Debug;

	PrintHelpMsg() if $opt_h;

	# If there are no URLs on the command line, bolt.
	#
	die Usage() unless @ARGV;

	print "Forcing host $ForceHost .\n" if ($Verbose and $ForceHost);

	# Take URLs from the command line
	#
	return @ARGV;
}



# ---------------------------------------------------------------------------
# Print routine for nicely-formatted output
#
sub PrintUrl {

	my $url = (shift or die "Internal routine PrintUrl expects param!\n");

	print sprintf ("%6.3f %5.1fkb: %s\n",
		$Cache{$url}->{TIME},
		$Cache{$url}->{SIZE}/1000,
		$url);
}


# ---------------------------------------------------------------------------
# Fetch a URL and time how long it takes
#
sub FetchUrl {

	my $url = (shift or die "Internal routine FetchUrl expects param!\n");

	print "Fetching: $url\n" if $Debug;

	my $req = HTTP::Request->new (GET => $url);

	# Temporarily turn off warnings.  The bowels of LWP emit an
	# obscure warning here, which I don't know how to troubleshoot.
	#
	$^W = 0;

	my $start  = Time::HiRes::gettimeofday();
	my $res    = $Ua->request ($req);
	my $finish = Time::HiRes::gettimeofday();

	$^W = 1;   # Turn warnings back on again. 

	# Calculate stats

	my $size = length $res->content;
	my $time = ($finish - $start);

	return ($time,$size,$res);
}


# ---------------------------------------------------------------------------
# HandleParsedLink
#
# This is a callback provided to HTML::LinkExtor for handling HTML links
# found during parsing.  It is called once for each tag that contains
# links, with $tag and %links passed as parameters.  $tag is the name of
# the HTML tag where the links were found, and %links is a hash of the
# attribute/value pairs from that tag whose values are URLs.
#
# For example, if an HTML anchor was found, the $tag would be "a" 
# and %links would be (href=>"url").  We check each URL in %links.  
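# Similarly, for a frame tag such as <frame src="menu.html"> (a made-up
# example), $tag would be "frame" and %links would be (src=>"menu.html").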
#  
sub HandleParsedLink {

	my ($tag, %links) = @_;
	my ($time, $size, $res, $urlobj, $url);

	# Flag for deciding whether we want to download a given URL
	#
	my $we_want_this_url = 0;

	print "\$tag = $tag\n" if $Debug;

	$we_want_this_url = 1 if $Recursively and $tag =~ /img/; 
	$we_want_this_url = 1 if $Java        and $tag =~ /applet/; 
	$we_want_this_url = 1 if                  $tag =~ /frame/; 

	return unless $we_want_this_url;

	# Examine the tag, and fetch its target if necessary. 
	#
	for my $key (keys %links) {

		print "$tag: $key:  $links{$key}\n" if $Debug;

		# Get the absolute URL
		#
		$urlobj = URI::URL->new( $links{$key}, $RealUrl );
		$url = $urlobj->abs;

		# Force a particular host by munging the actual URL 
		#
		$url =~ s#http://[^/]+#http://$ForceHost# if $ForceHost;

		# Remove any in-page anchor tags. 
		#
		$url =~ s/^([^#]*)#.*$/$1/;

		print "Examining url $url\n" if $Debug;

		# Process each URL that we have not seen before
		#
		unless (exists $Cache{$url}) {
			
			($time,$size,$res) = FetchUrl($url);

			$TotalSize += $size;
			$TotalTime += $time;

			$Cache{$url} = { TIME => $time, SIZE => $size };

			PrintUrl($url) if ($Verbose);

			# Keep a list of each *type* of item 
			# that we have recursively downloaded
			#
			$DidRecurse .= " $tag" unless $DidRecurse =~ /$tag/;

			# If this was a frame tag, then recurse into it. 
			#
			$Pa->parse($res->content) if $tag =~ /frame/;
		}
	}
	1; # Return a nice happy true value. 
}

# ---------------------------------------------------------------------------
# Process each URL in the list. 
#
sub ExamineUrls {

	my @urls = @_;

	for my $url (@urls) {

		my ($size,$time,$res);

		$TotalTime = 0;
		$TotalSize = 0;
		$RealUrl = "";
		$DidRecurse = ""; 

		# Force a particular host by munging the actual URL 
		#
		if ($ForceHost) {

			$url =~ s#http://[^/]+#http://$ForceHost#;
		}

		print "Examining url $url\n" if $Debug;

		# We make two fetches, no matter what.  
	
		# First fetch.  This fetch will traverse redirects, so it is added
		# to the total stats for the main url.  We only care about the total 
		# if we are doing a recursive download. As a side effect, after traversing 
		# redirects, we learn the true URL of the page ($RealUrl). 

		# Fetch the URL! 
		#
		($time,$size,$res) = FetchUrl($url);

		if ($Recursively) {

			$TotalSize += $size;
			$TotalTime += $time;
		}

		# Now remember the URL we were redirected to.
		# NOTE this call to base() must come before we invoke 
		# the parse function.  Otherwise base() may croak. 
		#
		$RealUrl = $res->base(); 

		# Analyze the HTML we got back, and extract links. 
		# The handler HandleParsedLink will download each 
		# linked image, if we are doing a recursive download. 
		# It will also download framesets, and parse them recursively.  
		#
		$Pa->parse($res->content); 

		# Force a particular host by munging the actual URL 
		#
		$RealUrl =~ s#http://[^/]+#http://$ForceHost# if $ForceHost;

		print "Examining RealUrl $RealUrl\n" if ($Debug);

		# Second fetch.  This fetch does not involve any redirects, 
		# because we are hitting the real URL directly. The data 
		# is not tallied into the total for the main URL.  But we get 
		# the data anyway, so we have it for just that URL alone.  
		# We will need to report these values in every case. 

		unless (exists $Cache{ $RealUrl }) {

			($time,$size) = FetchUrl( $RealUrl );
	
			# Tally and store stats
			#
			$TotalSize += $size;
			$TotalTime += $time;
			$Cache{ $RealUrl } = { TIME => $time, SIZE => $size };
		}

		push @SummaryCache, $RealUrl;

		if ($Recursively or $DidRecurse) {

			# Store the whole recursive fetch with a special URL string. 
			#
			my $recursiveUrl =  "$url (incl.:$DidRecurse)";

			$Cache{$recursiveUrl} = { TIME => $TotalTime, SIZE => $TotalSize };

			push @SummaryCache, $recursiveUrl;
		}
	}
}


# ---------------------------------------------------------------------------
#
sub PrintFinalResults {

	my $Separator = "----------------------------------------------------\n";

	print $Separator if ($Verbose or $Debug);

	for my $url (@SummaryCache) {

		PrintUrl($url);
	}
}


# ---------------------------------------------------------------------------
# MAIN LOGIC
# ---------------------------------------------------------------------------

@Urls = GetParams();

# Set up a browser  
#
$Ua = LWP::UserAgent->new;
$Ua->agent("Mozilla (compatible: LWP $LWP::VERSION)");
$Ua->timeout(30);

# Set up a parser 
#
$Pa = HTML::LinkExtor->new (\&HandleParsedLink);

ExamineUrls( @Urls );

PrintFinalResults();

# ---------------------------------------------------------------------------
# END
# ---------------------------------------------------------------------------

1;
