#!/usr/bin/perl

##################################################################
# Author:  Brent Hughes
#
# Date:    11/8/03
#
# Program: rget-links.pl
#
# Purpose: This program retrieves links from web pages
#          recursively. That may not seem terribly useful at
#          first, but by feeding the output to grep one can
#          easily acquire a list of files matching a certain
#          pattern. Automate lwp-download over the resulting
#          list and you'll see my point. This method can easily
#          be used to acquire thousands of movie files, images,
#          or whatever else you can think of.
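#
# Example: a sketch of the pipeline described above. The pattern,
#          URL, and file names here are placeholders; lwp-download
#          ships with libwww-perl:
#
#            rget-links.pl --depth=2 http://example.com > links.txt
#            grep -i '\.jpg$' links.txt > images.txt
#            while read url; do lwp-download "$url"; done < images.txt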
#
# Bugs:    Please report any bugs to brent_hughes_@hotmail.com
#          Also feel free to comment on the program's function
#          and/or propose additional features.
#
##################################################################

use warnings;
use strict;

package RGetLinks;

use LWP::UserAgent;
use HTTP::Request;     # used directly below for the HEAD/GET requests
use HTML::LinkExtor;
use URI::URL;
use Getopt::Long;

$| = 1;   # disable output buffering so links appear as they are found

# global data for this program
my $depth;
my %files;

# command line options
my $opt_depth = 3;

# retrieve command line options
my $options = GetOptions ("depth=i" => \$opt_depth);  # numeric

# acquire url from command line
my $url = shift;

# abort if the options are improperly formatted or no URL was given
if(!$options || !defined $url){ usage(); }

# program enters actual processing at this point
rgetlinks($url,$opt_depth);


#################################################################
#  Subroutines


# A routine to get links recursively 
sub rgetlinks
{
	my($url,$maxdepth) = @_;
	chomp($url);

	# initialize globals
	$depth = 0;
	%files = ();

	# descend
	rgetlinkshelper($url,$maxdepth);		
}

# A helper routine to get links recursively 
sub rgetlinkshelper
{
	my($url,$maxdepth) = @_;

	# return if too deep or already been here
	if($depth >= $maxdepth || defined $files{$url})
	{   
		return;
	}
	else
	{
		# drop down a level and add the file to the hash
		$depth++; $files{$url} = 1;

		# show our current location, indented by recursion depth
		print ' ' x $depth;
		print $url, "\n";

		# retrieve all links
		my @links = getlinks($url);

		# recursive step
		foreach(@links){ rgetlinkshelper($_,$maxdepth); } 

		# pop up a level
		$depth--;
	}
}

# A routine to return all links from a URL.
# This routine was borrowed almost verbatim from an example program.
# However, I did optimize it to only retrieve links from text/html
# files. The program was trying to retrieve links from large movie
# files. That didn't work too well and took up a lot of computation time.

my @links = ();   # file-scoped so that callback() below can push into it

sub getlinks
{
	my($url) = @_;
	my $ua = LWP::UserAgent->new;
	
	# Make the parser.  Unfortunately, we don't know the base yet
	# (it might be different from $url)
	@links = ();
	my $p = HTML::LinkExtor->new(\&callback);

	# Look at the header to determine what type of document we have
	my $headreq = HTTP::Request->new(HEAD => $url);
	my $headres = $ua->request($headreq); 
	my $type    = $headres->header('content-type');

	# only parse the document for links if it is a text or html document
	if(defined $type && $type =~ /text|html/)
	{
		# Request document and parse it as it arrives
		my $getreq = HTTP::Request->new(GET => $url);
		my $getres = $ua->request($getreq, sub{ $p->parse($_[0])});

		# Expand all URLs to absolute ones
		my $base = $getres->base;
		@links = map { url($_, $base)->abs } @links;
	}
	
	# Return the links
	return @links;
}
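
# For example, a hypothetical one-off use of getlinks() by itself,
# outside the normal recursive flow:
#
#     foreach my $link (getlinks('http://www.perl.org')) { print "$link\n"; }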

# Set up a callback that collects links
sub callback {
	my($tag, %attr) = @_;

	return if $tag ne 'a';  # we only look closer at <a ...>
	push(@links, values %attr);
}
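
# Note: HTML::LinkExtor passes only link attributes to the callback,
# so for <a> tags values %attr is effectively just the href. A more
# explicit sketch (equivalent in practice) would be:
#
#     push(@links, $attr{href}) if defined $attr{href};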

# A routine to provide instructions
sub usage
{
	# strip any leading path to get the bare program name
	my $progname = $0;
	$progname =~ s{.*[\\/]}{};

	# show instructions
	print   "\nUsage:\n\t\t", 
		$progname, " [args] target-url > output-file\n\n",
		"Example:\n\t\t", 
		$progname, " --depth=3 http://www.perl.org\n\n";

	print   "Options\n", "=======\n", 
		
		"--depth\t\t", 
		"The maximum depth of links to traverse (default = 3)\n";

	exit();
}
