#! D:/Perl/bin/perl.exe

use strict;
use warnings;
use URI;
use Encode;
use LWP::Simple;
use Web::Scraper;
use HTML::TreeBuilder;
use constant base => 'http://hi.baidu.com/';

# Turn the string held in $_ into something usable as a file or directory
# name. The work is done on $_ in place, so callers invoke this with $_
# aliased to the string they want cleaned (e.g. inside a foreach loop).
#   - whitespace and the characters " ' ? * : | < > are removed
#   - forward and backward slashes become underscores
sub clear_name {
	s{[\s"'?*:|<>]}{}g;
	s{[/\\]}{_}g;
}

# The blog owner's user id is the one and only command-line argument.
# It must come from the command line: assigning to @ARGV here would make
# the argument check below meaningless.
if (@ARGV != 1) {
	print STDERR "Usage: perl  hi_baidu.pl  user_id\n";
	exit(1);
}

my $id = shift @ARGV;  # e.g. fandywang_jlu
my $url	= base . $id . '/blog';   # front page of this user's blog
my $article_per_page = 5;         # default page size; re-estimated once the front page is scraped

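# CSS selector notation used by the scrapers below:
#   '#name' selects an element by id, '.name' selects elements by class.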

# Scraper for the blog's front page. It collects, as parallel lists:
#   category_num      - text of every category entry (parsed later for the "(N)" count)
#   category_link     - href of every category link
#   category_name     - text of every category link
#   _article_per_page - text of every article title link on the page
#                       (only the number of entries is used later)
my $worker = scraper {
	process '#m_artclg .item',   'category_num[]'      => 'TEXT';
	process '#m_artclg .item a', 'category_link[]'     => '@href';
	process '#m_artclg .item a', 'category_name[]'     => 'TEXT';
	process '#m_blog .tit a',    '_article_per_page[]' => 'TEXT';
};

print "start downloading category info... ";
my $blog_uri = URI->new($url);
my $res      = $worker->scrape($blog_uri);
print "done\n";

# Estimate the blog's page size from the number of article titles found on
# the front page, rounding up to 5, 10 or 15 (presumably the page sizes the
# site offers -- TODO confirm). A front page with fewer than five articles,
# or with none at all (the key is then absent from the result), falls back
# to 5 so that the page size is never zero.
my $front_page_titles = scalar @{ $res->{_article_per_page} || [] };
if    ($front_page_titles > 10) { $article_per_page = 15; }
elsif ($front_page_titles > 5)  { $article_per_page = 10; }
else                            { $article_per_page = 5;  }
print "article_per_page: ", $article_per_page, "\n";

# Category names, converted from Perl character strings to GB2312 bytes so
# they can be used in paths and printed.
my @category_name = map { encode('gb2312', $_) } @{ $res->{category_name} || [] };

# For every category, work out the index of its last listing page
# (pages are requested later as 0 .. that index).
# Each scraped entry is expected to contain the article count as "(N)".
my @category_num = ();
foreach my $entry (@{ $res->{category_num} || [] }) {
	if ($entry =~ m/\((\d+)\)/) {
		my $article_count = $1;
		# ($article_count - 1) / page size: a category whose size is an exact
		# multiple of the page size must not get an extra, empty page.
		push @category_num,
			$article_count > 0 ? int( ($article_count - 1) / $article_per_page ) : 0;
	} else {
		print "category_num extract error!\n";
		# Still record an entry so this list stays in step with the
		# category names and links; only the first page will be fetched.
		push @category_num, 0;
	}
}

# Create the output tree: one directory for the user, one sub-directory per
# category. Mode 0777 is filtered through the process umask; a mode without
# owner permissions would make the directories unusable on Unix-like systems.
# Existing directories are reused, other failures are reported.
-d $id or mkdir($id, 0777) or warn "can't create directory $id: $!\n";
foreach (@category_name) {
	# clear_name() rewrites $_ in place, so @category_name itself ends up
	# holding the sanitized names that are used as directory names.
	clear_name();
	my $category_dir = "$id/$_";
	-d $category_dir
		or mkdir($category_dir, 0777)
		or warn "can't create directory $category_dir: $!\n";
}


# Download every article of every category to "$id/<category>/<title>.html".
# For each category the listing pages "<category link>/index/0" up to
# "/index/<last page>" are scraped for article links and titles, then each
# article is fetched and written out encoded as GB2312.
# Any fatal error stops the download; it is reported instead of being
# silently discarded.
eval {
	# One scraper is enough for all listing pages: it collects the link and
	# the title of every article shown on a page.
	my $page_worker = scraper {
		process "#m_blog .tit a", "page_link[]" => '@href', "title[]" => 'TEXT';
	};

	foreach my $cate (@category_name) {
		# Page counts and links are consumed in step with the category names.
		my $cnt       = shift @category_num;
		my $cate_link = shift @{ $res->{category_link} ||= [] };
		$cnt = 0 unless defined $cnt;    # unknown page count: fetch the first page only
		unless (defined $cate_link) {
			warn "no link found for category $cate, skipped\n";
			next;
		}

		for my $i (0 .. $cnt) {
			my $link     = $cate_link . "/index/" . $i;
			my $page_res = $page_worker->scrape( URI->new($link) );

			# Either key may be missing when a page lists no articles.
			my @page_link = @{ $page_res->{page_link} || [] };
			my @title     = map { encode("gb2312", $_) } @{ $page_res->{title} || [] };

			foreach my $title (@title) {
				# Always consume the link so links and titles stay paired.
				my $page_link = shift @page_link;

				clear_name() for $title;    # clear_name() sanitizes $_ in place
				my $file = "$id/$cate/$title.html";
				print "$file\n";

				# Fetch before opening the file so that a failed download
				# does not leave an empty file behind.
				my $html = defined $page_link ? get($page_link) : undef;
				unless (defined $html) {
					warn "can't download $file, skipped\n";
					next;
				}

				open my $out, '>', $file or die "can't write $file: $!.\n";
				print {$out} encode("gb2312", $html);
				close $out or die "can't write $file: $!.\n";
			}
		}
	}
	1;
} or warn "download aborted: $@";