print "UMass CMPSCI 546 Web Crawler Project\nAuthor: Dan Heller \n";
print "Arg1 - element to add to queue \n";
print "Arg2 - number of pages to crawl \n";
print "Arg3 - allowed crawling domain \n";
use LWP::RobotUA;
use HTML::LinkExtor;
use HTTP::Request;
use HTML::HeadParser;
use URI;
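# Crawl state: @url is the FIFO queue of discovered urls,
# $queueStartPointer is the index of the next url to fetch,
# and $pagesCrawled counts pages fetched successfully.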
$pagesCrawled = 0;
$queueStartPointer = 0;
if($#ARGV > -1 )
{
addToQueue($ARGV[0]);
}
$agent = LWP::RobotUA->new('UMass-cs546bot','dheller@edlab.cs.umass.edu');
$agent -> delay(.01);
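# LWP::RobotUA consults each site's robots.txt before requesting pages;
# delay() is specified in minutes, so .01 is roughly 0.6 seconds between
# requests to the same host.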
open (CP,">>CrawledPages");
print CP ;
while($queueStartPointer <= $#url && $pagesCrawled < $ARGV[1])
{
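# Take the next url off the queue and break it into its components;
# the scheme and authority are needed to locate the site's robots.txt.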
print("Entering while loop \n");
$current = removeFromQueue();
print CP "\n";
print CP $current;
print("current: $current \n");
print CP "\n";
print CP "\n";
$currentURL = URI -> new($current);
$currentAuth = $currentURL -> authority();
print("\$currentURL -> authority: $currentAuth \n");
$currentScheme = $currentURL -> scheme();
print("\$currentScheme: $currentScheme \n");
$currentFragment = $currentURL -> fragment();
print("\$currentFragment: $currentFragment \n");
$currentPath = $currentURL -> path();
print("\$currentPath: $currentPath \n");
$currentRobotLoc = $currentScheme . "://" . $currentAuth .
"/robots.txt";
print("\$currentRobotLoc: $currentRobotLoc \n");
#$robotReq = HTTP::Request -> new(GET => "$currentRobotLoc");
#$res = $agent -> request($robotReq);
#$robotAllowed = 0;
# if robots.txt not found, assume it's okay to proceed
#if($res -> is_success())
#{
# print("$currentRobotLoc found. VLoading robots.txt \n");
#$rules -> parse($res -> content);
#}
#else
#{
# print("$currentRobotLoc not found \n");
#}
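# Note: LWP::RobotUA already fetches and obeys robots.txt for each
# request, so the manual check above is left commented out.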
$req = HTTP::Request -> new(GET => "$current");
$res = $agent -> request($req);
if($res -> is_success() ) {
print("success!\n");
# see if there is a robots meta tag header
# and save the tag in a temporary variable if there is
#my $headers = $res->headers();
#my $robotsTag = $headers -> header('Meta');
#print("\$robotsTag $robotsTag \n");
my $headParser = HTML::HeadParser -> new;
$headParser -> parse($res->content);
my $robots = $headParser -> header('X-Meta-Robots');
print("robots meta tag from head section: $robots \n");
# if there is no meta tag forbidding indexing
# then save the contents of the page to the output file
if( $robots =~ m/noindex/i )
{
print("received noIndex header! not even saving! \n");
}
else
{
print CP $res->content;
}
print CP "\n";
$le = HTML::LinkExtor->new(undef, $res->base);
$le -> parse($res -> content);
$le -> eof;
@links = $le -> links();
print CP "n";
# unless a robots meta tag disallows it
# save appropriate links into queue
if( $robots =~ m/nofollow/i )
{
print("received a noFollow header. not queueing links \n");
}
else
{
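# Each element of @links is [tag, attribute => url, ...]; only
# urls that came from an href attribute are queued.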
foreach(@links)
{
($tag,%l) = @{$_};
foreach(keys %l)
{
my $urllink = $l{$_}->canonical;
print CP "$urllink\n";
if($_ eq "href")
{
addToQueue($urllink);
}
print CP "$_ \n";
}
}
}
print CP "\n";
$pagesCrawled = $pagesCrawled + 1;
}
else {
print("failure... \n");
print($res -> error_as_HTML);
print("\n");
}
print("\$pagesCrawled: $pagesCrawled \n");
print("end while loop \n--------------- \n\n");
}
print CP "\n";
print CP $#url;
print CP "\n";
print CP " \n";
print("\n");
print("\$queueStartPointer: $queueStartPointer \n");
# entries still waiting sit at indices $queueStartPointer .. $#url
$queueSize = $#url - $queueStartPointer + 1;
print("size of queue: $queueSize \n");
# parameter: one element (url) to add to the queue
# returns: true if the url was queued; false when checksOut
# rejects it (disallowed domain, non-html, duplicate, or non-http)
sub addToQueue
{
print("in addToQueue \n");
my $added = 0;
if(checksOut($_[0]))
{
$url[$#url+1] = $_[0];
$added = 1;
}
print("leaving addToQueue \n");
return $added;
}
# no arguments; returns the element at the front of the queue
# and advances the start pointer (FIFO)
sub removeFromQueue
{
$retValue = $url[$queueStartPointer];
# $#url = $#url - 1;
$queueStartPointer = $queueStartPointer + 1;
return $retValue;
}
# argument: a url
# returns true only if the url is in the allowed domain (Arg3),
# ends in .html, is not already queued, and uses the http scheme
sub checksOut
{
print("Entering checksOut routine \n");
my $retValue = 0;
# strip off interior "/.." segments from the url
$_[0] =~ s/\/\.\.//g;
# slashed[2] is the host name if the url is http://name.sub.domain/morestuff
@slashed = split('/',$_[0]);
if($slashed[2] =~ m/$ARGV[2]/)
{
print("valid \n");
$retValue = 1;
}
print("argument: ");
print("$_[0]");
print("\n");
my @dotSplit = split('\.',$_[0]);
my $suffix = $dotSplit[$#dotSplit];
print("suffix: ");
print($suffix);
print("\n");
if( ($suffix eq "html" ) && $retValue eq 1)
{
$retValue = 1;
}
else
{
$retValue = 0;
}
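# reject the url if it is already in the queue (linear duplicate scan)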
my $candidate = $_[0];
foreach(@url)
{
if($candidate eq $_)
{
$retValue = 0;
}
}
# RobotRules seems to have a bug in handling umass robots.txt
# so, skip for now
my $candidateURL = URI -> new($candidate);
if($candidateURL -> authority eq "www.cs.umass.edu")
{
print CP "removing cs.umass.edu from queue till bug is fixed \n";
$retValue = 0;
}
if($candidateURL -> scheme() ne "http")
{
print CP "non http scheme: ";
print CP $candidateURL->scheme();
print CP "\n";
$retValue = 0;
}
print("exiting checksOut subroutine: \$retValue -> $retValue \n");
return $retValue;
}