print "UMass CMPSCI 546 Web Crawler Project\nAuthor: Dan Heller \n";
print "Arg1 - element to add to queue \n";
print "Arg2 - number of pages to crawl \n";
print "Arg3 - allowed crawling domain \n";
use LWP::RobotUA;
use HTML::LinkExtor;
use HTTP::Request;
use HTML::HeadParser;
use URI;
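# Crawl state: @url is the FIFO queue of discovered urls,
# $queueStartPointer is the index of the next url to fetch,
# and $pagesCrawled counts pages fetched successfully.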
$pagesCrawled = 0;
$queueStartPointer = 0;
if($#ARGV > -1 )
{
addToQueue($ARGV[0]);
}
$agent = LWP::RobotUA->new('UMass-cs546bot','dheller@edlab.cs.umass.edu');
$agent -> delay(.01);
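# LWP::RobotUA consults each site's robots.txt before requesting pages;
# delay() is specified in minutes, so .01 is roughly 0.6 seconds between
# requests to the same host.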
open (CP,">>CrawledPages");
print CP ;
while($queueStartPointer <= $#url && $pagesCrawled < $ARGV[1])
{
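# Take the next url off the queue and break it into its components;
# the scheme and authority are needed to locate the site's robots.txt.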
print("Entering while loop \n");
$current = removeFromQueue();
print CP "\n";
print CP $current;
print("current: $current \n");
print CP "\n";
print CP "\n";
$currentURL = URI -> new($current);
$currentAuth = $currentURL -> authority();
print("\$currentURL -> authority: $currentAuth \n");
$currentScheme = $currentURL -> scheme();
print("\$currentScheme: $currentScheme \n");
$currentFragment = $currentURL -> fragment();
print("\$currentFragment: $currentFragment \n");
$currentPath = $currentURL -> path();
print("\$currentPath: $currentPath \n");
$currentRobotLoc = $currentScheme . "://" . $currentAuth .
"/robots.txt";
print("\$currentRobotLoc: $currentRobotLoc \n");
#$robotReq = HTTP::Request -> new(GET => "$currentRobotLoc");
#$res = $agent -> request($robotReq);
#$robotAllowed = 0;
# if robots.txt not found, assume it's okay to proceed
#if($res -> is_success())
#{
# print("$currentRobotLoc found. VLoading robots.txt \n");
#$rules -> parse($res -> content);
#}
#else
#{
# print("$currentRobotLoc not found \n");
#}
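# Note: LWP::RobotUA already fetches and obeys robots.txt for each
# request, so the manual check above is left commented out.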
$req = HTTP::Request -> new(GET => "$current");
$res = $agent -> request($req);
if($res -> is_success() ) {
print("success!\n");
# see if there is a robots meta tag header
# and save the tag in a temporary variable if there is
#my $headers = $res->headers();
#my $robotsTag = $headers -> header('Meta');
#print("\$robotsTag $robotsTag \n");
my $headParser = HTML::HeadParser -> new;
$headParser -> parse($res->content);
my $robots = $headParser -> header('X-Meta-Robots');
print("robots meta tag from head section: $robots \n");
# if there is no meta tag forbidding indexing
# then save the contents of the page to the output file
if( $robots =~ m/noindex/i )
{
print("received noIndex header! not even saving! \n");
}
else
{
print CP $res->content;
}
print CP "\n";
$le = HTML::LinkExtor->new(undef, $res->base);
$le -> parse($res -> content);
$le -> eof;
@links = $le -> links();
print CP "n";
# unless a robots meta tag disallows it
# save appropriate links into queue
if( $robots =~ m/nofollow/i )
{
print("received a noFollow header. not queueing links \n");
}
else
{
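# Each element of @links is [tag, attribute => url, ...]; only
# urls that came from an href attribute are queued.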
foreach(@links)
{
($tag,%l) = @{$_};
foreach(keys %l)
{
my $urllink = $l{$_}->canonical;
print CP "$urllink\n";
if($_ eq "href")
{
addToQueue($urllink);
}
print CP "$_ \n";
}
}
}
print CP "\n";
$pagesCrawled = $pagesCrawled + 1;
}
else {
print("failure... \n");
print($res -> error_as_HTML);
print("\n");
}
print("\$pagesCrawled: $pagesCrawled \n");
print("end while loop \n--------------- \n\n");
}
print CP "\n";
print CP $#url;
print CP "\n";
print CP " \n";
print("\n");
print("\$queueStartPointer: $queueStartPointer \n");
# entries still waiting sit at indices $queueStartPointer .. $#url
$queueSize = $#url - $queueStartPointer + 1;
print("size of queue: $queueSize \n");
# parameter: one element (url) to add to the queue
# returns: true if the url was queued; false when checksOut
# rejects it (disallowed domain, non-html, duplicate, or non-http)
sub addToQueue
{
print("in addToQueue \n");
my $added = 0;
if(checksOut($_[0]))
{
$url[$#url+1] = $_[0];
$added = 1;
}
print("leaving addToQueue \n");
return $added;
}
# no arguments; returns the element at the front of the queue
# and advances the start pointer (FIFO)
sub removeFromQueue
{
$retValue = $url[$queueStartPointer];
# $#url = $#url - 1;
$queueStartPointer = $queueStartPointer + 1;
return $retValue;
}
# argument: a url
# returns true only if the url is in the allowed domain (Arg3),
# ends in .html, is not already queued, and uses the http scheme
sub checksOut
{
print("Entering checksOut routine \n");
my $retValue = 0;
# strip off interior "/.." segments from the url
$_[0] =~ s/\/\.\.//g;
# slashed[2] is the host name if the url is http://name.sub.domain/morestuff
@slashed = split('/',$_[0]);
if($slashed[2] =~ m/$ARGV[2]/)
{
print("valid \n");
$retValue = 1;
}
print("argument: ");
print("$_[0]");
print("\n");
my @dotSplit = split('\.',$_[0]);
my $suffix = $dotSplit[$#dotSplit];
print("suffix: ");
print($suffix);
print("\n");
if( ($suffix eq "html" ) && $retValue eq 1)
{
$retValue = 1;
}
else
{
$retValue = 0;
}
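# reject the url if it is already in the queue (linear duplicate scan)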
my $candidate = $_[0];
foreach(@url)
{
if($candidate eq $_)
{
$retValue = 0;
}
}
# RobotRules seems to have a bug in handling umass robots.txt
# so, skip for now
my $candidateURL = URI -> new($candidate);
if($candidateURL -> authority eq "www.cs.umass.edu")
{
print CP "removing cs.umass.edu from queue till bug is fixed \n";
$retValue = 0;
}
if($candidateURL -> scheme() ne "http")
{
print CP "non http scheme: ";
print CP $candidateURL->scheme();
print CP "\n";
$retValue = 0;
}
print("exiting checksOut subroutine: \$retValue -> $retValue \n");
return $retValue;
}