LJ Archive

Listing 4. download-recursively.pl

#!/usr/bin/perl -w
use strict;
use diagnostics;
use LWP::RobotUA;
use HTML::LinkExtor;
use URI::URL;
# Where do we want to start?
my $origin = "http://www.lerner.co.il/";
# Create two hashes, one for URLs we have seen and
# another for those we haven't
my %already_retrieved = ();
my %to_be_retrieved = ($origin => 1);
# Create a hash in which we will deposit URLs we
# wish to use
my %interesting_urls = ();
# Create a new user agent
my $ua = new LWP::RobotUA "ATF/1.0",
   'reuven@lerner.co.il';
$ua->delay(0);
while (my $url = (keys %to_be_retrieved)[0])
{
    # Create a new HTTP request
    my $request = new HTTP::Request('GET', $url);
    # Move this URL from %to_be_retrieved to
    # %already_retrieved
    delete $to_be_retrieved{$url};
    $already_retrieved{$url} = 1;
    # Indicate what we are retrieving
    print "Retrieving $url...";
    # Hand $request to $ua, and get an HTTP
    # response back in return
    my $response = $ua->request($request);
    # Complete printout
    print ".\n";
    # If there was a problem, send a report to
    # STDERR
    if (!$response->is_success)
    {
   print STDERR qq{Error retrieving "$url":},
       $response->status_line , "\n";
    }
    # Create an instance of HTML::LinkExtor,
    # making links relative to $url
    my $parser = HTML::LinkExtor->new(\&callback,
       $url);
    # Parse the output
    $parser->parse($response->content);
}
#---------------------------------------------
# Define our callback, which is passed scalar and
# a hash
sub callback
{
    # Get the tag and its associated attributes
    my ($tag, %attributes) = @_;
    # We only care about anchor tags
    return unless ($tag eq "a");
    # Iterate through the attributes
    foreach my $name (sort keys %attributes)
    {
   # If this is a link, then put the URL in the
   # queue if ($name eq "href")
   {
       # Get the URL for this anchor
       my $url = $attributes{$name};
       # Ignore non-HTTP URLs
       next unless ($url =~ m/^http:/);
       # Indicate that we should retrieve this
       # URL in the future
       $to_be_retrieved{$url} = 1
      unless $already_retrieved{$url};
   }
    }
}
LJ Archive