#!/usr/bin/perl
######################################################################
# webgrab.pl
######################################################################
# Perl Power! - Michael Schilli 1998
######################################################################

use Getopt::Std;               # command line parameter catcher
use LWP::UserAgent;            # WWW utility
use HTML::TreeBuilder;         # HTML parser
use Archive::Tar;              # tar archiver
use URI::URL;                  # manipulate URLs

                               # define message functions
# info(LIST): progress message, written to STDERR only when the -v
# switch was given (i.e. $opt{v} is true).
sub info {
    return unless $opt{v};
    print {*STDERR} @_;
}

# err(LIST): error message, always written to STDERR.
sub err {
    print {*STDERR} @_;
}

# Parse the command line into the global %opt (info() reads $opt{v}):
#   -e       extract links          -g       get documents
#   -f FILE  read URLs from FILE    -t FILE  store documents in tar FILE
#   -h       help                   -v       verbose
# An unknown switch makes getopts() return false -> show usage.
getopts("ef:ght:v", \%opt) or usage();
usage() if defined $opt{h};    # help option set?
                               # without extract or get -> error
usage() unless defined $opt{e} || defined $opt{g};

# Create the tar object only when -t was given.  The variable is
# declared unconditionally because "my $x = ... if COND" has
# undefined behaviour in Perl.
my $tar;
$tar = Archive::Tar->new() if $opt{t};

# Collect the URLs to process: first those named on the command line,
# then (with -f) one URL per line from the URL file.  The file is opened
# explicitly rather than through <>, so that URLs left in @ARGV are not
# mistaken for file names.
my @urls = @ARGV;
if (defined $opt{f}) {
  my $from_stdin = ($opt{f} eq '-');   # "-" keeps meaning standard input
  my $fh;
  if ($from_stdin) {
    $fh = \*STDIN;
  } else {
    open($fh, '<', $opt{f})
      or die "$0: cannot open URL file $opt{f}: $!\n";
  }
  while (my $line = <$fh>) {
    $line =~ s/\s+\z//;        # drop newline/CR/trailing blanks (chop
                               # would eat a character of an
                               # unterminated last line)
    next unless length $line;  # ignore blank lines
    push(@urls, $line);
  }
  close($fh) unless $from_stdin;
}

# Without -f at least one URL must be given on the command line.
usage() unless @urls || defined $opt{f};

# Fetch every URL in @urls and, depending on the options,
#   -t: store the document in the tar archive $tar,
#   -g: print the document to STDOUT,
#   otherwise (-e): print each link found in the document once,
#                   as an absolute URL without its #fragment.
my $ua = LWP::UserAgent->new();  # one user agent serves all requests
my %seen;                        # absolute links already printed

foreach my $url (@urls) {        # all URLs are now in @urls
  info "# GET URL $url ... ";    # message

                                 # carry out network access
  my $response = $ua->request(HTTP::Request->new('GET', $url));

  if ($response->is_error) {     # error check
    err "ERROR code: ", $response->code(),
        " Message: ", $response->message(), "\n";
    next;                        # do not treat the error body as a document
  }

  my $doc = $response->content();  # document OK
  info "OK\n";

                                 # with -t option set:
  if ($opt{t}) {                 # do not output => tarfile
    my $path = URI::URL->new($url)->path;  # path from URL
    $path =~ s,/$,/index.html,;  # without file name -> index.html
    $path =~ s,^/,,;             # strip leading '/'
    $tar->add_data($path, $doc); # data into archive
    next;                        # process next URL
  }

  if ($opt{g}) {                 # without -t option
    print $doc;                  # simply output document
    next;
  }

                                 # extract links
  my $tree = HTML::TreeBuilder->new;
  $tree->parse($doc);
  $tree->eof;                    # signal end of input so the tree is complete

                                 # <A>, <AREA> and <IMG>
  for my $link (@{ $tree->extract_links(qw/a area img/) }) {
    my $rel = URI::URL->new($link->[0]);   # link attribute value
    my $abs = "" . $rel->abs($url);        # absolute URL as plain string
    $abs =~ s/#.*//;                       # #fragment out
    # Deduplicate on the URL that is actually printed, so links that
    # differ only in their fragment (or in how they were written
    # relative to the page) appear once.
    print "$abs\n" unless $seen{$abs}++;
  }

  $tree->delete();               # delete parse tree
}

# With -t: write the collected documents to the tar file named by -t.
# Archive::Tar's write() returns false on failure, so success is only
# reported when the file was really written; otherwise the archive's
# error text is shown and the script exits with status 1.
if ($opt{t}) {
  if ($tar->write($opt{t})) {            # create tar file
    info "$opt{t} ready.\n";             # message in verbose mode
  } else {
    err "ERROR writing $opt{t}: ", $tar->error(), "\n";
    exit 1;
  }
}

sub usage {
###############################################################
# Print the command synopsis to STDOUT and terminate the script
# with exit status 1.  $0 is reduced to the script's base name.
###############################################################
    $0 =~ s{.*/}{}g;          # remove path

    my $synopsis = <<"USAGE";
usage: $0 -g [-f URLfile] [-t tarfile] URL ...  # get URLs
       $0 -e [-f URLfile] URL ...               # extract links
options:
       -h: help
       -v: verbose
USAGE

    print $synopsis;
    exit 1;
}

