#!/usr/bin/perl ########################################### # POE multi URL fetcher # Mike Schilli, 2002 (m@perlmeister.com) ########################################### use warnings; use strict; package POEFetch; use POE qw(Component::Client::HTTP); use Log::Log4perl qw(:easy); use Time::HiRes qw(gettimeofday tv_interval); our $CVSVERSION = '$Revision: 1.7 $'; ########################################### sub new { ########################################### my($class, %options) = @_; my $self = { start_time => 0, end_time => 0, elapsed => 0, urls => [], responses => [], max_kids => 5, timeout => 10, %options, }; bless $self, $class; } ########################################### sub register { ########################################### my($self, $request, $sn, $pw) = @_; if(defined $sn and defined $pw) { $request->authorization_basic($sn, $pw); } push @{$self->{urls}}, $request; } ########################################### sub process { ########################################### my($self) = @_; $self->{start_time} = [gettimeofday]; POE::Component::Client::HTTP->spawn( Agent => 'POEFetch/1.00', Alias => 'ua', Timeout => $self->{timeout}, ); POE::Session->create( package_states => [ref($self) => [qw(handle_response _stop _start)]], heap => { self => $self, active_kids => 0, max_kids => $self->{max_kids}, }, ); $poe_kernel->run(); $self->{end_time} = [gettimeofday]; $self->{elapsed} += tv_interval($self->{start_time}, $self->{end_time}); $self->{total_urls} += scalar @{$self->{responses}}; DEBUG("Fetching finished"); return @{$self->{responses}}; } ########################################### sub _stop { ########################################### my($self) = @_; # Nothing to do when the session ends } ########################################### sub _start { ########################################### delegate_work(@_); } ########################################### sub delegate_work { ########################################### my ($heap) = $_[HEAP]; while(@{$heap->{self}->{urls}} and $heap->{active_kids} < $heap->{max_kids}) { my $request = shift @{$heap->{self}->{urls}}; DEBUG("Posting request to ", $request->uri()); $poe_kernel->post( 'ua', 'request', 'handle_response', $request, ); $heap->{active_kids}++; } } ###################################################################### sub handle_response { ###################################################################### my ($heap, $request_packet, $response_packet) = @_[HEAP, ARG0, ARG1]; my $self = $heap->{self}; my($request, $response) = ($request_packet->[0], $response_packet->[0]); DEBUG("Got response for ", $request->uri()); $response->request($request); push @{$self->{responses}}, $response; $heap->{active_kids}--; delegate_work(@_); } ###################################################################### sub stats { ###################################################################### my($self) = @_; return ($self->{total_urls}, $self->{elapsed}); } 1; __END__ =head1 NAME POEFetch - Module for bulk-fetching URLs =head1 DOWNLOAD _SRC_HERE_ =head1 SYNOPSIS use POEFetch; use HTTP::Request::Common; my $f = POEFetch->new( max_kids => 5 ); $f->register(GET 'http://www.yahoo.com'); $f->register(GET 'http://www.amazon.com'); $f->register(GET('http://www.protected.com'), "user", "passwd"); my @results = $f->process(); for my $resp (@results) { if($resp->is_success()) { print $resp->request->uri(), ": ", $resp->content(), "\n"; } else { print $resp->request->uri(), ": ", $resp->code(), "\n"; } } =head1 DESCRIPTION B uses the smart/crazy POE framework to reel in URLs en masse. POE is a nice alternative to pre-forking or multi-threading. The POE framework relies on cooperative multi-tasking and runs its tasks in a single-process and single-threaded mode and takes advantage of the fact that there's free processor cycles while we're waiting for slower operations (like waiting for a network response). C works both with plain HTTP and SSL. Also, basic authentication is supported. =head1 METHODS =over 4 =item my $fetcher = POEFetcher-Enew( [max_kids => $nof_kids] ); Constructor, creates the fetcher object, capable of registering and then reeling in an arbitrary number of Cs. It accepts optional parameter settings, currently the number of requests processed in parallel can be provided in C: my $fetcher = POEFetcher->new( max_kids => 10 ); will cause the POEFetcher to run 10 requests in parallel. The default is C<5>. The maximum throughput has to be determined empirically, reasonable values for C have been found to be between 5 and 10. =item $f-Eregister( $req, [$user, $pass] ); Register a request with the fetcher. C<$req> is of type C and created easily with the C module's C and C methods. C<$user> and C<$pass> are optional and are used for basic authentication if provided. =item my @results = $f-Eprocess(); Fires up all registered requests in a quasi-parallel way (but never more than C (see C) at a time. The function will block until the last URL has been reeled in. Every element of the result array is of type C and can be queried for success/failure with its standard C/C methods. Please note that the C used internally won't follow C<302>s automatically but return them as errors. Results are stored in the same order as the original requests were registered in. But just in case you lost track of the URLs requested in the first place, every C object provides the original URL in C<$resp-Erequest()-Eurl()>. =back =head1 LEGALESE Copyright 2002 by Mike Schilli, all rights reserved. This program is free software, you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR 2001, Mike Schilli