# NAME
WWW::Crawler::Lite - A single-threaded crawler/spider for the web.
# SYNOPSIS
```perl
use strict;
use warnings;
use Data::Dumper;
use WWW::Crawler::Lite;

my %pages      = ( );
my $pattern    = 'https?://example\.com\/';
my %links      = ( );
my $downloaded = 0;

my $crawler;
$crawler = WWW::Crawler::Lite->new(
  agent       => 'MySuperBot/1.0',
  url_pattern => $pattern,
  http_accept => [qw( text/plain text/html application/xhtml+xml )],
  link_parser => 'default',
  on_response => sub {
    my ($url, $res) = @_;
    warn "$url contains " . $res->content;

    # Stop the crawl after a handful of successful responses:
    $downloaded++;
    $crawler->stop() if $downloaded > 5;
  },
  follow_ok   => sub {
    my ($url) = @_;
    # If you like this url and want to use it, then return a true value:
    return 1;
  },
  on_link     => sub {
    my ($from, $to, $text) = @_;
    return if exists($pages{$to}) && $pages{$to} eq 'BAD';

    $pages{$to}++;
    $links{$to} ||= [ ];
    push @{$links{$to}}, { from => $from, text => $text };
  },
  on_bad_url  => sub {
    my ($url) = @_;
    # Mark this url as 'bad':
    $pages{$url} = 'BAD';
  },
);

$crawler->crawl( url => "http://example.com/" );
warn "DONE!!!!!";

for my $url ( sort keys %links ) {
  warn "$url ($pages{$url} incoming links) -> " . Dumper($links{$url});
}
```
# DESCRIPTION
`WWW::Crawler::Lite` is a single-threaded spider/crawler for the web. It can
be used within a mod_perl, CGI or Catalyst-style environment because it does not
fork or use threads.
The callback-based interface is fast and simple, letting you focus on processing
the data that `WWW::Crawler::Lite` extracts from the target website.
# PUBLIC METHODS
## new( %args )
Creates and returns a new `WWW::Crawler::Lite` object.
The `%args` hash is not required, but may contain the following elements (a minimal constructor sketch follows this list):
- agent - String
Used as the user-agent string for HTTP requests.
__Default Value:__ `WWW-Crawler-Lite/$VERSION $^O`
- url_pattern - RegExp or String
New links that do not match this pattern will not be added to the processing queue.
__Default Value:__ `https?://.+`
- http_accept - ArrayRef
A list of acceptable content types; this can be used to filter out unwanted responses.
__Default Value:__ `[qw( text/html text/plain application/xhtml+xml )]`
- link_parser - String
Valid values: '`default`' and '`HTML::LinkExtor`'
'`default`' uses a naive regexp to do the link parsing. The advantage of '`default`' is that the
regexp also finds the hyperlinked text (or the alt-text of a hyperlinked img tag) and passes it
to your '`on_link`' handler.
__Default Value:__ `default`
- on_response($url, $response) - CodeRef
Called whenever a successful response is received.
- on_link($from, $to, $text) - CodeRef
Called whenever a new link is found. Arguments are:
- $from
The URL that is linked *from*
- $to
The URL that is linked *to*
- $text
The anchor text (i.e. the HTML inside the link, e.g. __This Text Here__)
- on_bad_url($url) - CodeRef
Called whenever an unsuccessful response is received.
- delay_seconds - Number
Indicates the length of time (in seconds) that the crawler should pause before making
each request. This can be useful when you want to spider a website, not launch
a denial of service attack on it.
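As a minimal sketch of the arguments above (the agent string, host, delay and callback body here are only placeholders, not values required by the module), a constructor call might look like this; every argument left out keeps its default value:
```perl
use strict;
use warnings;
use WWW::Crawler::Lite;

# A minimal, polite crawler: custom agent string, links restricted to one host,
# and a one-second pause between requests.
my $crawler = WWW::Crawler::Lite->new(
  agent         => 'ExampleBot/0.1',
  url_pattern   => 'https?://example\.org/',
  delay_seconds => 1,
  on_response   => sub {
    my ($url, $res) = @_;
    print "Fetched $url (" . length($res->content) . " bytes)\n";
  },
);

$crawler->crawl( url => 'http://example.org/' );
```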
## stop( )
Causes the crawler to stop processing its queue of URLs.
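Because the crawler is single-threaded, `stop()` is normally called from inside one of your callbacks, as in the SYNOPSIS. The sketch below stops after an arbitrary 60-second budget (the host and the budget are placeholders):
```perl
use strict;
use warnings;
use WWW::Crawler::Lite;

my $started = time();

my $crawler;
$crawler = WWW::Crawler::Lite->new(
  url_pattern => 'https?://example\.org/',
  on_response => sub {
    # Give up once the 60-second time budget has been used:
    $crawler->stop() if time() - $started > 60;
  },
);

$crawler->crawl( url => 'http://example.org/' );
```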
# AUTHOR
John Drago
# COPYRIGHT
This software is free software and may be used and redistributed under the same
terms as Perl itself.