O'Reilly Hacks
oreilly.comO'Reilly NetworkSafari BookshelfConferences Sign In/My Account | View Cart   
Book List Learning Lab PDFs O'Reilly Gear Newsletters Press Room Jobs  


 
Buy the book!
Yahoo! Hacks
By Paul Bausch
October 2005
More Info

HACK
#19
Spider the Yahoo! Catalog
Writing a spider to spider an existing spider's site may seem convoluted, but it can prove useful when you're looking for location-based services. This hack walks through creating a framework for full-site spidering, including additional filters to lessen your load
The Code
[Discuss (0) | Link to this hack]

The Code

Save the following code to a file called yspider.pl:

	#!/usr/bin/perl -w
	#
	# yspider.pl
	#
	# Yahoo! Spider--crawls Yahoo! sites, collects links from each downloaded HTML
	# page, searches each downloaded page and prints a list of results when done.
	# http://www.artymiak.com/software/ or contact jacek@artymiak.com
	#
	# This code is free software; you can redistribute it and/or
	# modify it under the same terms as Perl itself.
	use strict;
	use Getopt::Std;			# parse command line options.
	use LWP::UserAgent;			# download data from the net.
	use HTML::LinkExtor;		# get links inside an HTML document.
	use URI::URL;				# turn relative links into absolutes.

	my $help = <<"EOH";
	------------------------------------------------------------------------
	Yahoo! Spider.

	Options: -s		list of sites you want to crawl,
				e.g. -s 'us china denmark'
			 -h		print this help

	Allowed values of -s are:

	argentina, asia, australia, brazil, canada,
	catalan, china, denmark, france, germany, hongkong,
	india, ireland, italy, japan, korea, mexico,
	newzealand, norway, singapore, spain, sweden, taiwan,
	uk, us, us_chinese, us_spanish
	Please, use this code responsibly. Flooding any site
	with excessive queries is bad net citizenship.
	------------------------------------------------------------------------
	EOH

	# define our arguments and
	# show the help if asked.
	my %args; getopts("s:h", \%args);
	die $help if exists $args{h};
	
	# The list of code names, and
	# URLs, for various Yahoo! sites.
	my %ys = (
		argentina => "http://ar.yahoo.com", asia => "http://asia.yahoo.com",
		australia => "http://au.yahoo.com", newzealand => "http://au.yahoo.com",
		brazil => "http://br.yahoo.com", canada => "http://ca.yahoo.com",
		catalan => "http://ct.yahoo.com", china => "http://cn.yahoo.com",
		denmark => "http://dk.yahoo.com", france => "http://fr.yahoo.com",
		germany => "http://de.yahoo.com", hongkong => "http://hk.yahoo.com",
		india => "http://in.yahoo.com", italy => "http://it.yahoo.com",
		korea => "http://kr.yahoo.com", mexico => "http://mx.yahoo.com",
		norway => "http://no.yahoo.com", singapore => "http://sg.yahoo.com",
		spain => "http://es.yahoo.com", sweden => "http://se.yahoo.com",
		taiwan => "http://tw.yahoo.com", uk => "http://uk.yahoo.com",
		ireland => "http://uk.yahoo.com", us => "http://www.yahoo.com",
		japan => "http://www.yahoo.co.jp",
		us_chinese => "http://chinese.yahoo.com",
		us_spanish => "http://espanol.yahoo.com"
	);

	# if the -s option was used, check to make
	# sure it matches one of our existing codes
	# above. if not, or no -s was passed, help.
	my @sites; # which locales to spider.
	if (exists $args{'s'}) {
		@sites = split(/ /, lc($args{'s'}));
		foreach my $site (@sites) {
			die "UNKNOWN: $site\n\n$help" unless $ys{$site};
		}
	} else { die $help; }

	# Defines global and local profiles for URLs extracted from the
	# downloaded pages. These profiles are used to determine if the
	# URLs extracted from each new document should be placed on the
	# TODO list (%todo) or rejected (%rejects). Profiles are lists
	# made of chunks of text, which are matched against found URLs.
	# Any special characters, like slash (/) or dot (.) must be properly
	# escaped. Remember that globals have precedence over locals.
	my %rules = (
		global => { allow => [], deny => [ 'search', '\*' ] },
		argentina => { allow => [ 'http:\/\/ar\.' ], deny => [] },
		asia => { allow => [ 'http:\/\/(aa|asia)\.' ], deny => [] },
		australia => { allow => [ 'http:\/\/au\.' ], deny => [] },	
		brazil => { allow => [ 'http:\/\/br\.' ], deny => [] },
		canada => { allow => [ 'http:\/\/ca\.' ], deny => [] },
		catalan => { allow => [ 'http:\/\/ct\.' ], deny => [] },
		china => { allow => [ 'http:\/\/cn\.' ], deny => [] },	
		denmark => { allow => [ 'http:\/\/dk\.' ], deny => [] },
		france => { allow => [ 'http:\/\/fr\.' ], deny => [] },
		germany => { allow => [ 'http:\/\/de\.' ], deny => [] },
		hongkong => { allow => [ 'http:\/\/hk\.' ], deny => [] },
		india => { allow => [ 'http:\/\/in\.' ], deny => [] },
		ireland => { allow => [ 'http:\/\/uk\.' ], deny => [] },
		italy => { allow => [ 'http:\/\/it\.' ], deny => [] },
		japan => { allow => [ 'yahoo\.co\.jp' ], deny => [] },
		korea => { allow => [ 'http:\/\/kr\.' ], deny => [] },
		mexico => { allow => [ 'http:\/\/mx\.' ], deny => [] },
		norway => { allow => [ 'http:\/\/no\.' ], deny => [] },
		singapore => { allow => [ 'http:\/\/sg\.' ], deny => [] },	
		spain => { allow => [ 'http:\/\/es\.' ], deny => [] },
		sweden => { allow => [ 'http:\/\/se\.' ], deny => [] },
		taiwan => { allow => [ 'http:\/\/tw\.' ], deny => [] },
		uk => { allow => [ 'http:\/\/uk\.' ], deny => [] },
		us => { allow => [ 'http:\/\/(dir|www)\.' ], deny => [] },
		us_chinese => { allow => [ 'http:\/\/chinese\.' ], deny => [] },
		us_spanish => { allow => [ 'http:\/\/espanol\.' ], deny => [] },
	);
	my %todo = ( ); # URLs to parse
	my %done = ( ); # parsed/finished URLs
	my %errors = ( ); # broken URLs with errors.
	my %rejects = ( ); # URLs rejected by the script
	
	# print out a "we're off!" line, then
	# begin walking the site we've been told to.
	print "=" x 80 . "\nStarted Yahoo! spider…\n" . "=" x 80 . "\n";
	our $site; foreach $site (@sites) {

		# for each of the sites that have been passed on the
		# command line, we make a title for them, add them to
		# the TODO list for downloading, then call walksite( ),
		# which downloads the URL, looks for more URLs, etc.
		my $title = "Yahoo! " . ucfirst($site) . " front page";
		$todo{$ys{$site}} = $title; walksite( ); # process.
	}

	# once we're all done with all the URLs, we print a
	# report about all the information we've gone through.
	print "=" x 80 . "\nURLs downloaded and parsed:\n" . "=" x 80 . "\n";
	foreach my $url (keys %done) { print "$url => $done{$url}\n"; }
	print "=" x 80 . "\nURLs that couldn't be downloaded:\n" . "=" x 80 . "\n";
	foreach my $url (keys %errors) { print "$url => $errors{$url}\n"; }
	print "=" x 80 . "\nURLs that got rejected:\n" . "=" x 80 . "\n";
	foreach my $url (keys %rejects) { print "$url => $rejects{$url}\n"; }

	# this routine grabs the first entry in our todo
	# list, downloads the content, and looks for more URLs.
	# we stay in walksite until there are no more URLs
	# in our to do list, which could be a good long time.
	sub walksite {
	
		do {
			# get first URL to do.
			my $url = (keys %todo)[0];
	
			# download this URL
			print "-> trying $url…\n";
			my $browser = LWP::UserAgent->new;
			my $resp = $browser->get( $url, 'User-Agent' => 'Y!SpiderHack/1.0' );

			# check the results.
			if ($resp->is_success) {
				my $base = $resp->base || '';
				print "-> base URL: $base\n";
				my $data = $resp->content; # get the data.
				print "-> downloaded: " . length($data) . " bytes of $url\n";

				# find URLs using a link extorter. relevant ones
				# will be added to our to do list of downloadables.
				# this passes all the found links to findurls( )
				# below, which determines if we should add the link
				# to our to do list, or ignore it due to filtering.
				HTML::LinkExtor->new(\&findurls, $base)->parse($data);
				
				###########################################################
				# add your own processing here. perhaps you'd like to add #
				# a keyword search for the downloaded content in $data? #
				###########################################################

		} else {
			$errors{$url} = $resp->message( );
			print "-> error: couldn't download URL: $url\n";
			delete $todo{$url};
		}
		
		# we're finished with this URL, so move it from
		# the to do list to the done list, and print a report.
		$done{$url} = $todo{$url}; delete $todo{$url};
		print "-> processed legal URLs: " . (scalar keys %done) . "\n";
		print "-> remaining URLs: " . (scalar keys %todo) . "\n";
		print "-" x 80 . "\n";
	} until ((scalar keys %todo) == 0);
}

# callback routine for HTML::LinkExtor. For every
# link we find in our downloaded content, we check
# to see if we've processed it before, then run it
# through a bevy of regexp rules (see the top of
# this script) to see if it belongs in the to do.
sub findurls {
	my($tag, %links) = @_;
	return if $tag ne 'a';
	return unless $links{href};
	print "-> found URL: $links{href}\n";

	# already seen this URL, so move on.
	if (exists $done{$links{href}} ||
		exists $errors{$links{href}} ||
		exists $rejects{$links{href}}) {
		print "--> I've seen this before: $links{href}\n"; return;
	}

	# now, run through our filters.
	unless (exists($todo{$links{href}})) {
		my ($ga, $gd, $la, $ld); # counters.
		foreach (@{$rules{global}{'allow'}}) { $ga++ if $links{href} =~
/$_/i; }
		foreach (@{$rules{global}{'deny'}}) { $gd++ if $links{href} =~

/$_/i; }
		foreach (@{$rules{$site}{'allow'}}) { $la++ if $links{href} =~ 
/$_/i; }
		foreach (@{$rules{$site}{'deny'}}) { $ld++ if $links{href} =~ /$_/i; }

		# if there were denials or NO allowances, we move on.
		if ($gd or $ld) { print "-> rejected URL: $links{href}\n"; return; }
		unless ($ga or $la) { print "-> rejected URL: $links{href}\n"; 
return; }
		# we passed our filters, so add it on the barby.
		print "-> added $links{href} to my TODO list\n";
		$todo{$links{href}} = $links{href};
	}
}


O'Reilly Home | Privacy Policy

© 2007 O'Reilly Media, Inc.
Website: | Customer Service: | Book issues:

All trademarks and registered trademarks appearing on oreilly.com are the property of their respective owners.