O'Reilly Hacks
oreilly.com | O'Reilly Network | Safari Bookshelf | Conferences | Sign In/My Account | View Cart
Book List Learning Lab PDFs O'Reilly Gear Newsletters Press Room Jobs  


 
Buy the book!
Spidering Hacks
By Kevin Hemenway, Tara Calishain
October 2003
More Info

HACK
#58
Scraping Alexa's Competitive Data with Java
Alexa tracks the browsing habits of its millions of users daily. This hack allows you to aggregate the traffic statistics of multiple web properties into one RSS file, with subscriptions available daily.
The Code
[Discuss (2) | Link to this hack]

The Code

Report.java

package alexa;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;

/** Main class of the alexa package.  Gathers data and composes the message.
 * @author Niall Kennedy
 * @version 1.0
 */
public final class Report {
    private final DecimalFormat NUMBER_PRETTY = new DecimalFormat("#,##0");
    private final String url;
    private String generated;
    private String name;
    private StringBuffer body;
    private TrafficBean bean;

    public static void main(String [] args) {
        String [] sites = null;
        if (args.length>0) {
	    sites = args;
        }
        else {
            // use a default set of sites
	    sites = {"oreilly.com", "manning.com",
	             "osborne.com", "wrox.com"};
	}
	for (int i=0; i<sites.length; ++i) {
	    Report r = new Report("http://" + sites[i]);
	    r.collectData();
	    r.writeData();
            r = null;
        }
        sites = null;
    }

    /** It all starts here.
     *
     * @param url fully qualified Web address (example: http://www.google.com)
     */
    public Report(final String url) {
        this.url = url.toLowerCase().trim();
        System.out.println("Processing " + url);
        name = url;
        body = new StringBuffer();
        bean = new TrafficBean();
        generated = null;
    }

    /** Gather data from various classes and compose the body of the message
     */
    public void collectData() {
        Website web = new Website(url);
        Parse p = new Parse(Website.bodyFilter(web.retrieveSource()));
        generated = web.getHeaderDate();
        p.run();
        name = p.getTitle();
        bean = p.getBean();
        body.append("<h1>Alexa Traffic Report for:</h1>");
        body.append("<p>");
        if (name!=null && !name.equalsIgnoreCase(url)) {
            body.append("<strong>");
            body.append(name).append("</strong><br />");
        }
        body.append("<a href=\"");
        body.append(url).append("\">");
        body.append(url).append("</a></p><br />");
        body.append(showDestinations());
        body.append(showReach());
        body.append(showViews());
        p = null;
        web = null;
    }

    /** Writes data to RSS file
     */
    public void writeData() {
        RSSWriter rss = new RSSWriter(name, body.toString(), url, generated);
        rss.run();
        rss = null;
    }

    /** Show the top subdomains in order of decending popularity
     *
     * @return a paragraph detailing subdomain activity, or an empty <code>String</code> if none exist
     */
    private String showDestinations() {
        StringBuffer retval = new StringBuffer();
        Map sites = bean.getSites();
        if (sites!=null && sites.size()>0) {
            retval.append("<a href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#web_hosts\"><font size=\"+1\">Most Popular Subdomains</font></a><br />");
            ArrayList keys = new ArrayList(sites.keySet());
            Collections.sort(keys);
            for(int i=keys.size()-1; i>0; --i) {
                int pct = Integer.parseInt(keys.get(i).toString());
                String site = sites.get(new Integer(pct)).toString();
                retval.append(pct);
                retval.append(" %  --  ");
                if (site.equalsIgnoreCase("Other websites")) {
                    retval.append(site);
                }
                else {
                    retval.append("<a href=\"http://");
                    retval.append(site);
                    retval.append("\">");
                    retval.append(site);
                    retval.append("</a>");
                }
                retval.append("<br />");
                site = null;
            }
            keys = null;
        }
        sites = null;
        retval.append("<br />");
        return retval.toString();
    }

    /** Show the total reach of the domain
     *
     * @return paragraph detailing reach per million and database rank,
     * or empty <code>String if no data available
     */
    private String showReach() {
        StringBuffer retval = new StringBuffer();
        int reach = bean.getReachPerMillion();
        int reach_rank = bean.getReachRank();
        if (reach>0 || reach_rank>0) {
            retval.append("<a name=\"Learn More\" href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#reach\"><font size=\"+1\">Domain Reach</font></a><br />Reach per million : ");
            retval.append(NUMBER_PRETTY.format(reach));
            retval.append("<br />");
            retval.append("Reach Rank        : ");
            retval.append(NUMBER_PRETTY.format(reach_rank));
            retval.append("<br /><br />");
        }
        return retval.toString();
    }

    /** <p>Paragraph detailing how many pages an average viewer navigates to
     *  within the domain and how total views compare to the entire database of domains.</p>
     *
     * @return paragraph detailing a site's popularity and depth
     */
    private String showViews() {
        StringBuffer retval = new StringBuffer();
        int views = bean.getViewsPerUser();
        int views_rank = bean.getViewsRank();
        if (views>0 || views_rank>0) {
            retval.append("<a name=\"Learn More\" href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#page_views\"><font size=\"+1\">Page Views</font></a><br />Page Views Per User : ");
            retval.append(NUMBER_PRETTY.format(views));
            retval.append("<br />Page Views Rank      : ");
            retval.append(NUMBER_PRETTY.format(views_rank));
        }
        return retval.toString();
    }
}

Website.java

package alexa;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.HttpURLConnection;

/** Opens a HTTP connection and pulls the source code of Alexa's traffic detail site
 *
 * @author Niall Kennedy
 * @version 1.0
 */
public final class Website {
    /** Address of the traffic detail page; the site to look up is appended to it. */
    private final String BASEURL="http://www.alexa.com/data/details/traffic_details?url=";
    /** Complete address that {@link #retrieveSource()} requests. */
    private final String url_location;
    /** Value of the response's Date header; null until a response has been received. */
    private String header_date;

    /**
     *
     * @param url parameter URL of a Web domain.  ex: http://domain.tld
     */
    public Website(final String url) {
        this.url_location =  BASEURL + url;
    }

    /** @return the complete address of the traffic detail page for the site */
    public String getURLLocation() {
        return url_location;
    }

    /** @return the Date header of the last response, or null if none was received */
    public String getHeaderDate() {
        return header_date;
    }

    /** Retrieves the full source of the requested page.
     * Sets header_date to date supplied by the server.  Lines are joined
     * without their line terminators.
     *
     * @return full source code of the file located at url_location, whatever
     * part of it was read before an I/O error occurred, or null if the
     * address is malformed
     */
    public String retrieveSource() {
        StringBuffer source = new StringBuffer();
        HttpURLConnection connect = null;
        BufferedReader html = null;
        try {
            URL u = new URL(getURLLocation());
            connect = (HttpURLConnection) u.openConnection();
            // Masquerade as a common Web browser request
            connect.setRequestProperty("User-Agent", "Mozilla/5.0");
            connect.setUseCaches(false);
            header_date = connect.getHeaderField("Date");
            html = new BufferedReader(new InputStreamReader(connect.getInputStream()));
            String line;
            while ((line=html.readLine())!=null) {
                source.append(line);
            }
        }
        catch (MalformedURLException e) {
            System.err.println("Malformed URL");
            System.err.println(url_location);
            source = null;
        }
        catch (IOException e) {
            System.err.println("I/O Error");
            System.err.println(url_location);
            e.printStackTrace();
        }
        finally {
            // release the stream and the connection even when reading failed half way
            if (html!=null) {
                try {
                    html.close();
                }
                catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
            if (connect!=null) {
                connect.disconnect();
            }
        }
        if (source==null) {
            return null;
        }
        return source.toString();
    }

    /** Given the complete source, return only the HTML body
     *
     * @param fullpage complete source code of the provided destination
     * @return body of the HTML page, from the first <code>&lt;body</code> up to and
     * including the last <code>body&gt;</code>, or null if lowercase body tags
     * are not found in that order
     */
    public static String bodyFilter(final String fullpage) {
        if (fullpage==null) {
            return null;
        }
        String retval = null;
        String start_tag = "<body";
        String end_tag = "body>";
        int start = fullpage.indexOf(start_tag);
        int end = fullpage.lastIndexOf(end_tag);
        // the closing marker has to come after the opening one; otherwise the
        // substring bounds would be reversed and substring() would throw
        if (start>=0 && end>start) {
            retval = fullpage.substring(start, end+end_tag.length());
        }
        return retval;
    }
}

Parse.java

package alexa;

import java.util.Collections;
import java.util.Hashtable;
import java.util.Map;

/** Parse a HTML <code>String</code> representing Alexa's traffic page
 * @author Niall Kennedy
 * @version 1.0
 */
public final class Parse {
    /** entire text to parse; may be null when the page could not be retrieved */
    private final String document;
    /** Company associated with the given domain */
    private String title;
    /** Parsed data goes here */
    private TrafficBean bean;
    /** speed up the String search by keeping track of the areas already combed over */
    private int place;

    /**
     * @param document source code to parse through
     */
    public Parse(final String document) {
        this.document = document;
        bean = new TrafficBean();
        title = null;
        place = 0;
    }

    /** Extracts the title, the subdomain list and the four statistics, in the
     * order in which they are searched for in the document.  Each extraction
     * moves the search position forward, so the order of the calls matters.
     */
    public void run() {
        setTitle();
        System.out.println(title);
        bean.setSites(getSiteDomains());
        bean.setReachPerMillion(getTodayStat());
        bean.setReachRank(getTodayStat());
        bean.setViewsPerUser(getTodayStat());
        bean.setViewsRank(getTodayStat());
    }

    /** @return the parsed title, or null if none was found or {@link #run()} has not been called */
    public String getTitle() {
        return title;
    }

    /** @return the bean holding the parsed figures */
    public TrafficBean getBean() {
        return bean;
    }

    /** Narrow down the length of a search string by defining identifying text occurring after the place index.
     * On success the place index is moved to the position of the end text.
     *
     * @param start_text text marking the beginning of the snippet; it is included in the result
     * @param end_text text marking the end of the snippet; it is not included in the result
     * @return the text from start_text up to end_text, or null if there is no
     * document or either marker is not found after the place index
     */
    private String snipIt(final String start_text, final String end_text) {
        if (document==null || start_text==null || end_text==null) {
            return null;
        }
        int start_position = document.indexOf(start_text, place);
        if (start_position<0) {
            return null;
        }
        int end_position = document.indexOf(end_text, start_position);
        if (end_position<=start_position) {
            return null;
        }
        place = end_position;
        return document.substring(start_position, end_position);
    }

    /** Reads the text of the first <code>title</code> span into the title field,
     * leaving it null when the span is not found.
     */
    private void setTitle() {
        String start_tag = "<span class=\"title\">";
        String end_tag = "</span>";
        String snip = snipIt(start_tag, end_tag);
        if (snip==null) {
            title = null;
        }
        else {
            // snipIt keeps the start marker, so drop the opening tag to leave only its text
            title = snip.substring(start_tag.length()).trim();
        }
    }

    /** Collects the subdomains listed in the "Where do people go on" section.
     * The percentage is the key, so of two entries with the same percentage
     * only the one parsed last is kept.
     *
     * @return unmodifiable map of percentage (<code>Integer</code>) to
     * subdomain name; empty if the section or its entries are not found
     */
    private Map getSiteDomains() {
        Hashtable retval = new Hashtable();
        String snip = snipIt("<span class=\"titleO\">Where do people go on", "<hr size=\"1\">");
        if (snip==null) {
            return Collections.unmodifiableMap(retval);
        }
        String item_tag = "<li>";
        // cycle through the list of subdomains
        int start = snip.indexOf(item_tag);
        while (start>=0) {
            // always resume behind the current item so that a malformed entry
            // cannot make the search find the same item again
            int resume = start + item_tag.length();
            int end = snip.indexOf("~", start);
            if (end>0) {
                String site = snip.substring(start+item_tag.length(), end).trim();
                int pct_start = snip.indexOf("<b>", end);
                if (pct_start>0) {
                    resume = pct_start;
                    // grab the number only
                    int pct_end = snip.indexOf("%</b>", pct_start);
                    if (pct_end>pct_start) {
                        try {
                            Integer pct = Integer.valueOf(snip.substring(pct_start+3, pct_end).trim());
                            retval.put(pct, site);
                        }
                        catch (NumberFormatException e) {
                            // not a whole number: leave this entry out
                        }
                    }
                }
            }
            start = snip.indexOf(item_tag, resume);
        }
        return Collections.unmodifiableMap(retval);
    }

    /** Each table is formatted the same, so we can reuse the same method on each.
     *
     * @return today's statistic (row 1, column 1), or null if the next table
     * or the expected cell is not found
     */
    private String getTodayStat() {
        String snip = snipIt("<table", "</table>");
        if (snip==null) {
            return null;
        }
        String tag = "</tr><tr><td class=\"bodyBold\" align=\"center\" bgcolor=\"#ffffff\">";
        int start = snip.indexOf(tag);
        if (start<0) {
            return null;
        }
        start += tag.length();
        int end = snip.indexOf("</td>", start);
        if (end<start) {
            return null;
        }
        return snip.substring(start, end);
    }
}

TrafficBean.java

package alexa;

import java.text.NumberFormat;
import java.util.Map;

/** Standard entity bean style.  Good for holding related data
 *
 * @author Niall Kennedy
 * @version 1.0
 */
public final class TrafficBean {
    /** Parses numbers written with commas as grouping separators.  Pinned to
     * the US locale so that the result does not depend on the default locale
     * of the machine running the report.
     */
    public final NumberFormat NUMBER_PURIFY = NumberFormat.getInstance(java.util.Locale.US);
    /** subdomains. key of percentage expressed as an <code>Integer</code> and
     *  value of property name */
    private Map sites;
    /** The percentage of one million Internet users who visit the given site */
    private int reach_per_million;
    /** A ranking of all sites based solely on their reach */
    private int reach_rank;
    /** The number of pages viewed by Alexa Toolbar users.
     * Multiple page views of the same page made by the same user on the same day are counted only once.*/
    private int views_per_user;
    /** a ranking of all sites based solely on the total number of page views (not page views per user) */
    private int views_rank;

    /** Initialize all variables
     */
    public TrafficBean() {
        clearAll();
    }

    /** clear all class variables: the site map becomes null and every figure zero
     */
    public void clearAll() {
        sites = null;
        reach_per_million = 0;
        reach_rank = 0;
        views_per_user = 0;
        views_rank = 0;
    }

    /** @return the subdomain map, or null if none has been set */
    public Map getSites() {
        return sites;
    }

    /** @param val subdomain map to store; kept by reference, not copied */
    public void setSites(final Map val) {
        this.sites = val;
    }

    public int getReachPerMillion() {
        return reach_per_million;
    }

    /** @param val reach per million as text; unreadable input is stored as zero */
    public void setReachPerMillion(final String val) {
        reach_per_million = stringToInt(val);
    }

    public int getReachRank() {
        return reach_rank;
    }

    /** @param val reach rank as text; unreadable input is stored as zero */
    public void setReachRank(final String val) {
        reach_rank = stringToInt(val);
    }

    public int getViewsPerUser() {
        return views_per_user;
    }

    /** @param val page views per user as text; unreadable input is stored as zero */
    public void setViewsPerUser(final String val) {
        views_per_user = stringToInt(val);
    }

    public int getViewsRank() {
        return views_rank;
    }

    /** @param val page views rank as text; unreadable input is stored as zero */
    public void setViewsRank(final String val) {
        views_rank = stringToInt(val);
    }

    /** Remove common number markups such as a comma and convert the result
     * to an <code>int</code> data type.  Surrounding whitespace is ignored.
     * @param val String value in need of conversion; may be null
     * @return value of the passed <code>String</code>, or zero if no value found.
     * Decimals are dropped, not rounded.
     */
    private int stringToInt(final String val) {
        if (val==null) {
            return 0;
        }
        String cleaned = val.trim();
        if (cleaned.length()==0) {
            return 0;
        }
        try {
            return NUMBER_PURIFY.parse(cleaned).intValue();
        }
        catch (java.text.ParseException e) {
            // text that does not start with a number counts as "no value"
            return 0;
        }
    }
}

RSSWriter.java

package alexa;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

import org.apache.xerces.parsers.DOMParser;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;

/** Writes an RSS file with properties specific to the Alexa feed
 *
 * @author Niall Kennedy
 * @version 1.0
 * @see <a href="http://blogs.law.harvard.edu/tech/rss">RSS 2.0 specification</a>
 */
public final class RSSWriter {
    /** absolute file name.  file named alexa.xml will be generated in user's home directory.
     */
    private final String filename;
    /** RFC 822 compliant time format.  Pinned to the US locale because the
     * day and month names in these dates are English whatever the default
     * locale of the machine is.
     */
    private final SimpleDateFormat RFC822 =
        new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z", java.util.Locale.US);
    /** time to live, set in minutes.
     * Set as a constant to 24 hours since Alexa uses daily stats  */
    private final int TTL = 1440;
    /** Feed document being built or updated; null until {@link #run()} has loaded or created it. */
    private Document doc;
    private final String title;
    private final String description;
    private final String link;
    private final String pubdate;

    /**
     *
     * @param title item title.  this is the RSS headline
     * @param description body of your message.
     * anything more than simple HTML may not be compatible with all aggregators
     * @param link full qualified hyperlink to the source page
     * @param pubdate date and time as published in the server's response
     */
    public RSSWriter(final String title, final String description, final String link, final String pubdate) {
        this.title = title;
        this.description = description;
        this.link = link;
        this.pubdate = pubdate;
        doc = null;
        StringBuffer filename = new StringBuffer(System.getProperty("user.home"));
        filename.append(File.separatorChar).append("alexa.xml");
        this.filename = filename.toString();
    }

    /** Adds the item to the existing feed file, or creates a new feed holding
     * the item when the file does not exist yet, and writes the result to disk.
     */
    public void run() {
        File f = new File(filename);
        if (f.exists() && f.isFile()) {
            DOMParser parser = new DOMParser();
            try {
                parser.parse(filename);
                doc = parser.getDocument();
                Element last_update =
                    (Element) doc.getElementsByTagName("lastBuildDate").item(0);
                long last_update_millis =
                    RFC822.parse(last_update.getFirstChild().getNodeValue()).getTime();
                long pubdate_millis = RFC822.parse(pubdate).getTime();
                // NOTE(review): the item is only added while the page date is less
                // than one TTL later than the feed's last build date; once the feed
                // is older than that nothing is added.  Confirm this is intended.
                long ttl_millis = TTL*60L*1000L;
                if ((pubdate_millis-last_update_millis)<ttl_millis) {
                    // add the item
                    Element channel = (Element) doc.getElementsByTagName("channel").item(0);
                    channel.appendChild(addItem());
                    Element new_update = doc.createElement("lastBuildDate");
                    new_update.appendChild(doc.createTextNode(RFC822.format(new Date())));
                    channel.replaceChild(new_update, last_update);
                }
            }
            catch (IOException e) {
                System.err.println("XML file import failed");
                e.printStackTrace();
            }
            catch (Exception e) {
                // malformed XML, a missing lastBuildDate or an unparseable date all end up here
                System.err.println("Parsing error");
                e.printStackTrace();
            }
        }
        else {
            try {
                doc = createBlankDocument();
            }
            catch (Exception e) {
                System.err.println("Parsing error in creation");
                e.printStackTrace();
            }
        }
        writeDocument();
    }

    /**
     * <p>Create a new DOM org.w3c.dom.Document object holding the channel
     * description and the item for this report.</p>
     *
     * @return a new DOM Document.
     * @throws ParserConfigurationException if no document builder can be created
     */
    private Document createBlankDocument() throws ParserConfigurationException {
        // Use Sun's Java API for XML Parsing (JAXP) to create the
        // DOM Document
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = dbf.newDocumentBuilder();
        doc = docBuilder.newDocument();
        Element root = doc.createElement("rss");
        root.setAttribute("version", "2.0");
        Element channel = doc.createElement("channel");

        // essential elements
        Element title = doc.createElement("title");
        title.appendChild(doc.createTextNode("Alexa Traffic Reporting Tool"));
        // the link node is required, but there is no parameter-less page for Alexa
        // so the required link field is set to Alexa's home page
        Element link = doc.createElement("link");
        link.appendChild(doc.createTextNode("http://www.alexa.com"));
        Element description = doc.createElement("description");
        description.appendChild(doc.createTextNode("Traffic analysis data from Alexa's toolbar"));

        // optional elements
        Element language = doc.createElement("language");
        language.appendChild(doc.createTextNode("en-us"));
        Element webmaster = doc.createElement("webMaster");
        webmaster.appendChild(doc.createTextNode("oreilly@niallkennedy.com"));
        Element copyright = doc.createElement("copyright");
        copyright.appendChild(doc.createTextNode("1996-2003, Alexa Internet, Inc."));
        Element generator = doc.createElement("generator");
        generator.appendChild(doc.createTextNode("Niall Kennedy's RSS tool"));
        Element docs = doc.createElement("docs");
        docs.appendChild(doc.createTextNode("http://blogs.law.harvard.edu/tech/rss"));
        Element ttl = doc.createElement("ttl");
        ttl.appendChild(doc.createTextNode(Integer.toString(TTL)));
        Element builddate = doc.createElement("lastBuildDate");
        builddate.appendChild(doc.createTextNode(RFC822.format(new Date())));

        //add them all
        channel.appendChild(title);
        channel.appendChild(link);
        channel.appendChild(description);
        channel.appendChild(language);
        channel.appendChild(webmaster);
        channel.appendChild(copyright);
        channel.appendChild(generator);
        channel.appendChild(docs);
        channel.appendChild(ttl);
        channel.appendChild(builddate);
        channel.appendChild(addItem());
        root.appendChild(channel);
        doc.appendChild(root);
        return doc;
    }

    /** Builds the <code>item</code> element for this report from the title,
     * description, link and publication date given to the constructor.
     *
     * @return the new element, created by but not yet attached to the document
     */
    private Element addItem() {
        Element item = doc.createElement("item");
        Element etitle = doc.createElement("title");
        etitle.appendChild(doc.createTextNode(title));
        Element edesc = doc.createElement("description");
        edesc.appendChild(doc.createTextNode(description));
        Element elink = doc.createElement("link");
        elink.appendChild(doc.createTextNode(link));
        Element epub = doc.createElement("pubDate");
        epub.appendChild(doc.createTextNode(pubdate));

        item.appendChild(etitle);
        item.appendChild(edesc);
        item.appendChild(elink);
        item.appendChild(epub);

        return item;
    }

    /** Serializes the document to the feed file as indented UTF-8.  Does
     * nothing but report the problem when no document could be loaded or created.
     */
    private void writeDocument() {
        if (doc==null) {
            // loading or creating the feed failed earlier; leave the file untouched
            System.out.println("Parse failed");
            System.out.println(filename);
            return;
        }
        FileOutputStream fout = null;
        try {
            OutputFormat fmt = new OutputFormat(doc, "UTF-8", true);
            fout = new FileOutputStream(filename);
            XMLSerializer serial = new XMLSerializer(fout, fmt);
            serial.serialize(doc.getDocumentElement());
        }
        catch (IOException e) {
            System.out.println("File write failed");
            System.out.println(filename);
            e.printStackTrace();
        }
        catch (Exception e) {
            System.out.println("Parse failed");
            e.printStackTrace();
        }
        finally {
            // close the file even when serialization failed
            if (fout!=null) {
                try {
                    fout.close();
                }
                catch (IOException e) {
                    System.out.println("File write failed");
                    System.out.println(filename);
                    e.printStackTrace();
                }
            }
        }
    }
}


O'Reilly Home | Privacy Policy

© 2007 O'Reilly Media, Inc.
Website: | Customer Service: | Book issues:

All trademarks and registered trademarks appearing on oreilly.com are the property of their respective owners.