The Code
Report.java
package alexa;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
/** Main class of the alexa package. Gathers data and composes the message.
* @author Niall Kennedy
* @version 1.0
*/
public final class Report {
private final DecimalFormat NUMBER_PRETTY = new DecimalFormat("#,##0");
private final String url;
private String generated;
private String name;
private StringBuffer body;
private TrafficBean bean;
public static void main(String [] args) {
String [] sites = null;
if (args.length>0) {
sites = args;
}
else {
// use a default set of sites
sites = {"oreilly.com", "manning.com",
"osborne.com", "wrox.com"};
}
for (int i=0; i<sites.length; ++i) {
Report r = new Report("http://" + sites[i]);
r.collectData();
r.writeData();
r = null;
}
sites = null;
}
/** It all starts here.
*
* @param url fully qualified Web address (example: http://www.google.com)
*/
public Report(final String url) {
this.url = url.toLowerCase().trim();
System.out.println("Processing " + url);
name = url;
body = new StringBuffer();
bean = new TrafficBean();
generated = null;
}
/** Gather data from various classes and compose the body of the message
*/
public void collectData() {
Website web = new Website(url);
Parse p = new Parse(Website.bodyFilter(web.retrieveSource()));
generated = web.getHeaderDate();
p.run();
name = p.getTitle();
bean = p.getBean();
body.append("<h1>Alexa Traffic Report for:</h1>");
body.append("<p>");
if (name!=null && !name.equalsIgnoreCase(url)) {
body.append("<strong>");
body.append(name).append("</strong><br />");
}
body.append("<a href=\"");
body.append(url).append("\">");
body.append(url).append("</a></p><br />");
body.append(showDestinations());
body.append(showReach());
body.append(showViews());
p = null;
web = null;
}
/** Writes data to RSS file
*/
public void writeData() {
RSSWriter rss = new RSSWriter(name, body.toString(), url, generated);
rss.run();
rss = null;
}
/** Show the top subdomains in order of decending popularity
*
* @return a paragraph detailing subdomain activity, or an empty <code>String</code> if none exist
*/
private String showDestinations() {
StringBuffer retval = new StringBuffer();
Map sites = bean.getSites();
if (sites!=null && sites.size()>0) {
retval.append("<a href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#web_hosts\"><font size=\"+1\">Most Popular Subdomains</font></a><br />");
ArrayList keys = new ArrayList(sites.keySet());
Collections.sort(keys);
for(int i=keys.size()-1; i>0; --i) {
int pct = Integer.parseInt(keys.get(i).toString());
String site = sites.get(new Integer(pct)).toString();
retval.append(pct);
retval.append(" % -- ");
if (site.equalsIgnoreCase("Other websites")) {
retval.append(site);
}
else {
retval.append("<a href=\"http://");
retval.append(site);
retval.append("\">");
retval.append(site);
retval.append("</a>");
}
retval.append("<br />");
site = null;
}
keys = null;
}
sites = null;
retval.append("<br />");
return retval.toString();
}
/** Show the total reach of the domain
*
* @return paragraph detailing reach per million and database rank,
* or empty <code>String if no data available
*/
private String showReach() {
StringBuffer retval = new StringBuffer();
int reach = bean.getReachPerMillion();
int reach_rank = bean.getReachRank();
if (reach>0 || reach_rank>0) {
retval.append("<a name=\"Learn More\" href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#reach\"><font size=\"+1\">Domain Reach</font></a><br />Reach per million : ");
retval.append(NUMBER_PRETTY.format(reach));
retval.append("<br />");
retval.append("Reach Rank : ");
retval.append(NUMBER_PRETTY.format(reach_rank));
retval.append("<br /><br />");
}
return retval.toString();
}
/** <p>Paragraph detailing how many pages an average viewer navigates to
* within the domain and how total views compare to the entire database of domains.</p>
*
* @return paragraph detailing a site's popularity and depth
*/
private String showViews() {
StringBuffer retval = new StringBuffer();
int views = bean.getViewsPerUser();
int views_rank = bean.getViewsRank();
if (views>0 || views_rank>0) {
retval.append("<a name=\"Learn More\" href=\"http://pages.alexa.com/prod_serv/traffic_learn_more.html#page_views\"><font size=\"+1\">Page Views</font></a><br />Page Views Per User : ");
retval.append(NUMBER_PRETTY.format(views));
retval.append("<br />Page Views Rank : ");
retval.append(NUMBER_PRETTY.format(views_rank));
}
return retval.toString();
}
}
Website.java
package alexa;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.HttpURLConnection;
/**
 * Opens an HTTP connection and pulls the source code of Alexa's traffic
 * detail page for one site.
 *
 * @author Niall Kennedy
 * @version 1.0
 */
public final class Website {
    /** Address of the traffic detail page; the site URL is appended as the query value. */
    private static final String BASEURL = "http://www.alexa.com/data/details/traffic_details?url=";
    /** Full address requested by retrieveSource(). */
    private final String url_location;
    /** Value of the Date response header; null until retrieveSource() has read it. */
    private String header_date;

    /**
     * @param url URL of a Web domain. ex: http://domain.tld
     */
    public Website(final String url) {
        this.url_location = BASEURL + url;
    }

    /**
     * @return the full address of the traffic detail page for this site
     */
    public String getURLLocation() {
        return url_location;
    }

    /**
     * @return the Date header supplied by the server, or null if it has not
     *         been retrieved
     */
    public String getHeaderDate() {
        return header_date;
    }

    /**
     * Retrieves the full source of the requested page and sets header_date
     * to the date supplied by the server. Lines are concatenated without
     * line separators. The reader and the connection are released on every
     * path, including failures.
     *
     * @return source code of the page located at url_location (whatever was
     *         read before an I/O error, possibly empty), or null if the
     *         address is malformed
     */
    public String retrieveSource() {
        StringBuffer source = new StringBuffer();
        HttpURLConnection connect = null;
        BufferedReader html = null;
        try {
            URL u = new URL(getURLLocation());
            connect = (HttpURLConnection) u.openConnection();
            // Masquerade as a common Web browser request
            connect.setRequestProperty("User-Agent", "Mozilla/5.0");
            connect.setUseCaches(false);
            header_date = connect.getHeaderField("Date");
            // NOTE(review): decodes with the platform default charset, as
            // before; confirm the page encoding before pinning one.
            html = new BufferedReader(new InputStreamReader(connect.getInputStream()));
            String line;
            while ((line = html.readLine()) != null) {
                source.append(line);
            }
        }
        catch (MalformedURLException e) {
            System.err.println("Malformed URL");
            System.err.println(url_location);
            return null;
        }
        catch (IOException e) {
            System.err.println("I/O Error");
            System.err.println(url_location);
            e.printStackTrace();
        }
        finally {
            if (html != null) {
                try {
                    html.close();
                }
                catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (connect != null) {
                connect.disconnect();
            }
        }
        return source.toString();
    }

    /**
     * Given the complete source, returns only the HTML body: everything from
     * the first "&lt;body" up to and including the last "body&gt;".
     *
     * @param fullpage complete source code of the provided destination
     * @return body of the HTML page, or null if fullpage is null or the
     *         lowercase body tags are not found in that order
     */
    public static String bodyFilter(final String fullpage) {
        if (fullpage == null) {
            return null;
        }
        final String start_tag = "<body";
        final String end_tag = "body>";
        int start = fullpage.indexOf(start_tag);
        int end = fullpage.lastIndexOf(end_tag);
        // The closing marker must come after the opening one; otherwise the
        // substring bounds would be inverted.
        if (start >= 0 && end > start) {
            return fullpage.substring(start, end + end_tag.length());
        }
        return null;
    }
}
Parse.java
package alexa;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Map;
/**
 * Parses an HTML <code>String</code> representing Alexa's traffic page.
 * The document is scanned front to back; each extraction continues where
 * the previous one stopped.
 *
 * @author Niall Kennedy
 * @version 1.0
 */
public final class Parse {
    /** Opening tag that precedes the site title. */
    private static final String TITLE_START = "<span class=\"title\">";
    /** Tag that starts one subdomain entry. */
    private static final String ITEM_TAG = "<li>";

    /** entire text to parse; may be null when no page could be retrieved */
    private final String document;
    /** Company associated with the given domain */
    private String title;
    /** Parsed data goes here */
    private TrafficBean bean;
    /** speed up the String search by keeping track of the areas already combed over */
    private int place;

    /**
     * @param document source code to parse through; null yields an empty result
     */
    public Parse(final String document) {
        this.document = document;
        bean = new TrafficBean();
        title = null;
        place = 0;
    }

    /**
     * Extracts the title, the subdomain list and the four statistics, in
     * the order in which they are searched for in the document.
     */
    public void run() {
        setTitle();
        System.out.println(title);
        bean.setSites(getSiteDomains());
        bean.setReachPerMillion(getTodayStat());
        bean.setReachRank(getTodayStat());
        bean.setViewsPerUser(getTodayStat());
        bean.setViewsRank(getTodayStat());
    }

    /**
     * @return the parsed title, or null if none was found
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the bean holding the parsed traffic figures
     */
    public TrafficBean getBean() {
        return bean;
    }

    /**
     * Narrows down the length of a search string by defining identifying
     * text occurring after the place index. On success the place index is
     * advanced to the position of end_text.
     *
     * @param start_text text marking the beginning of the snippet
     * @param end_text text marking the end of the snippet
     * @return the text from start_text (inclusive) to end_text (exclusive),
     *         or null if the document is null or either marker is missing
     */
    private String snipIt(final String start_text, final String end_text) {
        if (document == null || start_text == null || end_text == null) {
            return null;
        }
        int start_position = document.indexOf(start_text, place);
        if (start_position < 0) {
            return null;
        }
        int end_position = document.indexOf(end_text, start_position);
        if (end_position <= start_position) {
            return null;
        }
        place = end_position;
        return document.substring(start_position, end_position);
    }

    /**
     * Reads the site title. The snippet returned by snipIt() begins with the
     * opening tag, which is removed so the title is plain text.
     */
    private void setTitle() {
        String snip = snipIt(TITLE_START, "</span>");
        if (snip == null || !snip.startsWith(TITLE_START)) {
            title = null;
            return;
        }
        String text = snip.substring(TITLE_START.length()).trim();
        title = (text.length() > 0) ? text : null;
    }

    /**
     * Reads the list of subdomains and their share of visits.
     * Entries sharing the same percentage overwrite each other because the
     * percentage is the map key.
     *
     * @return unmodifiable map of percentage (<code>Integer</code>) to
     *         subdomain name; empty if the section is missing
     */
    private Map getSiteDomains() {
        Hashtable retval = new Hashtable();
        String snip = snipIt("<span class=\"titleO\">Where do people go on", "<hr size=\"1\">");
        if (snip == null) {
            return Collections.unmodifiableMap(retval);
        }
        // cycle through the list of subdomains
        int cursor = snip.indexOf(ITEM_TAG);
        while (cursor >= 0) {
            int name_start = cursor + ITEM_TAG.length();
            int tilde = snip.indexOf("~", name_start);
            if (tilde < 0) {
                break;
            }
            int bold = snip.indexOf("<b>", tilde);
            if (bold < 0) {
                break;
            }
            int pct_end = snip.indexOf("%</b>", bold);
            if (pct_end < 0) {
                // All searches run forward, so a missing marker means no
                // complete entry can follow; stop instead of rescanning.
                break;
            }
            String site = snip.substring(name_start, tilde).trim();
            try {
                // grab the number only
                Integer pct = Integer.valueOf(snip.substring(bold + 3, pct_end).trim());
                retval.put(pct, site);
            }
            catch (NumberFormatException ignored) {
                // entry without a whole-number percentage is skipped
            }
            cursor = snip.indexOf(ITEM_TAG, pct_end);
        }
        return Collections.unmodifiableMap(retval);
    }

    /**
     * Each table is formatted the same, so we can reuse the same method on each.
     *
     * @return today's statistic (row 1, column 1) of the next table in the
     *         document, or null if the table or the cell is not found
     */
    private String getTodayStat() {
        String snip = snipIt("<table", "</table>");
        if (snip == null) {
            return null;
        }
        final String tag = "</tr><tr><td class=\"bodyBold\" align=\"center\" bgcolor=\"#ffffff\">";
        int start = snip.indexOf(tag);
        if (start < 0) {
            return null;
        }
        start += tag.length();
        int end = snip.indexOf("</td>", start);
        if (end < 0) {
            return null;
        }
        return snip.substring(start, end);
    }
}
TrafficBean.java
package alexa;
import java.text.NumberFormat;
import java.util.Map;
/**
 * Standard entity bean style. Good for holding related data.
 * Numeric setters accept the text as it appears on the page and store
 * zero when it cannot be read as a number.
 *
 * @author Niall Kennedy
 * @version 1.0
 */
public final class TrafficBean {
    /**
     * Parses numbers written with grouping commas (example: 1,234).
     * Pinned to the US locale so the result does not depend on the default
     * locale of the machine running the report.
     */
    public final NumberFormat NUMBER_PURIFY = NumberFormat.getInstance(java.util.Locale.US);
    /** subdomains. key of percentage expressed as an <code>Integer</code> and
     * value of property name */
    private Map sites;
    /** The percentage of one million Internet users who visit the given site */
    private int reach_per_million;
    /** A ranking of all sites based solely on their reach */
    private int reach_rank;
    /** The number of pages viewed by Alexa Toolbar users.
     * Multiple page views of the same page made by the same user on the same day are counted only once.*/
    private int views_per_user;
    /** a ranking of all sites based solely on the total number of page views (not page views per user) */
    private int views_rank;

    /**
     * Initialize all variables
     */
    public TrafficBean() {
        clearAll();
    }

    /**
     * Clears all class variables: the subdomain map becomes null and every
     * number becomes zero.
     */
    public void clearAll() {
        sites = null;
        reach_per_million = 0;
        reach_rank = 0;
        views_per_user = 0;
        views_rank = 0;
    }

    /** @return the subdomain map, or null if none has been set */
    public Map getSites() {
        return sites;
    }

    /** @param val subdomain map keyed by percentage; stored as given */
    public void setSites(final Map val) {
        this.sites = val;
    }

    /** @return reach per million, or zero if unknown */
    public int getReachPerMillion() {
        return reach_per_million;
    }

    /** @param val reach per million as text; unreadable text is stored as zero */
    public void setReachPerMillion(final String val) {
        reach_per_million = stringToInt(val);
    }

    /** @return reach rank, or zero if unknown */
    public int getReachRank() {
        return reach_rank;
    }

    /** @param val reach rank as text; unreadable text is stored as zero */
    public void setReachRank(final String val) {
        reach_rank = stringToInt(val);
    }

    /** @return page views per user, or zero if unknown */
    public int getViewsPerUser() {
        return views_per_user;
    }

    /** @param val page views per user as text; unreadable text is stored as zero */
    public void setViewsPerUser(final String val) {
        views_per_user = stringToInt(val);
    }

    /** @return page views rank, or zero if unknown */
    public int getViewsRank() {
        return views_rank;
    }

    /** @param val page views rank as text; unreadable text is stored as zero */
    public void setViewsRank(final String val) {
        views_rank = stringToInt(val);
    }

    /**
     * Remove common number markups such as a comma and convert the result
     * to an <code>int</code> data type. Surrounding whitespace is ignored.
     *
     * @param val String value in need of conversion; may be null
     * @return value of the passed <code>String</code>, or zero if no value found.
     * Decimals are dropped, not rounded.
     */
    private int stringToInt(final String val) {
        if (val == null) {
            return 0;
        }
        // The formatter rejects leading whitespace, so strip it first.
        String text = val.trim();
        if (text.length() == 0) {
            return 0;
        }
        try {
            return NUMBER_PURIFY.parse(text).intValue();
        }
        catch (java.text.ParseException ignored) {
            // not a number: documented result is zero
            return 0;
        }
    }
}
RSSWriter.java
package alexa;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.apache.xerces.parsers.DOMParser;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
/**
 * Writes an RSS file with properties specific to the Alexa feed.
 * An existing feed that is still within its time to live gets the new item
 * appended; otherwise a fresh feed containing only the new item is written.
 *
 * @author Niall Kennedy
 * @version 1.0
 * @see <a href="http://blogs.law.harvard.edu/tech/rss">RSS 2.0 specification</a>
 */
public final class RSSWriter {
    /** absolute file name. file named alexa.xml will be generated in user's home directory.
     */
    private final String filename;
    /** RFC 822 compliant time format. Pinned to the US locale because the
     * day and month names of this format are English. */
    private final SimpleDateFormat RFC822 =
        new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z", java.util.Locale.US);
    /** time to live, set in minutes.
     * Set as a constant to 24 hours since Alexa uses daily stats */
    private final int TTL = 1440;
    /** Document being built or updated; null until run() has loaded or created it. */
    private Document doc;
    private final String title;
    private final String description;
    private final String link;
    private final String pubdate;

    /**
     *
     * @param title item title. this is the RSS headline
     * @param description body of your message.
     * anything more than simple HTML may not be compatible with all aggregators
     * @param link full qualified hyperlink to the source page
     * @param pubdate date and time as published in the server's response;
     * may be null, in which case the item carries no pubDate
     */
    public RSSWriter(final String title, final String description, final String link, final String pubdate) {
        this.title = title;
        this.description = description;
        this.link = link;
        this.pubdate = pubdate;
        doc = null;
        StringBuffer filename = new StringBuffer(System.getProperty("user.home"));
        filename.append(File.separatorChar).append("alexa.xml");
        this.filename = filename.toString();
    }

    /**
     * Loads the existing feed (or creates a new one), adds the item and
     * writes the result back to the feed file. Failures are reported on
     * standard error; nothing is written when no document is available.
     */
    public void run() {
        File f = new File(filename);
        if (f.exists() && f.isFile()) {
            DOMParser parser = new DOMParser();
            try {
                parser.parse(filename);
                doc = parser.getDocument();
                updateExistingDocument();
            }
            catch (IOException e) {
                System.err.println("XML file import failed");
                e.printStackTrace();
            }
            catch (Exception e) {
                System.err.println("Parsing error");
                e.printStackTrace();
            }
        }
        else {
            try {
                doc = createBlankDocument();
            }
            catch (Exception e) {
                System.err.println("Parsing error in creation");
                e.printStackTrace();
            }
        }
        writeDocument();
    }

    /**
     * Adds the item to the already loaded feed when the item was published
     * less than TTL minutes after the feed's lastBuildDate, and refreshes
     * lastBuildDate. A feed older than that is replaced by a fresh one, so
     * a stale file does not stay frozen forever. The feed is left unchanged
     * when it lacks a channel or lastBuildDate, or when no publication date
     * is available for the comparison.
     *
     * @throws java.text.ParseException if either date is not in RFC 822 format
     * @throws ParserConfigurationException if a fresh document cannot be created
     */
    private void updateExistingDocument()
            throws java.text.ParseException, ParserConfigurationException {
        Element channel = (Element) doc.getElementsByTagName("channel").item(0);
        Element last_update =
            (Element) doc.getElementsByTagName("lastBuildDate").item(0);
        if (channel == null || last_update == null || last_update.getFirstChild() == null) {
            System.err.println("Parsing error");
            System.err.println("channel or lastBuildDate missing in " + filename);
            return;
        }
        if (pubdate == null) {
            System.err.println("Parsing error");
            System.err.println("no publication date available, item not added");
            return;
        }
        long last_update_millis =
            RFC822.parse(last_update.getFirstChild().getNodeValue()).getTime();
        long pubdate_millis = RFC822.parse(pubdate).getTime();
        if ((pubdate_millis - last_update_millis) < (TTL * 60L * 1000L)) {
            // add the item
            channel.appendChild(addItem());
            Element new_update = doc.createElement("lastBuildDate");
            new_update.appendChild(doc.createTextNode(RFC822.format(new Date())));
            // Replace through the actual parent of the old node.
            last_update.getParentNode().replaceChild(new_update, last_update);
        }
        else {
            // time to live expired: start over with a feed holding only this item
            doc = createBlankDocument();
        }
    }

    /**
     * <p>Create a new DOM org.w3c.dom.Document object holding the channel
     * description and the item of this writer. The new document is also
     * stored in the doc field.</p>
     *
     * @return a new DOM Document.
     * @throws ParserConfigurationException if no document builder can be created
     */
    private Document createBlankDocument() throws ParserConfigurationException {
        // Use Sun's Java API for XML Parsing (JAXP) to create the
        // DOM Document
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = dbf.newDocumentBuilder();
        doc = docBuilder.newDocument();
        Element root = doc.createElement("rss");
        root.setAttribute("version", "2.0");
        Element channel = doc.createElement("channel");
        // essential elements
        channel.appendChild(textElement("title", "Alexa Traffic Reporting Tool"));
        // the link node is required, but there is no parameter-less page for Alexa
        // so the required link field is set to Alexa's home page
        channel.appendChild(textElement("link", "http://www.alexa.com"));
        channel.appendChild(textElement("description", "Traffic analysis data from Alexa's toolbar"));
        // optional elements
        channel.appendChild(textElement("language", "en-us"));
        channel.appendChild(textElement("webMaster", "oreilly@niallkennedy.com"));
        channel.appendChild(textElement("copyright", "1996-2003, Alexa Internet, Inc."));
        channel.appendChild(textElement("generator", "Niall Kennedy's RSS tool"));
        channel.appendChild(textElement("docs", "http://blogs.law.harvard.edu/tech/rss"));
        channel.appendChild(textElement("ttl", Integer.toString(TTL)));
        channel.appendChild(textElement("lastBuildDate", RFC822.format(new Date())));
        channel.appendChild(addItem());
        root.appendChild(channel);
        doc.appendChild(root);
        return doc;
    }

    /**
     * Builds the item element for this writer's title, description, link
     * and publication date. The pubDate element is left out when no
     * publication date is known.
     *
     * @return a new item element owned by the current document
     */
    private Element addItem() {
        Element item = doc.createElement("item");
        item.appendChild(textElement("title", title));
        item.appendChild(textElement("description", description));
        item.appendChild(textElement("link", link));
        if (pubdate != null) {
            item.appendChild(textElement("pubDate", pubdate));
        }
        return item;
    }

    /**
     * Creates an element containing a single text node.
     *
     * @param tag element name
     * @param value text content; null is written as an empty string
     * @return the new element, owned by the current document
     */
    private Element textElement(final String tag, final String value) {
        Element element = doc.createElement(tag);
        element.appendChild(doc.createTextNode(value == null ? "" : value));
        return element;
    }

    /**
     * Serializes the current document to the feed file as indented UTF-8.
     * Does nothing but report the problem when there is no document.
     */
    private void writeDocument() {
        if (doc == null) {
            System.err.println("Parse failed");
            System.err.println(filename);
            return;
        }
        FileOutputStream fout = null;
        try {
            OutputFormat fmt = new OutputFormat(doc, "UTF-8", true);
            fout = new FileOutputStream(filename);
            XMLSerializer serial = new XMLSerializer(fout, fmt);
            serial.serialize(doc.getDocumentElement());
        }
        catch (IOException e) {
            System.err.println("File write failed");
            System.err.println(filename);
            e.printStackTrace();
        }
        catch (Exception e) {
            System.err.println("Parse failed");
            e.printStackTrace();
        }
        finally {
            // release the file handle on every path
            if (fout != null) {
                try {
                    fout.close();
                }
                catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }
}