Retrieving the links in an HTML document

LinkExtractor.java:

package htmltools;
 
import java.net.URL;
import java.net.InetAddress;
import java.net.MalformedURLException;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;

import java.util.Collection;
import java.util.ArrayList;

import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.*;

/**
 * Downloads an HTML document over HTTP and collects the <code>href</code>
 * targets found in it, split into two lists:
 * <ul>
 *   <li><em>external</em> links - hrefs that carry an explicit
 *       <code>http:</code> or <code>https:</code> scheme;</li>
 *   <li><em>local</em> links - every other href (relative paths, fragments,
 *       and also other schemes such as <code>mailto:</code>).</li>
 * </ul>
 * The document is fetched and parsed once, in the constructor. Each list
 * holds every distinct href once, in the order of first appearance.
 */
public class LinkExtractor
{
    /** The fully qualified URL the document was read from. */
    private final URL m_zURL;

    /** Parser callback that accumulates the links while the document is parsed. */
    private final CallbackHandler m_zHandler;

    /**
     * Resolves the given address to a URL, downloads the document and
     * extracts its links.
     * You can provide addresses in the following forms:
     * <pre>
     * http://www.something.ext
     * www.something.ext
     * something.ext
     * </pre>
     * An address starting with <code>www.</code> gets <code>http://</code>
     * prepended. An address with neither an <code>http:</code>/<code>https:</code>
     * scheme nor a <code>www.</code> prefix is assumed to name a document
     * served by the local host and is opened as
     * <code>http://&lt;local host name&gt;/&lt;address&gt;</code>.
     * This does not cover every case (for example <code>ftp:</code> URLs or
     * host names typed without <code>www.</code>).
     *
     * @param sURL the address of the HTML document; must not be null
     * @throws MalformedURLException if the resolved address is not a valid URL
     * @throws IOException if the local host name cannot be determined or the
     *         document cannot be opened or read
     * @throws NullPointerException if <code>sURL</code> is null
     */
    public LinkExtractor(String sURL) throws MalformedURLException, IOException {
        if (null == sURL) {
            throw new NullPointerException("sURL must not be null");
        }

        /* End-users don't like typing http, so we'll give them a hand. */
        if (sURL.startsWith("www.")) {
            sURL = "http://" + sURL;
        } else if (!hasHttpScheme(sURL)) {
            /* No http(s) scheme and no leading "www.": look for the
               document on the local host. Testing for the scheme (with its
               colon) rather than the bare prefix "http" keeps names such
               as "httpd.html" on this branch. */
            InetAddress zAddr = InetAddress.getLocalHost();
            sURL = "http://" + zAddr.getHostName() + "/" + sURL;
        }

        m_zURL = new URL(sURL);
        m_zHandler = new CallbackHandler();
        parse();
    }

    /**
     * Returns the external links found in the document.
     *
     * @return the extractor's own <code>ArrayList</code> of <code>String</code>
     *         hrefs that have an http/https scheme; never null, possibly
     *         empty. The list is not copied, so changes made by the caller
     *         are visible to later calls.
     */
    public Collection getExternalLinks() {
        return m_zHandler.m_clExternalLinks;
    }

    /**
     * Returns the local links found in the document.
     *
     * @return the extractor's own <code>ArrayList</code> of <code>String</code>
     *         hrefs that have no http/https scheme; never null, possibly
     *         empty. The list is not copied, so changes made by the caller
     *         are visible to later calls.
     */
    public Collection getLocalLinks() {
        return m_zHandler.m_clLocalLinks;
    }

    /**
     * Opens the URL and feeds the document to the Swing HTML parser, which
     * reports every start tag to the callback handler.
     *
     * @throws IOException if the connection cannot be opened or read
     */
    private void parse() throws IOException {
        // NOTE(review): the bytes are decoded with the platform default
        // charset; the page's declared charset is ignored (third argument).
        BufferedReader zReader = new BufferedReader(
            new InputStreamReader(m_zURL.openStream()));
        try {
            new ParserDelegator().parse(zReader, m_zHandler, true);
        } finally {
            // Release the connection even when parsing fails half-way.
            zReader.close();
        }
    }

    /**
     * Tells whether the text starts with an <code>http:</code> or
     * <code>https:</code> scheme, ignoring case.
     */
    private static boolean hasHttpScheme(String sText) {
        return sText.regionMatches(true, 0, "http:", 0, 5)
            || sText.regionMatches(true, 0, "https:", 0, 6);
    }

    /**
     * Parser callback that records the href of each relevant start tag.
     * Text and end tags are left to the no-op implementations inherited
     * from <code>HTMLEditorKit.ParserCallback</code>.
     */
    private static class CallbackHandler extends HTMLEditorKit.ParserCallback
    {
        /** Distinct hrefs with an http/https scheme, in order of appearance. */
        ArrayList m_clExternalLinks;

        /** Distinct hrefs without an http/https scheme, in order of appearance. */
        ArrayList m_clLocalLinks;

        public CallbackHandler() {
            m_clExternalLinks = new ArrayList();
            m_clLocalLinks = new ArrayList();
        }

        /**
         * Invoked when a start tag is encountered. For <code>A</code> and
         * <code>ADDRESS</code> tags the href attribute, if any, is added to
         * the external or the local list unless it is already there.
         */
        public void handleStartTag(HTML.Tag zTag,
                                   MutableAttributeSet zAttributes,
                                   int iPosition) {
            if (!zTag.equals(HTML.Tag.A) && !zTag.equals(HTML.Tag.ADDRESS)) {
                return;
            }

            Object zValue = zAttributes.getAttribute(HTML.Attribute.HREF);
            if (!(zValue instanceof String)) {
                // No href on this tag (e.g. a named anchor).
                return;
            }

            String sLink = (String) zValue;
            ArrayList clTarget =
                hasHttpScheme(sLink) ? m_clExternalLinks : m_clLocalLinks;
            if (!clTarget.contains(sLink)) {
                clTarget.add(sLink);
            }
        }
    }

    /**
     * Test driver: prints the external links, then the local links, of the
     * document named by the first argument, one per line.
     *
     * @param asArgs command-line arguments; <code>asArgs[0]</code> is the address
     * @throws Exception if the document cannot be fetched or parsed
     */
    public static void main(String[] asArgs) throws Exception {
        if (asArgs.length < 1) {
            System.err.println("Usage: java htmltools.LinkExtractor <URL>");
            System.exit(1);
        }

        LinkExtractor zExtractor = new LinkExtractor(asArgs[0]);
        printAll(zExtractor.getExternalLinks());
        printAll(zExtractor.getLocalLinks());
    }

    /** Prints each element of the collection on its own line. */
    private static void printAll(Collection clLinks) {
        Object[] aoLinks = clLinks.toArray();
        for (int i = 0; i < aoLinks.length; i++) {
            System.out.println(aoLinks[i]);
        }
    }
}