Retrieving the links in an HTML document in Java

Swing's HTMLEditorKit includes an HTML parser. Have it parse the page into a document, then walk the document's elements and collect the href attribute of every “A” (anchor) tag:

import javax.swing.text.html.*;
import javax.swing.text.*;
import java.util.*;
import java.net.*;
import java.io.*;
     
/**
 * Lists the hyperlink targets found in an HTML page, using the HTML parser
 * that ships with Swing's {@link HTMLEditorKit}.
 */
public class Main
{
   /** Charset used when the response does not declare a usable one. */
   private static final java.nio.charset.Charset FALLBACK_CHARSET =
         java.nio.charset.StandardCharsets.UTF_8;

   /** Name of the Content-Type parameter that carries the charset. */
   private static final String CHARSET_PARAM = "charset=";

   /**
    * Fetches a fixed page and prints each link target on its own line.
    *
    * @param args ignored
    * @throws Exception if the page cannot be opened
    */
   public static void main(String[] args) throws Exception {
      Vector<String> links = getLinks(new URL("http://www.google.com"));

      for (String link : links) {
         System.out.println(link);
      }

      // Explicit exit kept from the original program.
      // NOTE(review): presumably here so the JVM terminates even if the Swing
      // classes started non-daemon threads - confirm before removing.
      System.exit(0);
   }

   /**
    * Downloads the document at {@code url}, parses it as HTML and returns the
    * {@code href} value of every anchor ("A") element, in document order.
    * Anchors without an {@code href} (for example {@code <a name="top">}) are
    * skipped, so the result never contains {@code null}.
    *
    * <p>The return type stays {@link Vector} so existing callers keep
    * compiling.
    *
    * <p>Failures while reading or parsing the content are reported on
    * {@code System.err} and produce an empty result; failures while opening
    * the connection are propagated to the caller.
    *
    * @param url location of the HTML document
    * @return the link targets found; empty if there are none or if the
    *         content could not be read
    * @throws Exception if the connection to {@code url} cannot be opened
    */
   public static Vector<String> getLinks(URL url) throws Exception {
      Vector<String> links = new Vector<String>();

      URLConnection conn = url.openConnection();
      // Opened outside the try block on purpose: connection errors must reach
      // the caller instead of being reported as "no links".
      InputStream in = conn.getInputStream();
      java.nio.charset.Charset charset = charsetOf(conn);

      HTMLEditorKit editorKit = new HTMLEditorKit();
      Document doc = editorKit.createDefaultDocument();

      // The bytes are already decoded by the Reader below, so tell the parser
      // to ignore any charset declared inside the HTML itself instead of
      // aborting the parse when it meets one.
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

      // Closing the reader also closes the underlying connection stream.
      try (Reader reader = new BufferedReader(new InputStreamReader(in, charset))) {
         editorKit.read(reader, doc, 0);

         // Anchors are stored as an attribute, keyed by HTML.Tag.A, on the
         // elements they cover; the value is the anchor's own attribute set.
         // NOTE(review): an anchor covering several elements is visited once
         // per element, so its href can appear more than once - confirm
         // whether callers want de-duplication.
         ElementIterator it = new ElementIterator(doc);
         for (Element elem = it.next(); elem != null; elem = it.next()) {
            Object anchor = elem.getAttributes().getAttribute(HTML.Tag.A);
            if (!(anchor instanceof AttributeSet)) {
               continue;
            }
            Object href = ((AttributeSet) anchor).getAttribute(HTML.Attribute.HREF);
            if (href != null) {
               links.addElement(href.toString());
            }
         }
      } catch (IOException | BadLocationException e) {
         // Best effort, as before: report the problem and return an empty list.
         e.printStackTrace();
      }

      return links;
   }

   /**
    * Picks the charset for decoding the response: the {@code charset}
    * parameter of the Content-Type when present and supported, otherwise
    * UTF-8.
    */
   private static java.nio.charset.Charset charsetOf(URLConnection conn) {
      String contentType = conn.getContentType();
      if (contentType == null) {
         return FALLBACK_CHARSET;
      }

      String[] params = contentType.split(";");
      for (int i = 0; i < params.length; i++) {
         String param = params[i].trim();
         if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
            continue;
         }
         String name = param.substring(CHARSET_PARAM.length()).trim();
         if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
         }
         try {
            return java.nio.charset.Charset.forName(name);
         } catch (IllegalArgumentException ignored) {
            // Malformed or unsupported charset name: use the fallback.
            return FALLBACK_CHARSET;
         }
      }
      return FALLBACK_CHARSET;
   }
}