Sometimes we need to parse an HTML file. In this example I use jsoup, a Java HTML parser, because it is simple to configure and easy to understand, and it offers many features that can be customized to our requirements. http://jsoup.org/
We used the jsoup jar (version 1.6.1). To run this example, download jsoup-1.6.1.jar and add it to your classpath.
We can either fetch the document from an HTTP URL or read it from an HTML file, according to our requirements.
The parse(File in, String charsetName) method loads and parses an HTML file.
File input = new File("/tmp/input.html");
Document doc = Jsoup.parse(input,"UTF-8");
The connect(String url) method creates a new Connection, and get() fetches and parses the HTML document at that URL.
Document doc = Jsoup.connect("http://example.com/").get();
You can also parse a document from a String:
String html = "<html><head><title>First parse</title></head>"
+ "<body><p>Parsed HTML into a doc.</p></body></html>";
Document doc = Jsoup.parse(html);
Let's look at the API for jsoup: http://jsoup.org/apidocs/
Let's see some sample code; you can customize it according to your requirements. Just add jsoup-1.6.1.jar to your classpath, provide an HTML file to read, and run the program.
/* SimpleHTMLParser.java */
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches an HTML document with jsoup and prints an indented outline of its
 * element tree to standard output.
 *
 * <p>For each element the outline shows either the tag name followed by its
 * text (leaf elements with text), the tag name alone followed by its children
 * (elements whose subtree contains text), or the tag name followed by its
 * attributes (elements whose subtree contains no text).
 */
public class SimpleHTMLParser {

    /**
     * Loads the sample page and prints the outline of its head, then of its body.
     *
     * @param args command-line arguments (unused)
     * @throws IOException propagated from the jsoup fetch of the page
     */
    public static void main(String[] args) throws IOException {
        // To read from a local file instead of a URL, use:
        //   File htmlFile = new File("C:/temp/VariableDeclarations.html");
        //   Document doc = Jsoup.parse(htmlFile, "UTF-8");
        Document doc = Jsoup.connect("http://localhost:8080/temp/VariableDeclarations.html").get();

        // Top-level elements start from an empty indent; parseElements adds one space per level.
        String indent = "";

        Element headElement = doc.head();
        parseElements(headElement.children(), indent);

        Element bodyElement = doc.body();
        parseElements(bodyElement.children(), indent);
    }

    /**
     * Prints one outline line per element and recurses into child elements.
     *
     * <p>Each nesting level is indented by one additional space relative to
     * {@code indent}.
     *
     * @param allElements the sibling elements to print; an empty collection prints nothing
     * @param indent      the indentation of the parent level
     */
    public static void parseElements(Elements allElements, String indent) {
        String childIndent = indent + " ";

        for (Element element : allElements) {
            Elements children = element.children();

            if (element.hasText()) {
                if (children.size() > 0) {
                    // Text lives somewhere below this element: print the tag and descend.
                    System.out.println(childIndent + element.tagName());
                    parseElements(children, childIndent);
                } else {
                    // Leaf element: print the tag together with its text.
                    System.out.println(childIndent + element.tagName() + " : " + element.text());
                }
            } else {
                // No text in this subtree: list the element's attributes instead.
                System.out.print(childIndent + element.tagName() + " : Attributes :: ");
                for (Attribute attr : element.attributes()) {
                    System.out.print(attr.getKey() + " - " + attr.getValue() + ", ");
                }
                // Terminate the attribute line.
                System.out.println("");

                // A text-less element can still have children (e.g. <div><img src="..."></div>);
                // descend so that those children and their attributes are not silently skipped.
                parseElements(children, childIndent);
            }
        }
    }
}
We used the jsoup jar (version 1.6.1). To run this example, download jsoup-1.6.1.jar and add it to your classpath.
We can either fetch the document from an HTTP URL or read it from an HTML file, according to our requirements.
The parse(File in, String charsetName) method loads and parses an HTML file.
File input = new File("/tmp/input.html");
Document doc = Jsoup.parse(input,"UTF-8");
The connect(String url) method creates a new Connection, and get() fetches and parses the HTML document at that URL.
Document doc = Jsoup.connect("http://example.com/").get();
You can also parse a document from a String:
String html = "<html><head><title>First parse</title></head>"
+ "<body><p>Parsed HTML into a doc.</p></body></html>";
Document doc = Jsoup.parse(html);
Let's look at the API for jsoup: http://jsoup.org/apidocs/
Let's see some sample code; you can customize it according to your requirements. Just add jsoup-1.6.1.jar to your classpath, provide an HTML file to read, and run the program.
/* SimpleHTMLParser.java */
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches an HTML document with jsoup and prints an indented outline of its
 * element tree to standard output.
 *
 * <p>For each element the outline shows either the tag name followed by its
 * text (leaf elements with text), the tag name alone followed by its children
 * (elements whose subtree contains text), or the tag name followed by its
 * attributes (elements whose subtree contains no text).
 */
public class SimpleHTMLParser {

    /**
     * Loads the sample page and prints the outline of its head, then of its body.
     *
     * @param args command-line arguments (unused)
     * @throws IOException propagated from the jsoup fetch of the page
     */
    public static void main(String[] args) throws IOException {
        // To read from a local file instead of a URL, use:
        //   File htmlFile = new File("C:/temp/VariableDeclarations.html");
        //   Document doc = Jsoup.parse(htmlFile, "UTF-8");
        Document doc = Jsoup.connect("http://localhost:8080/temp/VariableDeclarations.html").get();

        // Top-level elements start from an empty indent; parseElements adds one space per level.
        String indent = "";

        Element headElement = doc.head();
        parseElements(headElement.children(), indent);

        Element bodyElement = doc.body();
        parseElements(bodyElement.children(), indent);
    }

    /**
     * Prints one outline line per element and recurses into child elements.
     *
     * <p>Each nesting level is indented by one additional space relative to
     * {@code indent}.
     *
     * @param allElements the sibling elements to print; an empty collection prints nothing
     * @param indent      the indentation of the parent level
     */
    public static void parseElements(Elements allElements, String indent) {
        String childIndent = indent + " ";

        for (Element element : allElements) {
            Elements children = element.children();

            if (element.hasText()) {
                if (children.size() > 0) {
                    // Text lives somewhere below this element: print the tag and descend.
                    System.out.println(childIndent + element.tagName());
                    parseElements(children, childIndent);
                } else {
                    // Leaf element: print the tag together with its text.
                    System.out.println(childIndent + element.tagName() + " : " + element.text());
                }
            } else {
                // No text in this subtree: list the element's attributes instead.
                System.out.print(childIndent + element.tagName() + " : Attributes :: ");
                for (Attribute attr : element.attributes()) {
                    System.out.print(attr.getKey() + " - " + attr.getValue() + ", ");
                }
                // Terminate the attribute line.
                System.out.println("");

                // A text-less element can still have children (e.g. <div><img src="..."></div>);
                // descend so that those children and their attributes are not silently skipped.
                parseElements(children, childIndent);
            }
        }
    }
}