JavaのHTMLパーサとして今までCyberNeko HTML Parserを使っていたが、ページによってはパースエラーが出てしまうことがあった。そこで、できるだけパースエラーが起きない簡単なHTMLパーサを自分で作ることにした。使い方は以下のとおり。
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Set;

/**
 * Usage example for {@code HTMLReader}: downloads one page and dumps every
 * element the reader produces to standard output.
 *
 * <p>Text and comment elements are printed via {@code toString()}; for every
 * other tag the tag name is printed, followed by one indented
 * {@code name = value} line per attribute.
 */
public class Test {

    /**
     * Fetches the page, parses it element by element and prints the result.
     *
     * @param args ignored
     * @throws IOException if the connection cannot be opened or reading fails
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://d.hatena.ne.jp/d-kami/");
        URLConnection connection = url.openConnection();

        // The response bytes are decoded as EUC-JP.
        InputStreamReader isr = new InputStreamReader(connection.getInputStream(), "EUC-JP");
        BufferedReader br = new BufferedReader(isr);

        // try/finally so the stream is released even when parsing or printing
        // throws; previously close() was only reached on the success path.
        try {
            HTMLReader reader = new HTMLReader(br);
            HTMLElement element;

            while ((element = reader.readNextElement()) != null) {
                if (element.isText() || element.isCommentTag()) {
                    System.out.println(element);
                } else if (element.isHTMLTag()) {
                    System.out.println(element.getName());

                    Set<String> nameSet = element.getAttributeNameSet();
                    for (String name : nameSet) {
                        System.out.printf("\t%s = %s\n", name, element.getAttribute(name));
                    }
                }
            }
        } finally {
            br.close();
        }
    }
}
HTMLReaderのソース
import java.io.Reader;
import java.io.IOException;

/**
 * A deliberately forgiving, streaming HTML tokenizer.
 *
 * <p>Each call to {@link #readNextElement()} consumes characters from the
 * wrapped {@link Reader} and returns the next text run, comment or tag as an
 * {@code HTMLElement}. Malformed input never raises a parse error; it is
 * simply turned into whatever elements the state machine can make of it.
 *
 * <p>Known limitations (by design, to keep the parser simple): a {@code '>'}
 * inside a quoted attribute value ends the tag, the contents of script/style
 * elements are not treated specially, and a trailing {@code '/'} of a
 * self-closing tag is not stripped.
 */
public class HTMLReader {
    private final Reader reader;

    /** State of the character-level reader, carried across calls. */
    private int state;
    private StringBuilder text = null;
    private StringBuilder tag = null;

    // States used by readNextElement().
    private static final int NO_STATE = 0;
    private static final int FOUND_TEXT = 1;
    private static final int FOUND_TAG = 2;

    // States used by parseTag() (NO_STATE is shared as its initial state).
    private static final int FOUND_TAG_NAME = 3;
    private static final int FIND_ATTR_NAME = 4;
    private static final int FOUND_ATTR_NAME = 5;
    private static final int END_ATTR_NAME = 6;
    private static final int FIND_ATTR_VALUE = 7;
    private static final int FOUND_ATTR_VALUE = 8;

    /**
     * Creates a tokenizer that reads from the given reader. The reader is
     * neither buffered nor closed by this class.
     *
     * @param reader source of HTML characters
     */
    public HTMLReader(Reader reader) {
        this.reader = reader;
        this.state = NO_STATE;
    }

    /**
     * Reads and returns the next element from the input.
     *
     * @return the next text, comment or tag element, or {@code null} once the
     *         input is exhausted
     * @throws IOException if the underlying reader fails
     */
    public HTMLElement readNextElement() throws IOException {
        int n;

        while ((n = reader.read()) != -1) {
            char c = (char) n;

            switch (state) {
            case NO_STATE:
                if (c == '<') {
                    state = FOUND_TAG;
                    tag = new StringBuilder();
                } else {
                    state = FOUND_TEXT;
                    text = new StringBuilder();
                    text.append(c);
                }
                break;

            case FOUND_TEXT:
                if (c == '<') {
                    // The '<' that ends this text run already opens the next
                    // tag, so remember that before returning the text.
                    state = FOUND_TAG;
                    tag = new StringBuilder();
                    return createTextElement(text.toString());
                }
                text.append(c);
                break;

            case FOUND_TAG:
                // A '>' inside a comment whose closing "--" has not been seen
                // yet is comment content, not the end of the tag.
                if (c == '>' && !isUnterminatedComment(tag)) {
                    state = NO_STATE;
                    return createTagElement(tag.toString());
                }
                tag.append(c);
                break;

            default:
                break;
            }
        }

        // End of input: flush whatever was being collected.
        if (state == FOUND_TEXT) {
            state = NO_STATE;
            return createTextElement(text.toString());
        } else if (state == FOUND_TAG) {
            state = NO_STATE;
            return createTagElement(tag.toString());
        }

        return null;
    }

    /**
     * Tells whether the buffer holds a comment that has been opened with
     * {@code !--} but does not (yet) end with {@code --}.
     */
    private static boolean isUnterminatedComment(CharSequence buffer) {
        int length = buffer.length();
        boolean opensComment = length >= 3
                && buffer.charAt(0) == '!'
                && buffer.charAt(1) == '-'
                && buffer.charAt(2) == '-';
        if (!opensComment) {
            return false;
        }
        return !(buffer.charAt(length - 1) == '-' && buffer.charAt(length - 2) == '-');
    }

    /** Builds the element for a run of plain text. */
    private HTMLElement createTextElement(String content) {
        HTMLElement element = new HTMLElement();
        element.setText(content);
        element.setHTMLTag(false);
        return element;
    }

    /**
     * Builds the element for the text found between {@code '<'} and
     * {@code '>'}: a comment element if it starts with {@code !--} (including
     * a comment left unterminated at end of input), otherwise a parsed tag.
     */
    private HTMLElement createTagElement(String htmlTag) {
        if (htmlTag.startsWith("!--")) {
            HTMLElement element = new HTMLElement();
            element.setHTMLTag(true);
            element.setCommentTag(true);
            element.setText(htmlTag);
            // setText() resets the name, so the name must be set afterwards.
            element.setName("Comment");
            return element;
        }
        return parseTag(htmlTag);
    }

    /**
     * Splits the inside of a tag (without the surrounding angle brackets)
     * into a tag name and attributes.
     *
     * <p>Any whitespace character separates tokens. Attribute values may be
     * single-quoted, double-quoted or unquoted; an attribute without a value
     * is stored with its own name as the value. A name starting with
     * {@code '/'} marks the element as an end tag.
     *
     * @param tag tag content between {@code '<'} and {@code '>'}
     * @return the parsed element
     */
    private HTMLElement parseTag(String tag) {
        HTMLElement element = new HTMLElement();
        element.setHTMLTag(true);

        int parseTagState = NO_STATE;
        StringBuilder tagName = null;
        StringBuilder attrName = null;
        StringBuilder attrValue = null;
        // Terminator of the current attribute value: the opening quote, or
        // ' ' as a marker for "unquoted, ends at the next whitespace".
        char endChar = ' ';

        for (int i = 0; i < tag.length(); i++) {
            char c = tag.charAt(i);
            boolean blank = Character.isWhitespace(c);

            switch (parseTagState) {
            case NO_STATE:
                if (!blank) {
                    tagName = new StringBuilder();
                    tagName.append(c);
                    parseTagState = FOUND_TAG_NAME;
                }
                break;

            case FOUND_TAG_NAME:
                if (blank) {
                    element.setName(tagName.toString());
                    parseTagState = FIND_ATTR_NAME;
                } else {
                    tagName.append(c);
                }
                break;

            case FIND_ATTR_NAME:
                if (!blank) {
                    attrName = new StringBuilder();
                    attrName.append(c);
                    parseTagState = FOUND_ATTR_NAME;
                }
                break;

            case FOUND_ATTR_NAME:
                if (blank) {
                    parseTagState = END_ATTR_NAME;
                } else if (c == '=') {
                    parseTagState = FIND_ATTR_VALUE;
                } else {
                    attrName.append(c);
                }
                break;

            case END_ATTR_NAME:
                if (blank) {
                    break;
                }
                if (c == '=') {
                    parseTagState = FIND_ATTR_VALUE;
                } else {
                    // The previous attribute had no value. The current
                    // character already belongs to the next attribute name
                    // and must not be dropped.
                    setValuelessAttribute(element, attrName);
                    attrName = new StringBuilder();
                    attrName.append(c);
                    parseTagState = FOUND_ATTR_NAME;
                }
                break;

            case FIND_ATTR_VALUE:
                if (blank) {
                    break;
                }
                attrValue = new StringBuilder();
                if (c == '\'' || c == '\"') {
                    endChar = c;
                } else {
                    endChar = ' ';
                    attrValue.append(c);
                }
                parseTagState = FOUND_ATTR_VALUE;
                break;

            case FOUND_ATTR_VALUE: {
                boolean valueEnds = (endChar == ' ') ? blank : (c == endChar);
                if (valueEnds) {
                    element.setAttribute(attrName.toString(), attrValue.toString());
                    parseTagState = FIND_ATTR_NAME;
                } else {
                    attrValue.append(c);
                }
                break;
            }

            default:
                break;
            }
        }

        // Flush the token that was still being collected when the tag ended.
        switch (parseTagState) {
        case FOUND_TAG_NAME:
            element.setName(tagName.toString());
            break;
        case FOUND_ATTR_NAME:
        case END_ATTR_NAME:
        case FIND_ATTR_VALUE:
            setValuelessAttribute(element, attrName);
            break;
        case FOUND_ATTR_VALUE:
            element.setAttribute(attrName.toString(), attrValue.toString());
            break;
        default:
            break;
        }

        String elementName = element.getName();
        if (elementName.length() > 0 && elementName.charAt(0) == '/') {
            element.setEndTag(true);
        } else {
            element.setStartTag(true);
        }

        return element;
    }

    /** Stores an attribute that has no value, using its name as the value. */
    private static void setValuelessAttribute(HTMLElement element, StringBuilder attrName) {
        String attr = attrName.toString();
        element.setAttribute(attr, attr);
    }
}
HTMLElementのソース
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;

/**
 * One token of an HTML document: a text run, a comment, or a start/end tag
 * with its attributes.
 *
 * <p>Names are stored upper-cased and attribute keys lower-cased, both with
 * locale-independent rules, so lookups behave the same on every platform.
 */
public class HTMLElement {
    private String name;
    private String text;
    /** Attribute name (lower case) to value, in insertion order. */
    private Map<String, String> attributes;
    private boolean isHTMLTag;
    private boolean isStartTag;
    private boolean isEndTag;
    private boolean isCommentTag;

    private static final String EMPTY_TEXT = "";

    /** Creates an element with an empty name, no text and no attributes. */
    public HTMLElement() {
        name = EMPTY_TEXT;
        // LinkedHashMap keeps attributes in the order they were added, which
        // makes getAttributeNameSet() and toString() deterministic.
        attributes = new LinkedHashMap<String, String>();
    }

    /**
     * Sets the element name; it is stored in upper case.
     *
     * @param name tag name (an end tag keeps its leading {@code '/'})
     */
    public void setName(String name) {
        // Locale.ROOT avoids locale-specific mappings such as the Turkish
        // dotted capital I for "i".
        this.name = name.toUpperCase(Locale.ROOT);
    }

    /** @return the upper-cased element name, or an empty string if unset */
    public String getName() {
        return this.name;
    }

    /**
     * Adds or replaces an attribute. The key is stored in lower case.
     *
     * @param key   attribute name, case-insensitive
     * @param value attribute value
     */
    public void setAttribute(String key, String value) {
        attributes.put(key.toLowerCase(Locale.ROOT), value);
    }

    /**
     * Looks up an attribute value.
     *
     * @param key attribute name, case-insensitive
     * @return the value, or {@code null} if the attribute is absent or
     *         {@code key} is {@code null}
     */
    public String getAttribute(String key) {
        if (key == null) {
            return null;
        }
        // Keys are stored lower-cased, so normalise the lookup key likewise.
        return attributes.get(key.toLowerCase(Locale.ROOT));
    }

    /**
     * Sets the text content and, as a side effect, resets the name to
     * {@code "TEXT"}. Callers that need another name must call
     * {@link #setName(String)} afterwards.
     *
     * @param text text content
     */
    public void setText(String text) {
        this.text = text;
        setName("Text");
    }

    /** @return the text content, or {@code null} if none was set */
    public String getText() {
        return this.text;
    }

    /** @return {@code true} if this element is a tag (including comments) */
    public boolean isHTMLTag() {
        return isHTMLTag;
    }

    /** @return {@code true} if this element is not a tag */
    public boolean isText() {
        return !isHTMLTag;
    }

    public void setHTMLTag(boolean isHTMLTag) {
        this.isHTMLTag = isHTMLTag;
    }

    /** @return {@code true} if this element was marked as a start tag */
    public boolean isStartTag() {
        return this.isStartTag;
    }

    /**
     * Marks this element as a start tag ({@code true}) or as an end tag
     * ({@code false}).
     */
    public void setStartTag(boolean isStartTag) {
        this.isStartTag = isStartTag;
        this.isEndTag = !isStartTag;
    }

    /**
     * @return {@code true} if this element was marked as an end tag; an
     *         element that was never marked either way (text, comments) is
     *         neither a start tag nor an end tag
     */
    public boolean isEndTag() {
        return this.isEndTag;
    }

    /**
     * Marks this element as an end tag ({@code true}) or as a start tag
     * ({@code false}).
     */
    public void setEndTag(boolean isEndTag) {
        this.isEndTag = isEndTag;
        this.isStartTag = !isEndTag;
    }

    /** @return {@code true} if this element is a comment */
    public boolean isCommentTag() {
        return this.isCommentTag;
    }

    public void setCommentTag(boolean isCommentTag) {
        this.isCommentTag = isCommentTag;
    }

    /**
     * @return the lower-cased attribute names in insertion order; this is a
     *         live view of the underlying map
     */
    public Set<String> getAttributeNameSet() {
        return attributes.keySet();
    }

    /**
     * Renders the element as markup: comments as {@code <text>}, tags as
     * {@code <NAME key="value" ...>} and text elements as their text (an
     * empty string if no text was set). Attribute values are written
     * verbatim, without escaping.
     */
    @Override
    public String toString() {
        if (!this.isHTMLTag()) {
            String content = this.getText();
            return content == null ? EMPTY_TEXT : content;
        }

        if (this.isCommentTag()) {
            return "<" + this.getText() + ">";
        }

        StringBuilder tagText = new StringBuilder();
        tagText.append('<').append(name);

        for (Map.Entry<String, String> attribute : attributes.entrySet()) {
            tagText.append(' ').append(attribute.getKey()).append("=\"");
            tagText.append(attribute.getValue()).append("\"");
        }

        tagText.append('>');
        return tagText.toString();
    }
}