マイペースなプログラミング日記

DTMやプログラミングにお熱なd-kamiがマイペースに書くブログ

簡易HTMLパーサを作ってみた

JavaのHTMLパーサとして今までCyberNeko HTML Parserを使っていたが、ページによってはパースエラーがでてしまうことがあった。なのでできるだけパースエラーが起きない簡単なHTMLパーサを自分で作ることにした。使い方は以下のような感じ。

import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.IOException;

import java.net.URL;
import java.net.URLConnection;

import java.util.Set;

/**
 * Small demo: downloads a page, tokenizes it with {@link HTMLReader} and
 * prints every element to standard output.
 *
 * Text and comment elements are printed as-is; for ordinary tags the tag name
 * is printed followed by one indented "name = value" line per attribute.
 */
public class Test{
    /**
     * Entry point of the demo.
     *
     * @param args ignored
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException{
        URL url = new URL("http://d.hatena.ne.jp/d-kami/");
        URLConnection connection = url.openConnection();

        // The page is decoded as EUC-JP, as hard-coded here.
        InputStreamReader isr = new InputStreamReader(connection.getInputStream(), "EUC-JP");
        BufferedReader br = new BufferedReader(isr);

        // try/finally so the stream is released even when parsing or printing fails.
        try{
            HTMLReader reader = new HTMLReader(br);
            HTMLElement element;

            while((element = reader.readNextElement()) != null){
                if(element.isText() || element.isCommentTag()){
                    System.out.println(element);
                }else if(element.isHTMLTag()){
                    System.out.println(element.getName());

                    Set<String> nameSet = element.getAttributeNameSet();
                    for(String name : nameSet){
                        System.out.printf("\t%s = %s\n", name, element.getAttribute(name));
                    }
                }
            }
        }finally{
            br.close();
        }
    }
}

HTMLReaderのソース

import java.io.Reader;
import java.io.IOException;

/**
 * A small, forgiving HTML tokenizer.
 *
 * Each call to {@link #readNextElement()} pulls characters from the wrapped
 * {@link Reader} and returns the next piece of the document as an
 * {@link HTMLElement}: a run of text, a comment, or a tag with its attributes.
 * No tree is built and no validation is performed; malformed input is turned
 * into elements on a best-effort basis instead of raising parse errors.
 */
public class HTMLReader {

    private static final int NO_STATE = 0;
    private static final int FOUND_TEXT = 1;
    private static final int FOUND_TAG = 2;
    private static final int FOUND_TAG_NAME = 3;
    private static final int FIND_ATTR_NAME = 4;
    private static final int FOUND_ATTR_NAME = 5;
    private static final int END_ATTR_NAME = 6;
    private static final int FIND_ATTR_VALUE = 7;
    private static final int FOUND_ATTR_VALUE = 8;

    /** Text that follows the opening '<' of a comment. */
    private static final String COMMENT_OPEN = "!--";
    /** Text that precedes the closing '>' of a comment. */
    private static final String COMMENT_CLOSE = "--";
    /** Marker meaning "the current attribute value is not quoted". */
    private static final char NO_QUOTE = '\0';

    private Reader reader;
    /** Tokenizer state carried over between calls to readNextElement(). */
    private int state;
    private StringBuilder text = null;
    private StringBuilder tag = null;

    /**
     * Creates a tokenizer that reads from the given reader.
     *
     * @param reader source of the HTML characters; it is not closed by this class
     */
    public HTMLReader(Reader reader) {
        this.reader = reader;
        this.state = NO_STATE;
    }

    /**
     * Reads and returns the next element of the document.
     *
     * @return the next text, comment or tag element, or {@code null} once the
     *         input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    public HTMLElement readNextElement() throws IOException {
        int n;

        while ((n = reader.read()) != -1) {
            char c = (char) n;

            if (state == NO_STATE) {
                if (c == '<') {
                    state = FOUND_TAG;
                    tag = new StringBuilder();
                } else {
                    state = FOUND_TEXT;
                    text = new StringBuilder();
                    text.append(c);
                }
            } else if (state == FOUND_TEXT) {
                if (c == '<') {
                    // The '<' already belongs to the next tag: remember that
                    // before handing back the text collected so far.
                    state = FOUND_TAG;
                    tag = new StringBuilder();
                    return createText(text.toString());
                }
                text.append(c);
            } else if (state == FOUND_TAG) {
                if (c != '>') {
                    tag.append(c);
                    continue;
                }

                String htmlTag = tag.toString();
                if (htmlTag.startsWith(COMMENT_OPEN) && !htmlTag.endsWith(COMMENT_CLOSE)) {
                    // A '>' inside a comment does not end it; only "-->" does.
                    tag.append(c);
                    continue;
                }

                state = NO_STATE;
                return finishTag(htmlTag);
            }
        }

        // End of input: flush whatever was being collected.
        if (state == FOUND_TEXT) {
            state = NO_STATE;
            return createText(text.toString());
        } else if (state == FOUND_TAG) {
            state = NO_STATE;
            return finishTag(tag.toString());
        }

        return null;
    }

    /** Builds a text element holding the given character data. */
    private HTMLElement createText(String content) {
        HTMLElement element = new HTMLElement();
        element.setText(content);
        element.setHTMLTag(false);
        return element;
    }

    /** Builds a comment element; the text is the raw content between '<' and '>'. */
    private HTMLElement createComment(String content) {
        HTMLElement element = new HTMLElement();
        element.setHTMLTag(true);
        element.setCommentTag(true);
        element.setText(content);
        element.setName("Comment");
        return element;
    }

    /**
     * Turns the raw content between '<' and '>' into a comment or a tag element.
     * Content starting with "!--" is a comment, including a comment that was
     * cut off by the end of the input.
     */
    private HTMLElement finishTag(String htmlTag) {
        if (htmlTag.startsWith(COMMENT_OPEN)) {
            return createComment(htmlTag);
        }
        return parseTag(htmlTag);
    }

    /** Stores an attribute that has no value, using its own name as the value. */
    private void setValuelessAttribute(HTMLElement element, StringBuilder attrName) {
        String attr = attrName.toString();
        element.setAttribute(attr, attr);
    }

    /**
     * Parses the content of a tag (without the surrounding angle brackets) into
     * an element with a name and attributes.
     *
     * Any whitespace character separates the tag name and the attributes.
     * Attribute values may be single-quoted, double-quoted or unquoted; an
     * attribute without a value is stored with its own name as the value.
     * A name beginning with '/' marks the element as an end tag, anything else
     * as a start tag.
     *
     * @param tag raw tag content, e.g. {@code a href="x"}
     * @return the parsed element
     */
    private HTMLElement parseTag(String tag) {
        HTMLElement element = new HTMLElement();
        element.setHTMLTag(true);
        int parseTagState = NO_STATE;
        StringBuilder tagName = null;
        StringBuilder attrName = null;
        StringBuilder attrValue = null;

        // Quote character that closes the current value, or NO_QUOTE.
        char quote = NO_QUOTE;

        for (int i = 0; i < tag.length(); i++) {
            char c = tag.charAt(i);
            boolean space = Character.isWhitespace(c);

            switch (parseTagState) {
                case NO_STATE:
                    if (!space) {
                        tagName = new StringBuilder();
                        tagName.append(c);
                        parseTagState = FOUND_TAG_NAME;
                    }
                    break;

                case FOUND_TAG_NAME:
                    if (space) {
                        element.setName(tagName.toString());
                        parseTagState = FIND_ATTR_NAME;
                    } else {
                        tagName.append(c);
                    }
                    break;

                case FIND_ATTR_NAME:
                    if (!space) {
                        attrName = new StringBuilder();
                        attrName.append(c);
                        parseTagState = FOUND_ATTR_NAME;
                    }
                    break;

                case FOUND_ATTR_NAME:
                    if (space) {
                        parseTagState = END_ATTR_NAME;
                    } else if (c == '=') {
                        parseTagState = FIND_ATTR_VALUE;
                    } else {
                        attrName.append(c);
                    }
                    break;

                case END_ATTR_NAME:
                    if (space) {
                        break;
                    }
                    if (c == '=') {
                        parseTagState = FIND_ATTR_VALUE;
                    } else {
                        // The previous attribute had no value, and c is already
                        // the first character of the next attribute name, so it
                        // must be kept rather than skipped.
                        setValuelessAttribute(element, attrName);
                        attrName = new StringBuilder();
                        attrName.append(c);
                        parseTagState = FOUND_ATTR_NAME;
                    }
                    break;

                case FIND_ATTR_VALUE:
                    if (space) {
                        break;
                    }
                    attrValue = new StringBuilder();
                    if (c == '\'' || c == '\"') {
                        quote = c;
                    } else {
                        quote = NO_QUOTE;
                        attrValue.append(c);
                    }
                    parseTagState = FOUND_ATTR_VALUE;
                    break;

                case FOUND_ATTR_VALUE:
                    // Quoted values end at the matching quote, unquoted ones at
                    // the first whitespace; the terminator is not part of the value.
                    if (quote == NO_QUOTE ? space : c == quote) {
                        element.setAttribute(attrName.toString(), attrValue.toString());
                        parseTagState = FIND_ATTR_NAME;
                    } else {
                        attrValue.append(c);
                    }
                    break;

                default:
                    break;
            }
        }

        // Flush whatever was still being collected when the tag ended.
        if (parseTagState == FOUND_TAG_NAME) {
            element.setName(tagName.toString());
        } else if (parseTagState == FOUND_ATTR_NAME
                || parseTagState == END_ATTR_NAME
                || parseTagState == FIND_ATTR_VALUE) {
            setValuelessAttribute(element, attrName);
        } else if (parseTagState == FOUND_ATTR_VALUE) {
            element.setAttribute(attrName.toString(), attrValue.toString());
        }

        String elementName = element.getName();
        if (elementName.length() > 0 && elementName.charAt(0) == '/') {
            element.setEndTag(true);
        } else {
            element.setStartTag(true);
        }

        return element;
    }
}

HTMLElementのソース

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

public class HTMLElement {
    private String name;
    private String text;
    private Map<String, String> attributes;
    private boolean isHTMLTag;
    private boolean isStartTag;
    private boolean isCommentTag;

    private static final String EMPTY_TEXT = "";
    
    public HTMLElement() {
        name = EMPTY_TEXT;
        attributes = new HashMap<String, String>();
    }

    public void setName(String name) {
        this.name = name.toUpperCase();
    }

    public String getName() {
        return this.name;
    }

    public void setAttribute(String key, String value) {
        attributes.put(key.toLowerCase(), value);
    }

    public String getAttribute(String key) {
        return attributes.get(key);
    }

    public void setText(String text) {
        this.text = text;
        setName("Text");
    }

    public String getText() {
        return this.text;
    }

    public boolean isHTMLTag() {
        return isHTMLTag;
    }

    public boolean isText() {
        return !isHTMLTag;
    }

    public void setHTMLTag(boolean isHTMLTag) {
        this.isHTMLTag = isHTMLTag;
    }

    public boolean isStartTag() {
        return this.isStartTag;
    }

    public void setStartTag(boolean isStartTag) {
        this.isStartTag = isStartTag;
    }

    public boolean isEndTag() {
        return !this.isStartTag;
    }

    public void setEndTag(boolean isEndTag) {
        this.isStartTag = !isEndTag;
    }

    public boolean isCommentTag() {
        return this.isCommentTag;
    }

    public void setCommentTag(boolean isCommentTag) {
        this.isCommentTag = isCommentTag;
    }

    public Set<String> getAttributeNameSet() {
        return attributes.keySet();
    }

    @Override
    public String toString() {
        if (this.isHTMLTag()) {
            if(this.isCommentTag()){
                return "<" + this.getText() + ">";
            }
            
            StringBuilder tagText = new StringBuilder();
            tagText.append('<');
            tagText.append(name);

            Set<String> nameSet = this.getAttributeNameSet();
            for (String name : nameSet) {
                tagText.append(' ').append(name).append("=\"");
                tagText.append(getAttribute(name)).append("\"");
            }

            tagText.append('>');

            return tagText.toString();
        }else{
            return this.getText();
        }
        
    }
}