More than 5 years have passed since last update.

HTMLからtagを抽出する

Java

Posted at 2013-12-28

イベント・ドリブンなHTMLパーサを使って HTML から任意の tag を抽出することができる。

ParseHTML.java

import java.util.*;
import java.io.*;
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML;
import javax.swing.text.MutableAttributeSet;

/**
 * Event-driven HTML parser callback that collects link targets.
 *
 * <p>An instance is passed to {@link ParserDelegator#parse}; the parser then
 * invokes the {@code handleXXX} methods below as it walks the document.
 * This sample records, in document order:
 * <ul>
 *   <li>the {@code src} attribute of every {@code <img>} tag, and</li>
 *   <li>the {@code href} attribute of every {@code <a>} tag.</li>
 * </ul>
 * Tags that do not carry the attribute (for example {@code <a name="x">})
 * are skipped, so {@link #urls} never contains {@code null}.
 */
public class ParseHTML extends ParserCallback {

  /** Attribute values collected so far, in the order the tags were reported. */
  final List<String> urls = new ArrayList<String>();

  /*
   * The handleXXX methods below are the hooks called back by the parser;
   * fill in whichever ones you need. Only handleSimpleTag and
   * handleStartTag do any work in this sample.
   */

  @Override
  public void flush() {}

  @Override
  public void handleComment(char[] data, int pos) {}

  @Override
  public void handleEndOfLineString(String eol) {}

  @Override
  public void handleEndTag(HTML.Tag t, int pos) {}

  @Override
  public void handleError(String errorMsg, int pos) {}

  /**
   * Records the {@code src} attribute when the reported tag is {@code <img>}.
   *
   * @param t   the tag being reported
   * @param a   the attributes of the tag
   * @param pos position of the tag in the input
   */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t == HTML.Tag.IMG) {
      collect(a, HTML.Attribute.SRC);
    }
  }

  /**
   * Records the {@code href} attribute when the reported tag is {@code <a>}.
   *
   * @param t   the tag being reported
   * @param a   the attributes of the tag
   * @param pos position of the tag in the input
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t == HTML.Tag.A) {
      collect(a, HTML.Attribute.HREF);
    }
  }

  @Override
  public void handleText(char[] data, int pos) {}

  /**
   * Appends the value of {@code key} to {@link #urls} if the tag defines it.
   */
  private void collect(MutableAttributeSet a, HTML.Attribute key) {
    Object value = a.getAttribute(key);
    // getAttribute returns null when the attribute is absent; skip those
    // tags instead of storing null. toString() (rather than a cast) means a
    // non-String value cannot cause a ClassCastException.
    if (value != null) {
      urls.add(value.toString());
    }
  }

  /**
   * Demo: parses a small HTML snippet and prints every collected URL.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String html =
      "<html><body>" +
      "  <a href=\"http://qiita.com/episteme\">επιστημη</a>" +
      "  <IMG src=\"https://pbs.twimg.com/profile_images/54608127/epi_normal.jpg\"/>" +
      "</body></html>";

    // Set up the callback (event handler) that receives the parse events.
    ParseHTML callback = new ParseHTML();
    try {
      // Read the markup straight from the string; no byte round trip needed.
      Reader reader = new StringReader(html);
      try {
        // Start parsing. The last argument tells the parser to ignore any
        // charset declared inside the document.
        new ParserDelegator().parse(reader, callback, true);
      } finally {
        // Close the reader even when parsing fails.
        reader.close();
      }
    } catch (IOException ex) {
      ex.printStackTrace();
      return;
    }

    // Print the results.
    for (String result : callback.urls) {
      System.out.println(result);
    }
  }
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up