LoginSignup
0
0

More than 5 years have passed since last update.

scala-scraperのValidation

Last updated at Posted at 2018-06-30

Scalaのスクレイピングライブラリを触ってみたメモです。
バリデーション周りを中心に動かしてみました。

基本的な使い方は下部の参考記事が詳しいです。

scala-scraper
https://github.com/ruippeixotog/scala-scraper#content-validation

build.sbt
libraryDependencies += "net.ruippeixotog" %% "scala-scraper" % "2.1.0"
sample.scala
import net.ruippeixotog.scalascraper.browser._
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.util.Validated._

/** Small demo of scala-scraper content validation, run against an inline HTML page. */
object SampleApp {
  // Inline HTML fixture that every example in `main` is evaluated against.
  val content = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="utf-8">
            <title>Test page</title>
        </head>
        <body>
        <div id="wrapper">
            <div id="content">
                <section>
                    <h3>Section 1 h3</h3>
                    <p>Some text for testing</p>
                </section>

                <section>
                    <h3>Section 2 h3</h3>
                    <span>My Form</span>
                </section>
            </div>
        </div>
        </body>
        </html>"""

  /**
   * Parses the fixture and prints, in order: one plain extraction, two
   * single-validator results, and the outcome of a success/errors validation.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val browser = JsoupBrowser()
    val doc = browser.parseString(content)

    // Start with a plain extraction, no validation involved.
    val firstHeading = doc >> text("div#content section h3")
    println(firstHeading)
    // Section 1 h3

    // Validation with a single validator.
    val headingPresent = doc >/~ validator(text("div#content section h3"))(_.nonEmpty)
    println(headingPresent)
    // Right(JsoupDocument(<!doctype html>......</html>)) -- on success, the whole HTML document

    val missingElement = doc >/~ validator(text("div#content section h3 not_exist"))(_.nonEmpty)
    println(missingElement)
    // Left(()) -- on failure

    // Validation can also be expressed as one success pattern plus a list of error patterns.
    // The real title is "Test page", so the success check fails and the error patterns decide.
    val successCheck = validator(text("title"))(_ == "AAAAAAAAAAAA")
    val errorChecks = Seq(
      validator(text(".msg"), "Not logged in")(_.contains("sign in")),
      // This is the error pattern that matches for the fixture above.
      // NOTE(review): "#div content" looks like it may have been meant as "div#content";
      // as written it appears to select nothing in the fixture, which is what makes
      // `_.isEmpty` hold -- confirm the intent before changing the selector.
      validator("#div content", "Empty contents")(_.isEmpty),
      validator(text("h1"),  "Internal Server Error")(_.contains("500"))
    )

    val outcome = doc >/~ (successCheck, errorChecks)
    outcome match {
      case VFailure(reason) => println(s"Error: $reason")
      case VSuccess(validDoc) => println(validDoc)
    }
    // Error: Empty contents
  }
}

参考 :pray:

https://qiita.com/harry0000/items/0c4e37ebb71d102cf8b0
https://qiita.com/ara_ta3/items/8bf8c6ffec86884fb6cb

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0