More than 5 years have passed since last update.

tagsoupとregex-tdfaでHTMLからリンク抽出

Haskell

Posted at 2012-04-06

Wikipediaの全言語articleダンプをダウンロードするスクリプト。

{-# LANGUAGE OverloadedStrings, QuasiQuotes #-}                                                
                                                                                               
import Control.Applicative                                                                     
import Control.Monad
import qualified Data.ByteString.Lazy.Char8 as B                                               
import Network.HTTP.Conduit                                                                    
import System.Cmd                                                                              
import System.Environment                                                                      
import System.Process.QQ                                                                       
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Tree                                                                  
import Text.Printf                                                                             
import Text.Regex.TDFA                                                                         
                                                                                               
baseUrl = "http://dumps.wikimedia.org/"                                                        

extractLinks url regex = do                                                                    
  body <- B.unpack <$> simpleHttp url                                                          
  let tree = tagTree $ parseTags body                                                          
  return                                                                                       
    [ (name, link)                                                                             
    | (TagBranch "a" attrs [TagLeaf (TagText name)]) <- universeTree tree
    , name =~ (regex :: String)                                                                
    , let Just link = lookup "href" attrs                                                      
    ]                                                                                          
                                                                                               
main :: IO ()                                                                                  
main = do
  args <- getArgs                                                                              
  case args of                                                                                 
    [dest] -> do                                                                               
      [cmd|rm -rf #{dest}|]                                                                    
      [cmd|mkdir -p #{dest}|]                                                                  
      langs <- extractLinks (baseUrl ++ "backup-index.html") ".+wiki$"
      forM_ (zip [1..] langs) $ \(ix, (name, url)) -> do                                       
        printf "[%d/%d]: %s\n" (ix :: Int) (length langs) name                                 
        links <- extractLinks (baseUrl ++ url) ".+pages-articles\\.xml\\..*"                   
        forM_ links $ \(name, url) -> do                                                       
          let aurl = baseUrl ++ url                                                            
          putStrLn $ "> " ++ name ++ ": " ++ url
          system $ "aria2c -d " ++ dest ++ " " ++ baseUrl ++ url                               
    _ -> putStrLn "usage: runhaskell main.hs <dest-dir>"

tagsoup意外と使いやすかった。
それにしてもimportが多くなる…

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up