python3 ウエブスクレイピングことはじめ #Python3

流行りのウエブスクレイピングを試してみた。
対象はこういうの大丈夫そうなスラド。

定番のBeautifulSoup。
初めてやってみたけど簡単だな！

アタマの3つを拾ってURLをHTMLに吐き出してみた。
嫁が帰ってくるまでの30分、勢いで書いたのでコードが美しくないのはすみません。

test.py

'''URLを扱うモジュール'''
import urllib.request
from bs4 import BeautifulSoup
import webbrowser
import os

class Scraper:
    """Collect links from a web page and write the first few to an HTML file.

    Attributes:
        site: URL of the page to scrape. It is passed to
            ``urllib.request.urlopen`` and shown in the generated page's
            heading.
    """

    def __init__(self, site):
        """Remember the URL of the page to scrape.

        Args:
            site: URL of the page whose links should be collected.
        """
        self.site = site

    def scrape(self):
        """Fetch ``self.site`` and return the links whose href contains "html".

        Every ``<a>`` tag is inspected. Tags without an ``href`` attribute
        are skipped, and only hrefs containing the substring ``"html"`` are
        kept. Kept hrefs are resolved against the URL that was actually
        fetched, so relative and protocol-relative links still work when
        they are later opened from a local file.

        Returns:
            A list of absolute URL strings, in document order.

        Raises:
            Whatever ``urllib.request.urlopen`` raises when the page cannot
            be fetched (for example ``urllib.error.URLError``).
        """
        from urllib.parse import urljoin

        # Close the HTTP response as soon as the body has been read; the
        # original code left it open.
        with urllib.request.urlopen(self.site) as response:
            base_url = response.url
            body = response.read()

        soup = BeautifulSoup(body, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if href is not None and "html" in href:
                links.append(urljoin(base_url, href))
        return links

    def create_html(self, _url, *, path="test.html", limit=3):
        """Write the first ``limit`` URLs to a small HTML page of links.

        Args:
            _url: Iterable of URL strings, e.g. the list returned by
                ``scrape``. May be empty, in which case the page contains
                only the heading.
            path: File to write (UTF-8). Defaults to ``"test.html"`` in the
                current working directory.
            limit: Maximum number of links to write; ``None`` writes all of
                them. Defaults to 3.
        """
        from html import escape
        from itertools import islice

        # The scraped URLs and the site name are untrusted text, so they are
        # escaped before being placed in attribute values or element content.
        lines = [
            "<html>",
            "<head>",
            "<meta charset=\"utf-8\">",
            "<title>super head line!!</title>",
            "</head>",
            "<h2>This page is \"{}\" headline</h2><p>".format(
                escape(str(self.site))
            ),
        ]
        # islice stops after exactly ``limit`` items; the previous
        # ``if i == 3: break`` check ran after the write and so let a
        # fourth link through.
        for link in islice(_url, limit):
            safe = escape(link)
            lines.append(
                "<h5><a href=\"{0}\" target=\"_blank\">URL:{0}</a></h5>".format(safe)
            )
        lines.append("</html>")

        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines) + "\n")


# Main script: scrape the Srad front page and write its leading links to
# test.html in the current working directory.
from pathlib import Path

news = "https://srad.jp/"
scraper = Scraper(news)  # one instance serves both steps
url = scraper.scrape()
scraper.create_html(url)

# Open the generated page in a new tab of the default browser. An absolute
# file:// URI is passed because the webbrowser documentation describes
# opening a bare filename as neither supported nor portable.
webbrowser.open_new_tab(Path("test.html").resolve().as_uri())