More than 3 years have passed since last update.

【Pythonスクレイピング】特定のキーワードが含まれるサイトのURLとタイトルをテキストファイルへ出力

Python

Last updated at 2020-07-06 / Posted at 2020-07-05

概要

Google 検索を実施し、ヒットしたページ内の title、og:description、h1 ~ h4 のいずれかに特定のキーワードが含まれる場合、対象ページのタイトルと URL をタブ区切りでテキストファイルに出力します。
URL が既にテキストファイル内に含まれている場合、処理をスキップします。

※Google はスクレイピングを禁止しているため、サーバーに負荷をかけず、個人的な範囲で、自己責任でご利用ください。
https://support.google.com/webmasters/answer/66357

Pythonバージョン

3.8.2

実行手順

venv などで仮想環境を構築後、

pip install -r requirements.txt
python main.py

コード

requirements.txt

beautifulsoup4 == 4.9.1
requests == 2.24.0

settings.py

# Configuration values read by main.py.
settings = dict(
    # Keywords used for the Google search
    google_search_keywords=['医療', 'コロナ'],
    # Number of search results
    google_search_num=10,
    # Keywords to look for inside each hit page
    search_keywords_in_page=['医療'],
)

main.py

import urllib.parse
import re

import requests
import bs4

from settings import settings
from output import OutputText


def get_ogdesc_from_soup(soup: bs4.BeautifulSoup) -> str:
    """
    Return the og:description text of a parsed page.

    Searches ``soup`` for a
    <meta property="og:description" content="...">
    tag and returns the value of its ``content`` attribute.
    Returns an empty string when no such tag is found.
    """
    wanted_attrs = {'property': 'og:description', 'content': True}
    meta_tag = soup.find('meta', attrs=wanted_attrs)
    return meta_tag['content'] if meta_tag else ''


def get_href_from_soup(soup: 'bs4.element.Tag') -> str:
    """
    Extract the decoded target URL from a search-result anchor element.

    The anchor's ``href`` is expected to embed the target URL as the text
    that starts at ``http`` and ends just before ``&sa``; that substring is
    percent-decoded and returned.

    Returns an empty string when the element has no ``href`` or the
    ``href`` does not contain such a substring, so that one unusable
    anchor does not abort processing of the remaining results.
    """
    href = soup.get('href')
    if not href:
        return ''

    # Greedy match from the first 'http' up to (not including) the last '&sa'
    match = re.search(r'http.+(?=&sa)', href)
    if match is None:
        return ''

    return urllib.parse.unquote(match.group())  # percent-decode


def do_google_search(keywords: [str], search_num: int) -> [str]:
    """
    Run a Google search for ``keywords`` and return the hit URLs as a list.

    The keywords are joined with spaces into a single query, and
    ``search_num`` is sent as the ``num`` query parameter.
    """
    query = {
        'hl': 'ja',
        'num': search_num,
        'q': ' '.join(keywords)
    }
    response = requests.get('https://www.google.co.jp/search', params=query)
    result_page = bs4.BeautifulSoup(response.content, 'html.parser')

    # The `.kCrYT` selector may need updating when Google changes its markup
    anchors = result_page.select('.kCrYT > a')
    return list(map(get_href_from_soup, anchors))


def main():
    """
    Search Google and record the pages that mention the configured keywords.

    For every URL returned by the search, the page is fetched and its
    title, og:description and h1-h4 texts are checked against
    ``settings['search_keywords_in_page']``. Matching pages are appended
    to ``output.txt`` as a "title<TAB>url" row. URLs already present in
    the file are skipped, as are pages that cannot be fetched. Finally a
    human-readable version of the file is written.
    """
    output_text = OutputText('output.txt')
    urls = do_google_search(settings['google_search_keywords'], settings['google_search_num'])
    keywords = settings['search_keywords_in_page']

    # Read the recorded URLs once; the set is kept in sync as rows are added
    known_urls = set(output_text.get_urls())

    for url in urls:
        # Skip URLs that are already in the text file
        if url in known_urls:
            continue

        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip pages that fail to load; only request errors are caught
            # so that Ctrl-C and programming errors still surface
            continue

        # Raw bytes are passed so that the parser decides the page encoding
        soup = bs4.BeautifulSoup(response.content, 'html.parser')

        titles = [tag.text for tag in soup.select('title')]
        texts = titles + [get_ogdesc_from_soup(soup)]
        for heading in ('h1', 'h2', 'h3', 'h4'):
            texts.extend(tag.text for tag in soup.select(heading))

        # Skip the page when none of the keywords appears in it
        if not any(keyword in text for keyword in keywords for text in texts):
            continue

        # Write to the text file. Line breaks and tabs are removed from the
        # title because they are the record and field separators of the file.
        if titles:
            title = titles[0].strip().replace('\n', '').replace('\r', '').replace('\t', ' ')
        else:
            title = '**No title**'
        output_text.write(title, url)
        known_urls.add(url)

    # Output the text file in an easy-to-read format
    output_text.output_readable_file()


if __name__ == '__main__':
    main()

output.py

import myutil as u
import os


class OutputText:
    """
    Tab-separated text file holding one "title<TAB>url" record per line.

    The file at ``file_path`` is created (empty) on construction when it
    does not exist yet.
    """

    file_path = None

    def __init__(self, file_path):
        """Remember ``file_path`` and create the file if it is missing."""
        self.file_path = file_path

        if not os.path.isfile(file_path):
            with open(self.file_path, 'w', encoding='utf-8'):
                pass

    def write(self, title, url):
        """Append one "title<TAB>url" record followed by a newline."""
        with open(self.file_path, mode='a', encoding='utf-8') as f:
            u.write_with_tab(f, title, url)
            f.write('\n')

    def get_urls(self):
        """Return the URL field of every record in the file."""
        return [self.get_url(line) for line in self.get_lines()]

    def output_readable_file(self):
        """
        Write a human-readable copy of the records.

        The copy is named after the data file with ``_readable`` inserted
        before a trailing ``.txt`` (``output.txt`` -> ``output_readable.txt``).
        When the path does not end in ``.txt`` the suffix ``_readable.txt``
        is appended instead, so the data file itself is never overwritten.
        """
        suffix = '.txt'
        if self.file_path.endswith(suffix):
            readable_path = self.file_path[:-len(suffix)] + '_readable.txt'
        else:
            readable_path = self.file_path + '_readable.txt'

        # Read the records before opening the destination for writing
        lines = self.get_lines()
        with open(readable_path, mode='w', encoding='utf-8') as f:
            for line in lines:
                f.write(self.get_title(line) + '\n' + self.get_url(line) + '\n')
                f.write('\n------------------------------\n\n')

    def get_lines(self):
        """
        Return the non-blank lines of the file, without line terminators.

        Lines are not stripped: a record with an empty title starts with
        the tab separator, which must be preserved for get_url to work.
        An empty file yields an empty list.
        """
        with open(self.file_path, mode='r', encoding='utf-8') as f:
            text = f.read()
        return [line for line in text.split('\n') if line.strip()]

    def get_title(self, line):
        """Return the first tab-separated field of ``line``."""
        return line.split('\t', 1)[0]

    def get_url(self, line):
        """Return the second tab-separated field of ``line``, or '' if absent."""
        fields = line.split('\t')
        return fields[1] if len(fields) >= 2 else ''

myutil.py

def write_with_tab(file, *strings):
    """
    Write strings to a file, separated by tab characters.

    No tab is written before the first or after the last string, and no
    trailing newline is added. ``file`` only needs a ``write`` method
    that accepts ``str``. Returns ``file``.
    """
    file.write('\t'.join(strings))
    return file

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up