Extracting pages that contain a keyword with BeautifulSoup

Posted at 2019-07-09

This script fetches the article list on Nikkei Business, follows each headline link, and collects the titles and URLs of the articles whose body text contains a given keyword.
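The key trick is that BeautifulSoup's find_all accepts a compiled regex as a class filter, which is how the script catches the numbered headline classes (m-miM01_title, m-miM02_title, ...) on the list page. A minimal illustration, using made-up HTML that mimics that markup:

from bs4 import BeautifulSoup
import re

# Hypothetical HTML resembling the headline markup on the list page
html = '''
<h3 class="m-miM01_title"><a href="/a/">A</a></h3>
<h3 class="m-miM12_title"><a href="/b/">B</a></h3>
<h3 class="m-other_title"><a href="/c/">C</a></h3>
'''
soup = BeautifulSoup(html, "html.parser")
# The regex is tested against each class value, so only the numbered titles match
hits = soup.find_all('h3', {'class': re.compile(r'm-miM\d{2}_title')})
print([h3.a['href'] for h3 in hits])  # ['/a/', '/b/']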

sample.py
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time

url_nikkei = 'https://www.nikkei.com/'
url_nikkei_business = 'https://www.nikkei.com/business/'
key_word = '環境'  # keyword to search for ("environment")

# Fetch the Nikkei Business list page and collect the headline <h3> elements;
# their classes are numbered (m-miM01_title, m-miM02_title, ...), hence the regex
response = requests.get(url_nikkei_business)
soup = BeautifulSoup(response.text, "html.parser")
regex = re.compile(r'm-miM\d{2}_title')
articles = soup.find_all('h3', {'class': regex})

time.sleep(3)  # pause so as not to hammer the server

# Pull the link out of each headline
url_nikkei_articles = []
for article in articles:
    link = article.find('a')
    if link is None:
        continue  # skip headlines without an anchor
    url_nikkei_articles.append(link.get('href'))

# Visit each article and keep those whose body text contains the keyword
url_list = []
title_list = []
for url in url_nikkei_articles:
    url_nikkei_article = urljoin(url_nikkei, url)  # the hrefs are relative paths

    response = requests.get(url_nikkei_article)
    soup = BeautifulSoup(response.text, "html.parser")

    regex = re.compile(r'cmn-section')
    temp = soup.find_all('div', {'class': regex})
    title = temp[0].find('span').string  # the title sits in the first section's <span>

    regex = re.compile(r'cmn-article_text')
    contents = temp[0].find('div', {'class': regex}).find_all('p')

    # Record the article once if any paragraph mentions the keyword;
    # get_text() searches the visible text only, not the raw HTML markup
    for content in contents:
        if re.search(key_word, content.get_text()):
            url_list.append(url_nikkei_article)
            title_list.append(title)
            break

    time.sleep(3)  # pause between article requests

# Print the titles of the matching articles
for i, title in enumerate(title_list):
    print(i + 1, title)

(Screenshot: the numbered list of matching article titles)
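One detail worth noting: the hrefs collected from the list page are relative paths, which is why each one is joined with the site root before being requested. A quick check of what urljoin does (the article path here is made up):

from urllib.parse import urljoin

print(urljoin('https://www.nikkei.com/', '/article/DGXMZO0000000/'))
# -> https://www.nikkei.com/article/DGXMZO0000000/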

The corresponding URLs can be printed the same way:

for i, url in enumerate(url_list):
    print(i + 1, url)

(Screenshot: the numbered list of matching article URLs)
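If you run this kind of crawl regularly, it may be worth adding timeouts, error handling, and a reusable session. Below is a minimal sketch of the same flow wrapped in a function; the function name and the guard clauses are my own additions, not part of the original script, and the class names are assumed to still match Nikkei's markup at the time of writing:

import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def find_articles_with_keyword(base_url, list_url, key_word, delay=3):
    """Return (title, url) pairs for articles whose body mentions key_word."""
    results = []
    session = requests.Session()  # reuse one connection for all requests
    top = session.get(list_url, timeout=10)
    soup = BeautifulSoup(top.text, "html.parser")
    headlines = soup.find_all('h3', {'class': re.compile(r'm-miM\d{2}_title')})
    for h3 in headlines:
        link = h3.find('a')
        if link is None or link.get('href') is None:
            continue
        article_url = urljoin(base_url, link['href'])
        try:
            page = session.get(article_url, timeout=10)
            page.raise_for_status()
        except requests.RequestException:
            continue  # skip articles that fail to load
        article = BeautifulSoup(page.text, "html.parser")
        section = article.find('div', {'class': re.compile(r'cmn-section')})
        body = section.find('div', {'class': re.compile(r'cmn-article_text')}) if section else None
        if body is None:
            continue  # page layout changed or no body text; skip it
        if any(key_word in p.get_text() for p in body.find_all('p')):
            title_span = section.find('span')
            results.append((title_span.string if title_span else '', article_url))
        time.sleep(delay)  # stay polite between requests
    return results

Usage would then look like:

for title, url in find_articles_with_keyword('https://www.nikkei.com/',
                                             'https://www.nikkei.com/business/',
                                             '環境'):
    print(title, url)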
