Pulling every article out of a livedoor Blog as plain-text strings

import requests
from bs4 import BeautifulSoup
from pprint import pprint
import re
def content_get(url):
    """Fetch a page and return it parsed as a BeautifulSoup tree."""
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    return soup
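The helper above has no timeout and will happily parse an error page on a 4xx/5xx response. A slightly hardened variant, as a sketch (the 10-second timeout is my choice, not from the original article):

def content_get(url):
    resp = requests.get(url, timeout=10)  # assumption: 10 s is a reasonable timeout
    resp.raise_for_status()  # fail loudly instead of parsing an error page
    return BeautifulSoup(resp.content, 'html.parser')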
def cleanhtml(raw_html):
    """Strip HTML tags from a string with a regex (defined here but unused below)."""
    cleanr = re.compile("<.*?>")
    cleantext = re.sub(cleanr, "", raw_html)
    return cleantext
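Quick sanity check of what the regex helper does, with a made-up sample string:

print(cleanhtml("<p>Hello <b>world</b></p>"))  # -> Hello world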
# Target blog; replace {id} with the blog's subdomain.
url = "https://{id}.livedoor.blog/"
soup = content_get(url)

# Each sidebar block on the blog's top page has class "sidebody".
sidebody = soup.find_all('div', {'class': 'sidebody'})
pprint(sidebody)
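As an aside, the attribute-dict form above is equivalent to BeautifulSoup's class_ keyword argument, which some find more readable:

sidebody = soup.find_all('div', class_='sidebody')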
# Collect the href of every link found inside those sidebar blocks.
urls = []
for i in sidebody:
    for a_tag in i.find_all('a'):
        url = a_tag.get('href')
        if url:
            urls.append(url)
pprint(urls)
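Sidebar links are not guaranteed to all be article pages; category pages, external links, and duplicates can slip in. If that becomes a problem, a filter along these lines could help (the "/archives/" substring is an assumption about livedoor's article URL structure, so check it against the actual links first):

article_urls = []
for u in urls:
    # keep apparent article pages only, skipping duplicates
    if "/archives/" in u and u not in article_urls:
        article_urls.append(u)
pprint(article_urls)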
# Fetch each linked page and print the article body as plain text.
for true_url in urls:
    true_content = content_get(true_url)
    true_body = true_content.find('div', {'class': 'article-body-inner'})
    if true_body:  # not every linked page is an article, so guard against None
        print(true_body.get_text())
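Calling get_text() with no arguments joins text nodes with nothing in between, which can glue paragraphs together. get_text() accepts separator and strip parameters, so a cleaner dump would be:

print(true_body.get_text(separator="\n", strip=True))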