PythonでWebスクレイピングを行う際の備忘録です。
import os

import requests
from bs4 import BeautifulSoup

# Proxy support: route HTTPS traffic through a proxy by setting the
# https_proxy environment variable before the request is issued.
# The address below is a placeholder; replace it with the real proxy.
os.environ["https_proxy"] = "http://xxx.xx.xx.xx:8080"

url = "https://www.python.org/"
# Without a timeout, requests waits indefinitely on a stalled connection
# (easy to hit when the proxy address is wrong).
html = requests.get(url, timeout=10)
# Stop on HTTP errors (4xx/5xx) instead of silently parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, "lxml")
print(soup)
print("----------------------------------------------")

# Extract only the introduction text from python.org.
# soup.find_all("div", {"class": "introduction"}) is an equivalent spelling.
intro_divs = soup.find_all("div", class_="introduction")
name = intro_divs[0].text
print(name)

# Extract the text of the page's <title> element.
title_tags = soup.find_all("title")
title = title_tags[0].text
print(title)
結果
Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More
Welcome to Python.org