More than 3 years have passed since the last update.

Google Colaboratory に登録しているスニペット（スクレイピング）

Last updated at 2020-11-17 / Posted at 2020-11-17

BeautifulSoup4

ベース

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Page to fetch; also used below as the base for resolving relative links.
url = "http://example.jp"

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Always pass a timeout (seconds): without one, requests may wait forever
# on a server that never answers and the notebook cell would hang.
r = requests.get(url, headers=headers, timeout=10)
# Raise an exception on 4xx/5xx responses instead of parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(r.content, "html.parser")


# Resolve a relative path against the page URL to get an absolute URL.
urljoin(url, "index.html")

セッション

# Use a Session as a context manager so it is closed when the block exits.
# `headers` is the User-Agent dict defined in the base snippet.
with requests.Session() as s:

    # Pass a timeout (seconds) so an unresponsive server cannot hang the cell.
    r = s.get("http://example.jp", headers=headers, timeout=10)
    # Raise an exception on 4xx/5xx responses.
    r.raise_for_status()

    # Parse the raw response bytes with the standard-library HTML parser.
    soup = BeautifulSoup(r.content, "html.parser")

Pandas

import pandas as pd

# Read the HTML tables found at the URL. header=0 takes the first row as the
# column names and index_col=0 takes the first column as the index.
# NOTE: per the pandas documentation, read_html returns a *list* of DataFrames
# (one per table found), so despite its name `df` is a list; select a table
# with e.g. df[0].
df = pd.read_html("http://example.jp", header=0, index_col=0)

Selenium

# Install the chromium-chromedriver package with apt.
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is found on PATH).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Install the Selenium Python bindings.
!pip install selenium

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Chrome command-line flags used for this headless session.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Do not pass the driver path positionally: recent Selenium 4 releases no
# longer accept it. With no path given, chromedriver is located on PATH
# (the setup cell copies it to /usr/bin).
driver = webdriver.Chrome(options=options)
# Wait up to 10 seconds for elements to appear before a lookup fails.
driver.implicitly_wait(10)

# Main window
parent_window = driver.current_window_handle

driver.get("http://example.jp")

# Show the current URL
print(driver.current_url)

time.sleep(3)

# Click the link with the given text.
# find_element_by_link_text() was removed in Selenium 4; the By-based
# locator form works on both Selenium 3 and 4.
driver.find_element(By.LINK_TEXT, "XXXXX").click()

# Switch to the most recently opened window
driver.switch_to.window(driver.window_handles[-1])

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up