LoginSignup
52
69

More than 5 years have passed since last update.

Python + PhantomJSでスクレイピング

Last updated at Posted at 2016-10-28

関連記事

環境

  • python3.5.2
  • PhantomJS 2.1.1
  • CentOS7.1

npmのインストール

以下の記事を参照
npmのインストール手順

依存関係のインストール

$ sudo yum install -y bzip2

PhantomJSのインストール

$ npm install --save phantomjs

selenium

$ pip install selenium

nose

$ pip install nose

sample

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose

# account
email = 'user'
password = 'password'


#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# PhantomJS本体のパス
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent" : user_agent,
    'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# 5秒待機
wait = WebDriverWait(driver, 5)

#############
# get html
#############
# login page
login_page_url = 'http://127.0.0.1/sign_in'
driver.get(login_page_url)
# ページが読み込まれるまで待機
wait.until(ec.presence_of_all_elements_located)
# 現在のURLを確認
nose.eq_('http://127.0.0.1:8080/login', driver.current_url)

#############
# login
#############
# button click
show_signin = driver.find_element_by_id('showSignIn')
show_signin.click()


# email
login_xpath = '//*[@id="user_email"]'
# 対象の要素が見えるまで待機
wait.until(ec.visibility_of_element_located((By.XPATH, login_xpath)))

# email form入力
login_id_form = driver.find_element_by_xpath(login_xpath)
login_id_form.clear()
login_id_form.send_keys(email)

# password
password_xpath = '//*[@id="user_password"]'
# 対象の要素が見えるまで待機
wait.until(ec.visibility_of_element_located((By.XPATH, password_xpath)))

# password form入力
password_form = driver.find_element_by_xpath(password_xpath)
password_form.clear()
password_form.send_keys(password)

# submit
submit_xpath = '//*[@id="new_user"]/div[4]/input'
driver.find_element_by_xpath(submit_xpath).click()


#############
# result
#############
driver.get('http://127.0.0.1/users/edit')
# ページが読み込まれるまで待機
wait.until(ec.presence_of_all_elements_located)
# 現在のURLを確認
nose.eq_('http://127.0.0.1:8080/users/edit', driver.current_url)
# ログインできているか、画面の表示要素で確認
user_email = driver.find_element_by_xpath('//*[@id="user_email"]').get_attribute("value")
nose.eq_(email, user_email)

BeautifulSoap

BeautifulSoapを組み合わせて使うと、htmlのパースが楽にできます。

$ pip install beautifulsoup4

以下のparserも入れておいたほうが無難です。
参考:Beautiful Soup 4.x では parser を明示指定しよう

$ pip install html5lib
$ pip install lxml
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
from bs4 import BeautifulSoup

#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# PhantomJS本体のパス
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent" : user_agent,
    'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# 5秒待機
wait = WebDriverWait(driver, 5)

#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')

#############
# parse html
#############
html = BeautifulSoup(data)
print(html)
print(html.title)
print(html.title.string)
print(html.find('h1'))
print(html.find('select',{'id':'hoge'}))

pandasでtableをparse

# -*- coding:utf-8 -*-
import pandas as pd

url = 'http://stocks.finance.yahoo.co.jp/stocks/history/?code=998407.O'
tables = pd.io.html.read_html(url, flavor='bs4')
print(tables[1])

urlだけでなく、htmlソースからもparseできます。

# -*- coding:utf-8 -*-
import pandas as pd

html = '''
<html>
<body>
<table>
<tr><td>sample1</td></tr>
<tr><td>sample2</td></tr>
<tr><td>sample3</td></tr>
<tr><td>sample4</td></tr>
</table>
</body>
</html>
'''

tables = pd.io.html.read_html(html, flavor='bs4')
print(tables[0])

PhantomJS with pandas

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from bs4 import BeautifulSoup
import pandas as pd

#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# PhantomJS本体のパス
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent" : user_agent,
    'marionette' : True
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# 5秒待機
wait = WebDriverWait(driver, 5)

#############
# load page
#############
driver.get('http://127.0.0.1/users/edit')
data = driver.page_source.encode('utf-8')

# parse
soup = BeautifulSoup(data,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df[0])

参考

52
69
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
52
69