関連記事
環境
- python3.5.2
- PhantomJS 2.1.1
- CentOS7.1
npmのインストール
以下の記事を参照
npmのインストール手順
依存関係のインストール
$ sudo yum install -y bzip2
PhantomJSのインストール
$ npm install --save phantomjs
selenium
$ pip install selenium
nose
$ pip install nose
sample
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
# Login smoke test: drive a headless PhantomJS browser through the sign-in
# form and verify that the account page shows the logged-in user's e-mail.

# account credentials typed into the sign-in form
email = 'user'
password = 'password'

#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS executable
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
try:
    # explicit waits give up after 5 seconds
    wait = WebDriverWait(driver, 5)
    # An expected condition has to be built with a locator. Passing the bare
    # callable to wait.until() yields a truthy object on the first poll, so
    # the wait would return immediately without checking the page.
    page_loaded = ec.presence_of_all_elements_located((By.TAG_NAME, 'body'))

    #############
    # get html
    #############
    # login page
    login_page_url = 'http://127.0.0.1/sign_in'
    driver.get(login_page_url)
    # wait until the page content is present
    wait.until(page_loaded)
    # check the current URL
    # NOTE(review): the expected URL (port 8080, /login) differs from the
    # requested one -- presumably the server redirects; confirm.
    nose.eq_('http://127.0.0.1:8080/login', driver.current_url)

    #############
    # login
    #############
    # button click
    show_signin = driver.find_element_by_id('showSignIn')
    show_signin.click()

    # email: wait until the field is visible; the wait returns the element
    login_xpath = '//*[@id="user_email"]'
    login_id_form = wait.until(
        ec.visibility_of_element_located((By.XPATH, login_xpath)))
    login_id_form.clear()
    login_id_form.send_keys(email)

    # password: same pattern as the e-mail field
    password_xpath = '//*[@id="user_password"]'
    password_form = wait.until(
        ec.visibility_of_element_located((By.XPATH, password_xpath)))
    password_form.clear()
    password_form.send_keys(password)

    # submit
    submit_xpath = '//*[@id="new_user"]/div[4]/input'
    driver.find_element_by_xpath(submit_xpath).click()

    #############
    # result
    #############
    driver.get('http://127.0.0.1/users/edit')
    # wait until the page content is present
    wait.until(page_loaded)
    # check the current URL
    nose.eq_('http://127.0.0.1:8080/users/edit', driver.current_url)
    # confirm the login succeeded by checking an element shown on the page
    user_email = driver.find_element_by_xpath('//*[@id="user_email"]').get_attribute("value")
    nose.eq_(email, user_email)
finally:
    # always shut the PhantomJS process down, even when a check above fails
    driver.quit()
BeautifulSoup
BeautifulSoupを組み合わせて使うと、HTMLのパースが楽にできます。
$ pip install beautifulsoup4
以下のparserも入れておいたほうが無難です。
参考:Beautiful Soup 4.x では parser を明示指定しよう
$ pip install html5lib
$ pip install lxml
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
from bs4 import BeautifulSoup
# Fetch a page with headless PhantomJS and inspect it with BeautifulSoup.

#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS executable
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
try:
    # explicit-wait helper with a 5 second timeout (not used below)
    wait = WebDriverWait(driver, 5)

    #############
    # load page
    #############
    driver.get('http://127.0.0.1/users/edit')
    data = driver.page_source.encode('utf-8')
finally:
    # the page source is already captured, so the browser can be shut down
    driver.quit()

#############
# parse html
#############
# Name the parser explicitly so the result does not depend on which parser
# libraries happen to be installed.
html = BeautifulSoup(data, 'lxml')
print(html)
print(html.title)
print(html.title.string)
print(html.find('h1'))
print(html.find('select', {'id': 'hoge'}))
pandasでtableをparse
# -*- coding:utf-8 -*-
import pandas as pd
# Page whose HTML tables are loaded into DataFrames.
url = 'http://stocks.finance.yahoo.co.jp/stocks/history/?code=998407.O'
# read_html yields one DataFrame per table it finds; flavor='bs4' selects
# the BeautifulSoup-based parser.
tables = pd.read_html(url, flavor='bs4')
# Show the second table found on the page.
second_table = tables[1]
print(second_table)
urlだけでなく、htmlソースからもparseできます。
# -*- coding:utf-8 -*-
import pandas as pd
from io import StringIO

# Minimal HTML document holding a single one-column table with four rows.
html = '''
<html>
<body>
<table>
<tr><td>sample1</td></tr>
<tr><td>sample2</td></tr>
<tr><td>sample3</td></tr>
<tr><td>sample4</td></tr>
</table>
</body>
</html>
'''
# Hand the markup over as a file-like object: passing literal HTML text
# directly to read_html is deprecated in recent pandas releases, while a
# StringIO wrapper is accepted by old and new versions alike.
tables = pd.read_html(StringIO(html), flavor='bs4')
print(tables[0])
PhantomJS with pandas
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from bs4 import BeautifulSoup
import pandas as pd
# Fetch a page with headless PhantomJS, pick its first <table> with
# BeautifulSoup and load that table into a pandas DataFrame.
from io import StringIO

#############
# phantomjs
#############
# user agent
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# path to the PhantomJS executable
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
try:
    # explicit-wait helper with a 5 second timeout (not used below)
    wait = WebDriverWait(driver, 5)

    #############
    # load page
    #############
    driver.get('http://127.0.0.1/users/edit')
    data = driver.page_source.encode('utf-8')
finally:
    # the page source is already captured, so the browser can be shut down
    driver.quit()

# parse
soup = BeautifulSoup(data, 'lxml')
# first table on the page; raises IndexError when the page has no table
table = soup.find_all('table')[0]
# Wrap the markup in StringIO: passing literal HTML text directly to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))
print(df[0])