##環境:
firefox
mozilla geckodriver(https://github.com/mozilla/geckodriver/releases)
python3.x, Mac
##使い方:
13-14行目:```amazonAccount = "account"``` と ```amazonPassword = "password"``` を個人のアカウントとパスワードに差し替え。
118行目:```years = range(2009, 2020)``` ← 抽出したい年範囲に
##outputエクセル:
A列:注文番号
C列:購入当時実際に支払った金額(送料など込み)
E列(checkPrice、送料別途)=D列明細内の各商品単価*数量
E列とC列が合致しない場合、下記の原因が考えられる:
・送料込みか別途発生する
・商品単価の変動
<img width="658" alt="output.png" src="https://qiita-image-store.s3.amazonaws.com/0/199800/6d7dfe0e-71a3-871b-9e58-b5cb50397d94.png">
```python
import os
import re
import math
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import requests
from tqdm import tqdm
import time
# --- Configuration and global state ---------------------------------------
amazonAccount = "account"    # replace with your Amazon account email
amazonPassword = "password"  # replace with your Amazon password
thisPath = os.path.dirname(os.path.abspath(__file__))
orders = {}  # order number -> order summary/details, filled in by parseHtml()
invisibleList = []  # orders with 4+ items: their detail-page URLs are stored here for a second pass
options = Options()
options.add_argument('-headless')
# NOTE(review): executable_path / firefox_options are the pre-Selenium-4 API —
# confirm the installed selenium version still accepts these keyword arguments.
browser = webdriver.Firefox(executable_path=os.path.join(thisPath, "geckodriver"), firefox_options=options)
# login
browser.get('https://www.amazon.co.jp/ap/signin?ie=UTF8&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.co.jp%2F%3Fref_%3Dnav_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=jpflex&_encoding=UTF8&openid.mode=checkid_setup&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&ignoreAuthState=1&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&ie=UTF8&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&fromAuthPrompt=1')
browser.find_element_by_name("email").send_keys(amazonAccount)
browser.find_element_by_name("password").send_keys(amazonPassword)
browser.find_element_by_name("rememberMe").click()
WebDriverWait(browser, 10).until(lambda x: x.find_element_by_id("signInSubmit")).click()
# order-history page (also confirms the login succeeded via the printed title)
browser.get('https://www.amazon.co.jp/gp/your-account/order-history')
print(browser.title)
# Copy the authenticated browser cookies into a requests.Session so pages can
# be fetched without driving the browser for every request.
s = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'])
s.headers.clear()
def getProductionNameAndNum(string):
    """Parse a product label into [name, quantity].

    The label may carry a trailing "、数量:N" suffix; when the suffix is
    absent the quantity defaults to 1.
    """
    parts = string.split("、数量:")
    quantity = 1 if len(parts) == 1 else int(parts[1])
    return [parts[0], quantity]
def getSummary(arr):
    """Normalize the order-header fields of one order.

    Strips all whitespace from each element's text and converts the price
    field (index 1, e.g. "¥ 1,234") to an int.

    Returns [orderDate, price, orderNumber].
    """
    fields = [re.sub(r'[\n\s]', '', element.text) for element in arr]
    fields[1] = int(re.sub(r'[,\s¥]', '', fields[1]))
    return fields
def getDetails(elements):
    """Extract line items (name, quantity, unit price) from order detail boxes.

    Args:
        elements: BeautifulSoup tags, one per shipment/detail box of an order.

    Returns:
        [results, checkPrice] where results is a list of
        {'商品名', '数量', '単価'} dicts and checkPrice = sum(単価 * 数量).
    """
    results = []
    for e in elements:
        # True when this box is NOT a regular shipment box
        # (e.g. an Android-app / digital order parsed via plain "a-box" divs).
        appFlag = "shipment" not in e.attrs['class']
        for d in e.select('.a-fixed-left-grid-col.a-col-right'):
            if appFlag:
                # Non-shipment boxes carry no price/quantity markup: record the
                # first row's trimmed text with quantity 1 and price 0, then
                # stop scanning this box.
                results.append({
                    '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', d.select(".a-row")[0].text),
                    '数量': 1,
                    '単価': 0})
                break
            # Regular shipment row: link text holds "name、数量:N",
            # the price node holds e.g. "¥ 1,234".
            tmp = getProductionNameAndNum(d.find(class_="a-link-normal").text)
            results.append({
                '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', tmp[0]),
                '数量': tmp[1],
                '単価': int(re.sub(r'[,\s\n¥]', '', d.select(".a-size-small.a-color-price")[0].text))
            })
    # Cross-check value: sum of unit price * quantity over all parsed items.
    checkPrice = sum(list(map(lambda x: x['単価'] * x['数量'], results)))
    return [results, checkPrice]
def parseHtml(html):
    """Parse one order-history page into the module-level `orders` dict.

    Orders that hide their items behind a "show all N items" link are queued
    on the module-level `invisibleList` for a later per-order fetch instead
    of being parsed inline.
    """
    orderHtmls = html.select(".a-box-group.a-spacing-base.order")  # all orders on this page (max 10)
    for o in orderHtmls:
        # header fields: [orderDate, price, orderNumber]
        headerHtml = o.select('.a-box.a-color-offset-background.order-info')[0]
        header = getSummary(headerHtml.select('.a-color-secondary.value'))
        orderNumber = header[2]
        orders[orderNumber] = {'注文日': header[0], '合計': header[1]}
        # Check for a "show all N items" link (orders with 4+ items). If
        # present, the details live on a dedicated page: queue the URL and
        # skip inline parsing of this order.
        invisible = o.select('.a-size-medium.a-link-emphasis')
        if len(invisible):
            invisibleList.append({"orderNumber": orderNumber, "url": invisible[0].get("href")})
            continue
        # details
        if o.select('.a-box.shipment'):  # physical product shipments
            detailBox = o.select('.a-box.shipment')
        else:  # android app / digital order
            # strictly match div class="a-box" only (no additional classes)
            detailBox = o.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['a-box'])
        d = getDetails(detailBox)
        orders[orderNumber]['明細'] = d[0]
        orders[orderNumber]['checkPrice'] = d[1]
def makeOrderUrls(orderNum, year):
    """Build the order-history page URLs for one year.

    Amazon paginates the history 10 orders per page, so `orderNum` orders
    need ceil(orderNum / 10) pages.
    """
    template = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_{}?ie=UTF8&orderFilter=year-{}&search=&startIndex={}"
    page_count = math.ceil(orderNum / 10)
    return [template.format(page + 1, str(year), page * 10)
            for page in range(page_count)]
years = range(2009, 2020)  # order period — adjust to the years you want to extract
for y in tqdm(years):
    first_page_url = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_1?ie=UTF8&orderFilter=year-{}&search=&startIndex=0".format(str(y))
    html = BeautifulSoup(s.get(first_page_url, headers=headers).text, "lxml")
    # total order count for the year, e.g. "12件" -> 12
    num_orders = int(html.find(class_='num-orders').text.replace("件", ""))
    #print("year:{}, order numbers: {}".format(y,num_orders))
    if num_orders == 0:
        continue
    page_urls = makeOrderUrls(num_orders, y)
    parseHtml(html)  # first page (already fetched above)
    time.sleep(1)    # be polite: throttle requests to Amazon
    for i in tqdm(range(1, len(page_urls))):  # second page onwards
        # print(page_urls[i])
        html = BeautifulSoup(s.get(page_urls[i], headers=headers).text, "lxml")
        parseHtml(html)
        time.sleep(1)
# Orders with 4+ items: fetch and parse each order's dedicated detail page.
for x in invisibleList:
    r = s.get("https://www.amazon.co.jp" + x['url'], headers=headers)
    html = BeautifulSoup(r.text, "lxml")
    d = getDetails(html.select('.a-box.shipment'))
    orders[x['orderNumber']]['明細'] = d[0]
    orders[x['orderNumber']]['checkPrice'] = d[1]
    time.sleep(1)
browser.quit()
# Export: one row per order number, columns from the per-order dicts.
df = pd.DataFrame.from_dict(orders, orient="index")
writer = pd.ExcelWriter("output.xlsx")
df.to_excel(writer, index=True, header=True, sheet_name="orders")
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0 — use
# writer.close() (or a `with` block) on newer pandas versions.
writer.save()