##環境:
firefox
mozilla geckodriver(https://github.com/mozilla/geckodriver/releases)
python3.x, Mac
##使い方:
13-14行目:```amazonAccount = "account"``` と ```amazonPassword = "password"``` を個人のアカウントとパスワードに差し替え。
118行目:```years = range(2009, 2020)``` ← 抽出したい年範囲に
##outputエクセル:
A列:注文番号
C列:購入当時実際に支払った金額(送料など込み)
E列(checkPrice、送料別途)=D列明細内の各商品単価*数量
E列とC列が合致しない場合、下記の原因が考えられる:
・送料込みか別途発生する
・商品単価の変動
<img width="658" alt="output.png" src="https://qiita-image-store.s3.amazonaws.com/0/199800/6d7dfe0e-71a3-871b-9e58-b5cb50397d94.png">
```python
import os
import re
import math
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import requests
from tqdm import tqdm
import time
# --- Configuration and global state ---------------------------------------
amazonAccount = "account"    # replace with your Amazon account email
amazonPassword = "password"  # replace with your Amazon password
thisPath = os.path.dirname(os.path.abspath(__file__))
orders = {}  # order number -> order summary/details, filled in by parseHtml()
invisibleList = []  # orders with 4+ items: their detail-page URLs are stored here for a second pass
options = Options()
options.add_argument('-headless')
# NOTE(review): executable_path / firefox_options are the pre-Selenium-4 API —
# confirm the installed selenium version still accepts these keyword arguments.
browser = webdriver.Firefox(executable_path=os.path.join(thisPath, "geckodriver"), firefox_options=options)
# login
browser.get('https://www.amazon.co.jp/ap/signin?ie=UTF8&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.co.jp%2F%3Fref_%3Dnav_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=jpflex&_encoding=UTF8&openid.mode=checkid_setup&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&ignoreAuthState=1&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&ie=UTF8&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&fromAuthPrompt=1')
browser.find_element_by_name("email").send_keys(amazonAccount)
browser.find_element_by_name("password").send_keys(amazonPassword)
browser.find_element_by_name("rememberMe").click()
WebDriverWait(browser, 10).until(lambda x: x.find_element_by_id("signInSubmit")).click()
# order-history page (also confirms the login succeeded via the printed title)
browser.get('https://www.amazon.co.jp/gp/your-account/order-history')
print(browser.title)
# Copy the authenticated browser cookies into a requests.Session so pages can
# be fetched without driving the browser for every request.
s = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'])
s.headers.clear()
def getProductionNameAndNum(string):
    """Parse a product label into [name, quantity].

    The label may carry a trailing "、数量:N" suffix; when the suffix is
    absent the quantity defaults to 1.
    """
    parts = string.split("、数量:")
    quantity = 1 if len(parts) == 1 else int(parts[1])
    return [parts[0], quantity]
def getSummary(arr):
    """Normalize the order-header fields of one order.

    Strips all whitespace from each element's text and converts the price
    field (index 1, e.g. "¥ 1,234") to an int.

    Returns [orderDate, price, orderNumber].
    """
    fields = [re.sub(r'[\n\s]', '', element.text) for element in arr]
    fields[1] = int(re.sub(r'[,\s¥]', '', fields[1]))
    return fields
def getDetails(elements):
    """Extract line items (name, quantity, unit price) from order detail boxes.

    Args:
        elements: BeautifulSoup tags, one per shipment/detail box of an order.

    Returns:
        [results, checkPrice] where results is a list of
        {'商品名', '数量', '単価'} dicts and checkPrice = sum(単価 * 数量).
    """
    results = []
    for e in elements:
        # True when this box is NOT a regular shipment box
        # (e.g. an Android-app / digital order parsed via plain "a-box" divs).
        appFlag = "shipment" not in e.attrs['class']
        for d in e.select('.a-fixed-left-grid-col.a-col-right'):
            if appFlag:
                # Non-shipment boxes carry no price/quantity markup: record the
                # first row's trimmed text with quantity 1 and price 0, then
                # stop scanning this box.
                results.append({
                    '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', d.select(".a-row")[0].text),
                    '数量': 1,
                    '単価': 0})
                break
            # Regular shipment row: link text holds "name、数量:N",
            # the price node holds e.g. "¥ 1,234".
            tmp = getProductionNameAndNum(d.find(class_="a-link-normal").text)
            results.append({
                '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', tmp[0]),
                '数量': tmp[1],
                '単価': int(re.sub(r'[,\s\n¥]', '', d.select(".a-size-small.a-color-price")[0].text))
            })
    # Cross-check value: sum of unit price * quantity over all parsed items.
    checkPrice = sum(list(map(lambda x: x['単価'] * x['数量'], results)))
    return [results, checkPrice]
def parseHtml(html):
    """Parse one order-history page into the module-level `orders` dict.

    Orders that hide their items behind a "show all N items" link are queued
    on the module-level `invisibleList` for a later per-order fetch instead
    of being parsed inline.
    """
    orderHtmls = html.select(".a-box-group.a-spacing-base.order")  # all orders on this page (max 10)
    for o in orderHtmls:
        # header fields: [orderDate, price, orderNumber]
        headerHtml = o.select('.a-box.a-color-offset-background.order-info')[0]
        header = getSummary(headerHtml.select('.a-color-secondary.value'))
        orderNumber = header[2]
        orders[orderNumber] = {'注文日': header[0], '合計': header[1]}
        # Check for a "show all N items" link (orders with 4+ items). If
        # present, the details live on a dedicated page: queue the URL and
        # skip inline parsing of this order.
        invisible = o.select('.a-size-medium.a-link-emphasis')
        if len(invisible):
            invisibleList.append({"orderNumber": orderNumber, "url": invisible[0].get("href")})
            continue
        # details
        if o.select('.a-box.shipment'):  # physical product shipments
            detailBox = o.select('.a-box.shipment')
        else:  # android app / digital order
            # strictly match div class="a-box" only (no additional classes)
            detailBox = o.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['a-box'])
        d = getDetails(detailBox)
        orders[orderNumber]['明細'] = d[0]
        orders[orderNumber]['checkPrice'] = d[1]
def makeOrderUrls(orderNum, year):
    """Build the order-history page URLs for one year.

    Amazon paginates the history 10 orders per page, so `orderNum` orders
    need ceil(orderNum / 10) pages.
    """
    template = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_{}?ie=UTF8&orderFilter=year-{}&search=&startIndex={}"
    page_count = math.ceil(orderNum / 10)
    return [template.format(page + 1, str(year), page * 10)
            for page in range(page_count)]
years = range(2009, 2020)  # order period — adjust to the years you want to extract
for y in tqdm(years):
    first_page_url = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_1?ie=UTF8&orderFilter=year-{}&search=&startIndex=0".format(str(y))
    html = BeautifulSoup(s.get(first_page_url, headers=headers).text, "lxml")
    # total order count for the year, e.g. "12件" -> 12
    num_orders = int(html.find(class_='num-orders').text.replace("件", ""))
    #print("year:{}, order numbers: {}".format(y,num_orders))
    if num_orders == 0:
        continue
    page_urls = makeOrderUrls(num_orders, y)
    parseHtml(html)  # first page (already fetched above)
    time.sleep(1)    # be polite: throttle requests to Amazon
    for i in tqdm(range(1, len(page_urls))):  # second page onwards
        # print(page_urls[i])
        html = BeautifulSoup(s.get(page_urls[i], headers=headers).text, "lxml")
        parseHtml(html)
        time.sleep(1)
# Orders with 4+ items: fetch and parse each order's dedicated detail page.
for x in invisibleList:
    r = s.get("https://www.amazon.co.jp" + x['url'], headers=headers)
    html = BeautifulSoup(r.text, "lxml")
    d = getDetails(html.select('.a-box.shipment'))
    orders[x['orderNumber']]['明細'] = d[0]
    orders[x['orderNumber']]['checkPrice'] = d[1]
    time.sleep(1)
browser.quit()
# Export: one row per order number, columns from the per-order dicts.
df = pd.DataFrame.from_dict(orders, orient="index")
writer = pd.ExcelWriter("output.xlsx")
df.to_excel(writer, index=True, header=True, sheet_name="orders")
# NOTE(review): ExcelWriter.save() was removed in pandas 2.0 — use
# writer.close() (or a `with` block) on newer pandas versions.
writer.save()