More than 5 years have passed since last update.

Python Scraping eBay

Posted at 2020-04-13

目標

以下の項目を CSV ファイルから読み込む
- 検索キーワード
- 取得ページ
- 商品価格の最小値
- 商品価格の最大値
eBay にて商品情報を取得する
取得した商品情報を元に、以下のファイルを出力する
- CSV ファイル
  - 商品名
  - 商品状態
  - 販売価格
  - 送料
  - 合計額 (販売価格 + 送料)
- 辞書ファイル
  - 商品名に登場した単語の頻出度
  - 商品名に登場した単語の品詞分類
- グラフ
  - 合計額の度数分布(ヒストグラム)

ソースコード

from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import csv
import pandas as pd
import datetime
from collections import Counter
import collections
import numpy as np
import matplotlib as mpl
mpl.use('TkAgg')  # or whatever other backend that you want
import matplotlib.pyplot as plt
import nltk
import time
import random
import re

# # Input Keyword
# print("Input Keyword:")
# keyword = input().strip().replace(" ","+")

# # Input PageNumber
# print("Input PageNumber")
# page_num = int(input())
# # Input MaxPrice
# print("Input MaxPrice")
# x_max = int(input())

# Open Browser
# Build the Chrome options first, then start the single WebDriver session
# that the rest of the script uses for every page load.
options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(
    executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver",
    chrome_options=options,
)

# Result columns filled by the scraping loop: one entry per scraped listing
# is appended to each list.
names, statuses, prices, shippings, amounts = [], [], [], [], []

# Listings requested per result page (sent to eBay as the _ipg query
# parameter) and used to limit how many pages are fetched.
displayed_results = 200

# Input CSV. After a header row, each row holds: search keyword, number of
# pages to fetch, minimum price, maximum price.
Source_file = "/Users/micksmith/home/work/eBay/Python/movie anime.csv"

# Output directory prefixes; the file name is appended directly, so each one
# must end with "/".
CSV_file_path = "/Users/micksmith/home/work/eBay/Python/CSV/"            # scraped rows as CSV
keyword_file_path = "/Users/micksmith/home/work/eBay/Python/Keyword/"    # word-frequency lists
analysis_file_path = "/Users/micksmith/home/work/eBay/Python/Analysis/"  # NLTK tagging output
hist_file_path = "/Users/micksmith/home/work/eBay/Python/Hist/"          # histogram images


# First number inside a price label, e.g. "12.99" in "$12.99" or "1,234.50"
# in "+$1,234.50 shipping". Keeping the decimal point matters: stripping every
# non-digit would turn "$12.99" into 1299.
_MONEY_RE = re.compile(r"\d[\d,]*(?:\.\d+)?")


def _field_text(item, class_name):
    """Return the text of the child of *item* with *class_name*, or None if it is missing."""
    # Imported locally so this block brings its own dependency into scope.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )
    try:
        return item.find_element_by_class_name(class_name).text
    except (NoSuchElementException, StaleElementReferenceException):
        return None


def _parse_money(text):
    """Convert a price label to a float; return None when it contains no number."""
    if not text:
        return None
    match = _MONEY_RE.search(text)
    if match is None:
        return None
    return float(match.group().replace(",", ""))


# Main loop: for every row of the input CSV, scrape the requested result
# pages, then write the word-frequency file, the NLTK analysis file, the CSV
# of listings and a price histogram for that keyword.
try:
    with open(Source_file, "r", newline="") as source:
        reader = csv.reader(source)
        next(reader, None)  # skip the header row (tolerates an empty file)

        for row in reader:
            if not row:
                continue  # blank line in the input CSV

            keyword = row[0].strip().replace(" ", "+")
            page_num = int(row[1])
            x_min = int(row[2])
            x_max = int(row[3])

            # Start every keyword with empty columns; otherwise each output
            # file would also contain the listings of all previous keywords.
            names, statuses, prices, shippings, amounts = [], [], [], [], []

            # Read the total hit count so page_num can be capped.
            url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw={}&_sacat=0&_ipg={}'.format(keyword, displayed_results)
            driver.get(url)

            number_of_search = driver.find_element_by_css_selector('#mainContent > div.s-answer-region.s-answer-region-center-top > div > div.clearfix.srp-controls__row-2 > div > div.srp-controls__control.srp-controls__count > h1 > span:nth-child(1)').text
            number_of_search = int(re.sub("\\D", "", number_of_search))

            # Ceiling division: an exact multiple of the page size must not
            # add an extra page. At least one page is always requested.
            max_pages = max(1, -(-number_of_search // displayed_results))
            page_num = min(page_num, max_pages)
            print("number_of_search:", number_of_search)
            print("page_num:", page_num)
            time.sleep(3)

            print(keyword, page_num, x_max)
            for page in range(1, page_num + 1):
                # Search restricted by LH_Sold=1 / LH_Complete=1; the page
                # size is tied to displayed_results so it matches the cap above.
                url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1&_ipg={}&_pgn={}'.format(keyword, displayed_results, page)
                driver.get(url)

                print("Page_num:", page)
                time.sleep(random.randint(1, 3))  # pause between page loads

                for item in driver.find_elements_by_class_name('s-item__info.clearfix'):
                    # Name (" " placeholder when the element is missing)
                    title_text = _field_text(item, 's-item__title')
                    if title_text is None:
                        name = " "
                    else:
                        name = title_text.replace("NEW LISTING", "")
                    names.append(name)

                    # Status (" " placeholder when the element is missing)
                    status_text = _field_text(item, 'SECONDARY_INFO')
                    statuses.append(" " if status_text is None else status_text)

                    # Price: a range such as "$10.00 to $20.00" is recorded as
                    # 0; a missing or unparsable price is recorded as None.
                    price_text = _field_text(item, 's-item__price')
                    if price_text is not None and 'to' in price_text:
                        price = 0.0
                    else:
                        price = _parse_money(price_text)
                    prices.append(price)

                    # Shipping cost: missing or non-numeric text counts as 0.
                    shipping = _parse_money(_field_text(item, 's-item__logisticsCost')) or 0.0
                    shippings.append(shipping)

                    # Total is only defined when the price itself is known.
                    amounts.append(None if price is None else price + shipping)

                    print(name)
                    print(price)
                    print(shipping)

            # TitleExtraction: frequency of the words appearing in the names
            titles = ' '.join(names)
            keyword = keyword.replace("+", " ")
            date = datetime.datetime.today().strftime("%Y%m%d")

            word_counts = collections.Counter(titles.split()).most_common(200)
            keyword_file_name = keyword_file_path + keyword + "_" + str(page_num) + "_" + date + ".txt"
            with open(keyword_file_name, 'w') as keyword_file:
                for word, count in word_counts:
                    keyword_file.write('{0} {1}\n'.format(word, count))

            # MorphologicalAnalysis: tokenize, POS-tag and chunk the names
            tokens = nltk.word_tokenize(titles)
            tagged = nltk.pos_tag(tokens)
            entitles = nltk.chunk.ne_chunk(tagged)
            analysis_file_name = analysis_file_path + keyword + "_" + str(number_of_search) + "_" + date + ".txt"
            with open(analysis_file_name, mode="w") as analysis_file:
                analysis_file.write(str(entitles))

            # CSV of the scraped listings (columns assigned one by one to keep
            # their order on every Python version).
            df = pd.DataFrame()
            df['name'] = names
            df['status'] = statuses
            df['price'] = prices
            df['shippingcost'] = shippings
            df['amount'] = amounts

            csv_file_name = CSV_file_path + keyword + "_" + str(number_of_search) + "_" + date + ".csv"
            df.to_csv(csv_file_name)

            # Histogram of prices within [x_min, x_max], aiming at 20 bins.
            span = x_max - x_min
            if span > 0:
                interval = max(span // 20, 1)       # never 0, even for spans < 20
                bin_num = max(span // interval, 1)
                known_prices = [p for p in prices if p is not None]

                plt.figure()  # fresh figure so keywords are not drawn on top of each other
                plt.title(keyword + "_" + str(number_of_search) + "_" + date)
                plt.xlabel("Price")
                plt.ylabel("Frequency")
                plt.xticks(np.arange(x_min, x_max, interval), rotation=90)
                plt.hist(known_prices, range=(x_min, x_max), bins=bin_num)
                plt.savefig(hist_file_path + keyword + "_" + str(number_of_search) + "_" + date + ".png")
                plt.close()
            else:
                print("Skipping histogram for {}: x_max ({}) must be greater than x_min ({})".format(keyword, x_max, x_min))
finally:
    # Always shut the browser down, including when a page load or parse fails.
    driver.quit()

結果

分析

以下のエラーメッセージが頻発する

 Traceback (most recent call last):
  File "/Users/UserName/home/work/eBay/Python/eBay_Scraping.py", line 89, in <module>
    driver.get(url)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id

原因として、以下のことが考えられる
- 同一セッションID で連続してアクセスすると、サーバ上でブロックされる
- 対処法について検討中

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up