More than 5 years have passed since last update.

Pythonと楽天APIで楽天市場の特定ジャンルに属する商品データをCSV化する。

Posted at 2019-09-14

流れ

スクレイピングでshopCodeを取得
shopCodeを使って楽天APIを叩いて結果を取得
CSV化して保存。

工夫した所

楽天市場のサイトは150ページまでしか開けない仕様になっているので、検索結果が（45件/ページ×150ページ＝）6750件を下回るように工夫した。
楽天APIは一回の検索につき最大3000件（1ページにつき最大30件で100ページまで可能）までしかデータを取得できないので、該当製品の数が3000件を下回るように工夫した。

スクレイピングでshopCodeを取得

shopcode.py

# 必要なライブラリを取得
from urllib import request
from bs4 import BeautifulSoup
import time
import csv
import requests
import urllib.parse
import time
import pickle
from http.client import HTTPException

# --- Placeholders: replace all three before running -------------------------
# Search keyword (percent-encoded when the URL is built).
SEARCH_WORD = '調べたい言葉'
# Rakuten genre ID that the search is restricted to.
GENRE_ID = 'ジャンルId'
# Number of price bands to scan. Must be replaced with an int: the price of
# the most expensive item in the search results divided by 50.
number = '検索結果のうちもっとも高額な商品の値段を50で割った値を代入してください。'

PRICE_STEP = 50        # width of one price band, in yen
LAST_PAGE = 150        # the search site does not serve pages beyond 150
REQUEST_INTERVAL = 2   # seconds to wait after every request


def build_search_url(word, genre_id, min_price, max_price, page):
    """Return the search-result URL for one price band and page number.

    Args:
        word: Search keyword; it is percent-encoded here.
        genre_id: Genre ID inserted as a path segment (not encoded).
        min_price: Lower price bound of the band.
        max_price: Upper price bound of the band.
        page: 1-based result page number.
    """
    return (
        "https://search.rakuten.co.jp/search/mall/"
        + urllib.parse.quote(word)
        + "/" + str(genre_id)
        + "/?max=" + str(max_price)
        + "&min=" + str(min_price)
        + "&p=" + str(page)
    )


def extract_item_urls(soup):
    """Return the item links found in the ``<h2>`` headings of a result page.

    Headings without a link, links without an href, and links containing
    'redirect_rpp' are skipped, so one odd heading no longer discards the
    remaining items of the page.
    """
    urls = []
    for heading in soup.find_all('h2'):
        anchor = heading.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href and 'redirect_rpp' not in href:
            urls.append(href)
    return urls


def fetch_soup(url):
    """Download *url* and return it parsed, or None if the download failed.

    The pause runs after every attempt, successful or not, so failed pages
    do not turn the loop into back-to-back requests.
    """
    try:
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(url) as response:
            body = response.read()
    except (OSError, HTTPException):
        # URLError/HTTPError are OSError subclasses; HTTPException covers
        # protocol-level failures such as a truncated body.
        print("ページが見つかりません")
        print(url)
        return None
    finally:
        time.sleep(REQUEST_INTERVAL)
    return BeautifulSoup(body, "html.parser")


def collect_item_urls(band_count):
    """Walk every price band and result page and gather the item URLs.

    Args:
        band_count: Number of consecutive PRICE_STEP-wide bands, starting at 0.

    Returns:
        List of item URLs in the order they were found (may hold duplicates).
    """
    url_list = []
    for n in range(band_count):
        low = PRICE_STEP * n
        high = PRICE_STEP * (n + 1)
        for num in range(1, LAST_PAGE + 1):
            url = build_search_url(SEARCH_WORD, GENRE_ID, low, high, num)
            soup = fetch_soup(url)
            if soup is None:
                continue
            # A page after the first that has no "previous page" link means
            # the band has fewer pages than requested: stop paging this band.
            no_previous = soup.find(
                'a', class_="item -previous previousPage") is None
            if num != 1 and no_previous:
                print('途中で終わった。')
                break
            url_list.extend(extract_item_urls(soup))
            print(url)
            print(len(url_list))
    return url_list


def main():
    """Scrape the item URLs and pickle them to url_list.txt."""
    url_list = collect_item_urls(number)
    print('url_listの長さ')
    print(len(url_list))

    # Save the collected list; `with` guarantees the file is flushed and closed.
    with open('url_list.txt', 'wb') as f:
        pickle.dump(url_list, f)


if __name__ == "__main__":
    main()

以上のコードをpython shopcode.pyで実行してください。

作った商品URLリストを処理する。

shori.py


import pickle
import collections

import re

# Captures the first path segment (the shop code) of an item page URL.
ITEM_URL_PATTERN = re.compile(r'https?://item\.rakuten\.co\.jp/([^/?#]+)')


def extract_shop_code(item_url):
    """Return the shop code of an item.rakuten.co.jp URL, or None.

    The shop code is the first path segment after the host, so a trailing
    slash, a missing trailing slash, or a query string do not affect it.
    URLs on any other host return None.
    """
    match = ITEM_URL_PATTERN.match(item_url)
    return match.group(1) if match else None


def main():
    """Reduce the pickled item URL list to a de-duplicated shop-code list."""
    # First, load the URL list created by the previous script.
    # NOTE: pickle must only be used on files this tool wrote itself.
    with open("./url_list.txt", "rb") as f:
        url_list = pickle.load(f)

    re_list = []
    for item_url in url_list:
        shop_code = extract_shop_code(item_url)
        if shop_code is not None:
            re_list.append(shop_code)

    # Sorted so the saved list is identical from run to run.
    re_list1 = sorted(set(re_list))
    c = collections.Counter(re_list)
    print('店舗ごとの商品数')
    print(c)
    print('もっとも商品が多いshopとその商品数')
    # most_common(1) prints both the shop and its count, and is simply
    # empty when no URL was usable (max() would raise on an empty counter).
    print(c.most_common(1))
    print('重複を除く前のshop数')
    print(len(re_list))
    print('重複を除いたshop数')
    print(len(re_list1))

    # Save the de-duplicated shop list as re_list.txt.
    with open('re_list.txt', 'wb') as f:
        pickle.dump(re_list1, f)


if __name__ == "__main__":
    main()

楽天APIを叩いて情報を取得

api.py

import csv
import sys
import codecs
import math
import random
import requests
from time import sleep
import time
import re
import pickle

url = 'https://app.rakuten.co.jp/services/api/IchibaItem/Search/20170706'

# --- Placeholders: replace both before running ------------------------------
APPLICATION_ID = 'アプリケーションIDを入力'
GENRE_ID = '欲しいジャンルのIDを入力'

HITS_PER_PAGE = 30      # items requested per call (the maximum is 30)
MAX_API_PAGES = 100     # assumed API page limit (3000 items / 30 per page)
REQUEST_INTERVAL = 1.5  # seconds to wait after every request
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned

# Words looked for in the item caption; matches go into the 'feature' column.
keywords = ["美味しい", "辛い", "しょっぱい"]
# Tag ID -> label; labels of matching tags go into the 'type' column.
type_dic = {1003741: '軟水', 1003742: '中硬水', 1003743: '硬水'}


def page_count(total):
    """Return how many pages are needed for *total* hits, capped at the limit."""
    return min(math.ceil(total / HITS_PER_PAGE), MAX_API_PAGES)


def build_doc(item):
    """Convert one API ``Item`` dict into a flat record for the CSV.

    'feature' lists the keywords found in the caption and 'type' lists the
    labels of the item's tags that appear in ``type_dic``; both stay "" when
    nothing matched. An item without images gets an empty 'image' instead of
    raising.

    Raises:
        KeyError: If a required field is missing from *item*.
    """
    caption = item["itemCaption"]
    tag_ids = item['tagIds']
    images = item['mediumImageUrls']
    doc = {
        'name': item['itemName'],
        'url': item['itemUrl'],
        'item_code': item['itemCode'],
        'image': images[0]['imageUrl'] if images else "",
        'price': item['itemPrice'],
        'shopname': item['shopName'],
        'tags': tag_ids,
        'explanation': caption,
        'feature': "",
        'type': "",
    }
    matched_words = [word for word in keywords if word in caption]
    if matched_words:
        doc['feature'] = matched_words
    matched_types = [type_dic[tag] for tag in tag_ids if tag in type_dic]
    if matched_types:
        doc['type'] = matched_types
    return doc


def fetch_page(shop, page):
    """Request one result page for *shop*.

    Returns:
        ``(total, docs)`` where *total* is the hit count reported by the API
        and *docs* the records built from the page, or None when the request
        failed or the response was not a normal result.
    """
    payload = {
        'applicationId': APPLICATION_ID,
        'hits': HITS_PER_PAGE,
        "genreId": GENRE_ID,
        'shopCode': shop,
        'page': page,
    }
    try:
        r = requests.get(url, params=payload, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        resp = r.json()
        total = int(resp['count'])
        items = resp['Items']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Network/HTTP failure, non-JSON body, or a body without the
        # expected 'count'/'Items' keys.
        print('無理だった！！！！', shop, page, repr(exc))
        return None
    finally:
        time.sleep(REQUEST_INTERVAL)

    docs = []
    for entry in items:
        try:
            docs.append(build_doc(entry['Item']))
        except (KeyError, IndexError, TypeError) as exc:
            # Skip only the malformed item, not the rest of the page.
            print('無理だった！！！！', shop, page, repr(exc))
    return total, docs


def load_shop_codes(path='re_list.txt'):
    """Load the pickled shop-code list written by the previous script.

    NOTE: pickle must only be used on files this tool wrote itself.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def collect_docs(shops):
    """Fetch every result page of every shop and return all item records."""
    doc_list = []  # the final result accumulates here
    for shop in shops:
        first = fetch_page(shop, 1)
        if first is None:
            # Without page 1 the page count is unknown; go to the next shop.
            continue
        total, docs = first
        doc_list.extend(docs)
        print(len(doc_list))

        # Pages 2 and later.
        for pagenum in range(2, page_count(total) + 1):
            result = fetch_page(shop, pagenum)
            if result is None:
                continue
            doc_list.extend(result[1])
            print(len(doc_list))
    return doc_list


def main():
    """Collect the item data for all shops and write it to info.csv."""
    # Imported here because only the export step needs pandas.
    import pandas as pd

    doc_list = collect_docs(load_shop_codes())

    # Convert to CSV.
    data = pd.DataFrame(doc_list)
    if not data.empty:
        # An empty frame has no 'item_code' column to de-duplicate on.
        data = data.drop_duplicates(subset='item_code')
    data.to_csv('info.csv', encoding='utf_8_sig')


if __name__ == "__main__":
    main()

参考文献

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up