0
0

More than 3 years have passed since last update.

Python Scraping get_ranker_categories

Last updated at Posted at 2020-04-17

目標

ソースコード・ファイル

get_ranker_categories.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
import csv

# Open Browser
# Build the Chrome launch options and start one module-level WebDriver session.
# `driver` is shared by get_categories() and by the script body below.
options = Options()
# Headless mode is left disabled here; uncomment the next line to enable it.
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') 
# NOTE(review): `executable_path` / `chrome_options` are the older Selenium
# keyword names and are reported as deprecated in newer releases -- confirm
# against the installed Selenium version before changing them.
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)

# CSV read by exchange_words(): column 0 = word to look for, column 1 = replacement.
dictionary = "/Users/micksmith/home/work/eBay/Python/word_dictionary.csv"
# CSV read by the script body: column 0 feeds the "list-of" URLs,
# column 1 feeds the "lists" URLs.
url_exchange = "/Users/micksmith/home/work/eBay/Python/url_exchange.csv"

def get_categories():
    """Collect category names from the page currently loaded in ``driver``.

    Returns:
        A ``(categories_entire, categories_channel)`` tuple of lists:

        * ``categories_entire`` -- the ``textContent`` of every element with
          class ``site__subItem`` on the page.
        * ``categories_channel`` -- the ``textContent`` of every
          ``site__subItem`` element nested inside an element with class
          ``-channels``.
    """
    sub_item_class = 'site__subItem'

    categories_entire = [
        item.get_attribute("textContent")
        for item in driver.find_elements_by_class_name(sub_item_class)
    ]

    # Only the "-channels" section is queried in addition to the full list:
    # the "-entertainment" / "-nerdy" lists were built but never returned,
    # which only cost extra WebDriver round trips.
    categories_channel = [
        content.get_attribute("textContent")
        for section in driver.find_elements_by_class_name('-channels')
        for content in section.find_elements_by_class_name(sub_item_class)
    ]

    return categories_entire, categories_channel

def exchange_words(word_dictionary, dictionary_path=None):
    """Replace words according to a two-column CSV conversion table.

    Each CSV row is ``before,after``. Rows are applied to every word in file
    order, so a word that has just been replaced can be replaced again by a
    *later* row whose ``before`` column matches the new value.

    Args:
        word_dictionary: Iterable of words to convert.
        dictionary_path: Path of the CSV conversion table. Defaults to the
            module-level ``dictionary`` path when omitted.

    Returns:
        A list with one entry per input word, in input order; words without
        a matching row are returned unchanged.
    """
    if dictionary_path is None:
        dictionary_path = dictionary

    print("word_dictionary:", word_dictionary)

    replacements = []
    # newline='' is what the csv module expects for files it parses itself.
    with open(dictionary_path, 'r', newline='') as f:
        for row in csv.reader(f):
            # Blank lines and one-column rows carry no mapping; skip them
            # instead of failing with IndexError.
            if len(row) >= 2:
                replacements.append((row[0], row[1]))

    word_results = []
    for word in word_dictionary:
        for before, after in replacements:
            if word == before:
                word = after
                print(word)
        word_results.append(word)

    return word_results


def _load_url_exchange(path):
    """Read the URL-segment table and return ``(list_of_words, lists_words)``.

    Column 0 holds category names served under ``/list-of/`` and column 1
    those served under ``/lists/``. Empty cells are placeholders and are
    ignored, as are blank rows.
    """
    list_of_words = set()
    lists_words = set()
    with open(path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue
            if row[0]:
                list_of_words.add(row[0])
            if len(row) > 1 and row[1]:
                lists_words.add(row[1])
    return list_of_words, lists_words


def _visit_category_pages(segment, names, delay=5):
    """Open ``https://www.ranker.com/<segment>/<name>?ref=mainnav`` for each name."""
    for name in names:
        # Spaces in a category name become hyphens in the URL.
        url = "https://www.ranker.com/" + segment + "/" + name.replace(" ", "-") + "?ref=mainnav"
        driver.get(url)
        print("URL:", url)
        time.sleep(delay)


def main():
    """Scrape the top-page categories and visit each category page in turn."""
    try:
        driver.get("https://www.ranker.com/")
        entire, channel = get_categories()

        # Channel entries are excluded; the rest are mapped to their URL words.
        items = exchange_words(list(set(entire) - set(channel)))

        list_of_words, lists_words = _load_url_exchange(url_exchange)

        list_of = []
        lists = []
        tags = []
        # dict.fromkeys() drops duplicates while keeping first-seen order,
        # so no page is visited twice.
        for item in dict.fromkeys(items):
            if item in list_of_words:
                list_of.append(item)
            elif item in lists_words:
                lists.append(item)
            else:
                # Anything not listed in the table is served under /tags/.
                tags.append(item)

        _visit_category_pages("list-of", list_of)
        _visit_category_pages("lists", lists)
        _visit_category_pages("tags", tags)
    finally:
        # Always close the browser, even when a step above raises.
        driver.quit()


if __name__ == "__main__":
    main()
url_exchange.csv
"film","albums"
"tv","beverages"
"comics",""
"tech",""
"science",""
"cars",""
"arts",""
"books",""
word_dictionary.csv
"movies","film"
"celebrity","celebrities"
"watchworthy","what to watch"
"anime","anime underground"
"cartoons","animated"
"athletes","best athletes"
"family","parenting"
"career","jobs"
"automotive","cars"
"art","arts"
"deep thoughts","thought provoking"
"libations","alcohol"
"healthy eating","dieting"

結果

  • デモ
    get_ranker_categories copy.gif

  • コマンド

URL: https://www.ranker.com/list-of/arts?ref=mainnav
URL: https://www.ranker.com/list-of/tech?ref=mainnav
URL: https://www.ranker.com/list-of/tv?ref=mainnav
…
URL: https://www.ranker.com/tags/college-sports?ref=mainnav

分析

  • URL の "hogehoge" の部分( list-of / lists / tags )はカテゴリごとに異なり規則性が無いため、独自の分類表( url_exchange.csv )を作成して対応する

    • https://www.ranker.com/hogehoge/category?ref=mainnav
      • list-of
      • lists
      • tags
  • URL の "category" の部分は、ページから取得した単語と一致しない場合があるため、独自の変換表( word_dictionary.csv )を作成して対応する

    • https://www.ranker.com/hogehoge/category?ref=mainnav

課題

  • 変数名が識別しづらい
  • 各関数の機能が適切かどうか
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0