More than 5 years have passed since last update.

Python Scraping get_ranker_categories

Last updated at 2020-04-17 / Posted at 2020-04-17

目標

以下のサイトから、各小カテゴリの URL を取得し、ページを開く
- https://www.ranker.com/

ソースコード・ファイル

get_ranker_categories.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
import csv

# Locations of the two CSV lookup tables read later in the script.
dictionary = "/Users/micksmith/home/work/eBay/Python/word_dictionary.csv"
url_exchange = "/Users/micksmith/home/work/eBay/Python/url_exchange.csv"

# Start a Chrome session that the rest of the script drives through `driver`.
options = Options()
# options.add_argument('--headless')  # uncomment to run without a visible window
for _chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(_chrome_flag)
driver = webdriver.Chrome(
    executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver",
    chrome_options=options,
)

def get_categories():
    """Collect sub-category labels from the page currently loaded in ``driver``.

    Returns:
        A ``(entire, channel)`` tuple of lists of strings:

        * ``entire``  - ``textContent`` of every element on the page whose
          class is ``site__subItem``.
        * ``channel`` - ``textContent`` of the ``site__subItem`` elements
          nested inside elements whose class is ``-channels``.
    """
    # find_elements(By.CLASS_NAME, ...) is available in both Selenium 3 and 4;
    # the find_elements_by_* helper methods were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    def text_contents(elements):
        # textContent (rather than .text) also yields labels of hidden nodes.
        return [element.get_attribute("textContent") for element in elements]

    categories_entire = text_contents(
        driver.find_elements(By.CLASS_NAME, 'site__subItem'))

    categories_channel = []
    for section in driver.find_elements(By.CLASS_NAME, '-channels'):
        categories_channel.extend(
            text_contents(section.find_elements(By.CLASS_NAME, 'site__subItem')))

    return categories_entire, categories_channel

def exchange_words(word_dictionary, dictionary_path=None):
    """Translate words through a two-column CSV lookup table.

    Args:
        word_dictionary: Iterable of words to translate.
        dictionary_path: Path of a CSV file whose rows are ``before,after``
            pairs. When omitted, the module-level ``dictionary`` path is used.

    Returns:
        A list with one entry per input word, in input order. A word that
        matches no ``before`` value is returned unchanged.
    """
    print("word_dictionary:", word_dictionary)
    if dictionary_path is None:
        dictionary_path = dictionary

    # The csv module expects files opened with newline=''; the explicit
    # encoding keeps the result independent of the platform default.
    with open(dictionary_path, 'r', newline='', encoding='utf-8') as f:
        # Blank or one-column rows carry no mapping; skip them rather than
        # failing with IndexError.
        replacements = [(row[0], row[1]) for row in csv.reader(f) if len(row) >= 2]

    word_results = []
    for word in word_dictionary:
        # Rows are applied in file order against the current value, so a later
        # row may rewrite the output of an earlier one.
        for before, after in replacements:
            if word == before:
                word = after
                print(word)
        word_results.append(word)

    return word_results


if __name__ == "__main__":
    # Script entry point: scrape the sub-category names from the Ranker top
    # page, sort them into the three URL families (list-of / lists / tags)
    # and open each resulting page in the browser.
    try:
        driver.get("https://www.ranker.com/")
        entire, channel = get_categories()

        # Keep only sub-categories that are not part of the channel menu and
        # rename them to the wording used in the site's URLs.
        items = exchange_words(list(set(entire) - set(channel)))

        # url_exchange.csv: column 0 holds words served under /list-of/,
        # column 1 holds words served under /lists/.
        word_list_of = []
        word_lists = []
        with open(url_exchange, 'r', newline='', encoding='utf-8') as f:
            for row in csv.reader(f):
                if len(row) < 2:
                    # Blank or one-column rows cannot be classified; skip
                    # them instead of failing with IndexError.
                    continue
                word_list_of.append(row[0])
                word_lists.append(row[1])

        list_of = []
        lists = []
        for item in items:
            for list_of_word, lists_word in zip(word_list_of, word_lists):
                if item == list_of_word:
                    list_of.append(list_of_word)
                elif item == lists_word:
                    lists.append(lists_word)

        # Everything not claimed by the CSV is served under /tags/.
        tags = set(items) - set(list_of + lists)

        for section, names in (("list-of", list_of), ("lists", lists), ("tags", tags)):
            for name in names:
                url = "https://www.ranker.com/" + section + "/" + name.replace(" ", "-") + "?ref=mainnav"
                driver.get(url)
                print("URL:", url)
                # Pause so each page stays visible before moving on.
                time.sleep(5)
    finally:
        # Always close the browser, even when scraping or navigation fails.
        driver.quit()

url_exchange.csv

"film","albums"
"tv","beverages"
"comics",""
"tech",""
"science",""
"cars",""
"arts",""
"books",""

word_dictionary.csv

"movies","film"
"celebrity","celebrities"
"watchworthy","what to watch"
"anime","anime underground"
"cartoons","animated"
"athletes","best athletes"
"family","parenting"
"career","jobs"
"automotive","cars"
"art","arts"
"deep thoughts","thought provoking"
"libations","alcohol"
"healthy eating","dieting"

結果

デモ
コマンド

URL: https://www.ranker.com/list-of/arts?ref=mainnav
URL: https://www.ranker.com/list-of/tech?ref=mainnav
URL: https://www.ranker.com/list-of/tv?ref=mainnav
…
URL: https://www.ranker.com/tags/college-sports?ref=mainnav

分析

各 URL の "hogehoge" に統一性が無いため、独自に分類して対応する( url_exchange.csv )
- https://www.ranker.com/hogehoge/category?ref=mainnav
  - list_of
  - lists
  - tags
各 URL の "category" の部分は、サイトから取得した単語と異なる場合があるため、独自に変換表( word_dictionary.csv )を作成して対応する
- https://www.ranker.com/hogehoge/category?ref=mainnav

課題

変数名が識別しづらい
各関数の機能分担が適切かどうか、見直しが必要

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up