More than 3 years have passed since last update.

webスクレイピングを学んだ記事を誤って削除してしまったので、コードの供養をします

Last updated at 2021-02-22 / Posted at 2021-02-22

つい４時間前に帰宅してから
ずっと記事を書いていたのに
誤って削除をしてしまいました...
ので、コードの供養がてら記事を掲載します...

ポケモンのバトルデータを掲載しているサイトから
ランキング形式で表示されている値をスクレイピングして出力するものです。

引数にポケモンの名前を与えてやると
図鑑番号を検索して、URLに図鑑番号が入るので
うまいことページが拾えます。

main.py

import sys, requests
import tkinter
from bs4 import BeautifulSoup
import os
import psycopg2
from psycopg2.extras import DictCursor

# NOTE(review): XXXX looks like a redacted placeholder for the real database
# connection string — confirm and replace it before running; as written it is
# an undefined name.
DATABASE_URL = XXXX
# NOTE(review): not referenced anywhere else in the visible code.
reply_message = ""
# Pokémon name to look up, taken from the first command-line argument.
poke_name = sys.argv[1]

# Database connection
def get_connection():
    """Open and return a new psycopg2 connection using ``DATABASE_URL``."""
    connection = psycopg2.connect(DATABASE_URL)
    return connection

# Query Pokémon names that match the given name exactly or as a known variant.
def get_response_name(poke_name):
    """Return the ``name`` rows of ``poke_stat`` that match ``poke_name``.

    A row matches when its name equals ``poke_name``, starts with
    ``poke_name`` followed by ``(``, or starts with ``メガ`` followed by
    ``poke_name``.

    Args:
        poke_name: Name to search for.

    Returns:
        The list returned by ``cursor.fetchall()``; each row holds the single
        ``name`` column. The list is empty when nothing matches.
    """
    query = (
        "SELECT name FROM poke_stat "
        "WHERE name = %s OR name LIKE %s OR name LIKE %s"
    )
    # The values are passed as bound parameters instead of being formatted
    # into the SQL text, so quotes in the input cannot alter the statement.
    # LIKE wildcards (% and _) inside poke_name still act as wildcards, as
    # they did before.
    params = (poke_name, f"{poke_name}(%", f"メガ{poke_name}%")

    conn = get_connection()
    try:
        # psycopg2's connection context manager only ends the transaction;
        # the connection itself has to be closed explicitly.
        with conn, conn.cursor(cursor_factory=DictCursor) as cur:
            cur.execute(query, params)
            return cur.fetchall()
    finally:
        conn.close()

# Return the Pokédex number of the target Pokémon.
def get_response_no(poke_name):
    """Return the ``no`` rows of ``poke_stat`` whose name equals ``poke_name``.

    Args:
        poke_name: Exact name to look up.

    Returns:
        The list returned by ``cursor.fetchall()``; each row holds the single
        ``no`` column. The list is empty when no row has that name.
    """
    conn = get_connection()
    try:
        # psycopg2's connection context manager only ends the transaction;
        # the connection itself has to be closed explicitly.
        with conn, conn.cursor(cursor_factory=DictCursor) as cur:
            # Bound parameter instead of string formatting, so quotes in the
            # input cannot alter the statement.
            cur.execute("SELECT no FROM poke_stat WHERE name = %s", (poke_name,))
            return cur.fetchall()
    finally:
        conn.close()

# Search the table for the given name and return its Pokédex number.
def handle_message(poke_name):
    """Resolve ``poke_name`` to a zero-padded, four-digit Pokédex number.

    Returns:
        The number of the first matching row formatted as ``'{0:04d}'``, or
        the not-found message string when no name matches.
    """
    # Names that match the input exactly or as a variant.
    matches = get_response_name(poke_name)
    if not matches:
        return "そのようなポケモンは存在しません"

    # Only the first matching name is used for now.
    first_match = matches[0]
    number_rows = get_response_no(*first_match)
    first_number_row = number_rows[0]
    return '{0:04d}'.format(*first_number_row)


# ---webスクレイピング ---

# Build the URL of the page to scrape.
def get_url(poke_name):
    """Return the pokedb.tokyo page URL for ``poke_name``.

    Whatever ``handle_message`` returns is placed into the URL path as-is.
    """
    template = 'https://swsh.pokedb.tokyo/pokemon/show/{}-00?season=15&rule=0'
    return template.format(handle_message(poke_name))

# Headings inserted before each ranking, in the order the rankings appear.
_RANKING_LABELS = ("わざ", "とくせい", "せいかく", "もちもの", "同じチーム")


def _extract_chunks(raw_text):
    """Return the non-empty, stripped chunks of ``raw_text`` in order."""
    # Break into lines and strip leading/trailing whitespace on each.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    # Break runs separated by two spaces into one chunk each.
    chunks = (
        phrase.strip()
        for line in stripped_lines
        for phrase in line.split("  ")
    )
    # Drop blank chunks.
    return [chunk for chunk in chunks if chunk]


def _label_rankings(lines):
    """Insert a heading before each of the first five ``'1'`` entries.

    Returns:
        A tuple ``(positions, labelled)``: the indexes in ``lines`` whose
        value is ``'1'``, and a copy of ``lines`` with the headings inserted.
    """
    positions = [idx for idx, line in enumerate(lines) if line == '1']
    labelled = list(lines)
    # Each earlier insertion shifts later entries right by one, hence the
    # added offset. zip() stops after the last label, so only the first
    # five rankings get a heading.
    for offset, (position, label) in enumerate(zip(positions, _RANKING_LABELS)):
        labelled.insert(position + offset, label)
    return positions, labelled


# Scraping
def scraping_html(poke_name, timeout=10):
    """Fetch the page for ``poke_name`` and print its ranking entries.

    Prints, in order: the extracted text (one chunk per line), the list of
    indexes whose chunk is ``'1'``, and the same text with a heading inserted
    before each of the first five ``'1'`` entries.

    Args:
        poke_name: Pokémon name passed on to ``get_url``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = get_url(poke_name)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features='html.parser')

    # Empty this span before the text is collected. It may be missing from
    # the page, so guard against None.
    type_tag = soup.find('span', class_='tag is-type-9')
    if type_tag is not None:
        type_tag.clear()

    # Collect the text of every "level-item" div.
    level_text = "".join(
        item.get_text() for item in soup.find_all('div', class_='level-item')
    )

    complete = '\n'.join(_extract_chunks(level_text))
    print(complete)

    # One list entry per printed line (a single empty entry when no text
    # was found).
    lines = complete.split('\n')

    positions, labelled = _label_rankings(lines)
    print(positions)

    text = "".join(line + '\n' for line in labelled)
    print(text)

# Run: scrape and print the data for the name given on the command line.
scraping_html(poke_name)

参考にさせて頂いたサイト

コーディング
https://qiita.com/Chanmoro/items/db51658b073acddea4ac
削除
https://water2litter.net/rum/post/python_bs4_clear/
select,findの概念
https://gammasoft.jp/blog/difference-find-and-select-in-beautiful-soup-of-python/
文字だけ切り取る
https://qiita.com/poorko/items/9140c75415d748633a10
https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python

追記
削除しちゃった記事のバックアップってどっかに転がってないのか....調べても同じ事で困ってる人は出てこなかった

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up