Python
python3
BeautifulSoup

BeautifulSoupでノーベル賞受賞者の国籍を取得する(PJDV 5.7.3章)

概要

PJDV 5.8章p133

次の章では、高性能なスクレイピングライブラリScrapyを使って、本書の可視化に必要なすべてのノーベル賞データを取得します。

サンプルコード

pjdv_s5_7_3.py
# -*- coding: utf-8 -*-
import pjdv_s5_7_s5_7_1 as mylib1
import re

def get_nobel_winner_nationality(nobel_winner):
    """Return the nationality listed on a laureate's Wikipedia page.

    The page URL is built from the English Wikipedia base URL and
    ``nobel_winner["link"]``. The infobox rows are scanned for the first row
    whose header text is exactly "Nationality" and which also has a data
    cell; bracketed markers such as ``[1]`` are removed from the cell text
    and newlines are replaced with commas.

    Returns None when no such row is found.
    """
    # NOTE(review): mylib1.get_soup presumably fetches the URL and returns a
    # parsed BeautifulSoup document -- confirm against the helper module.
    page_url = mylib1.join_url('https://en.wikipedia.org', nobel_winner["link"])
    page = mylib1.get_soup(page_url)

    for row in page.select(".infobox tr"):
        header = row.select_one("th")
        # Rows without a header cell cannot be the nationality row.
        if header is None or header.text != "Nationality":
            continue
        cell = row.select_one("td")
        if cell is None:
            # A "Nationality" header without a data cell: keep scanning.
            continue
        without_refs = re.sub(r'\[.+?\]', "", cell.text)
        return re.sub(r'\n', ",", without_refs)
    return None

def main():
    """Look up nationalities for the first few laureates and print the hits.

    Obtains the laureate list from ``mylib1.get_nobel_laureates()``, calls
    ``get_nobel_winner_nationality`` for the first ``num`` entries, stores
    each result under the entry's ``"nationality"`` key, and finally prints
    the looked-up entries for which a nationality was found.
    """
    nobel_winners = mylib1.get_nobel_laureates()
    num = 20
    print("start get_nobel_winner_nationality ({0} of {1})".format(num, len(nobel_winners)))
    looked_up = nobel_winners[:num]
    for w in looked_up:
        # Flush so each progress dot shows up while the page is being fetched,
        # not all at once when the line is finally terminated.
        print(".", end="", flush=True)
        w["nationality"] = get_nobel_winner_nationality(w)
    print("done")
    # Test each entry's own value (the loop variable ``w`` would only reflect
    # the last entry), and restrict the filter to the entries that were
    # actually looked up, since only those are known to have the key set.
    dat = [x for x in looked_up if x["nationality"]]
    print(dat)

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

tips

In [2]: %load_ext autoreload

In [3]: %autoreload 2

In [4]: import pjdv_s5_7_s5_7_1 as mylib1

In [5]: import pjdv_s5_7_3 as mylib2