pythonを用いてGoogle Scholarの論文タイトルを取得する方法

Last updated at 2025-03-02 / Posted at 2024-12-08

概要

pythonのtkinter上でGoogle Scholarの論文タイトルを検索して
論文のURLを表示するコードを書いてみました。
まずはAIチャットアプリを参考に土台を作っていきます。
なお、メモ程度に書いてますのでご了承ください。

実行環境

・windows：Windows11Pro 23H2
・python：3.12.3

GUIデザイン

1)ジャンルと検索キーワード、ページ数や発行年等を入力して検索ボタンを押下すると、
一つの論文につき以下の情報が表示されます。
・Original:論文のタイトル
・Translation:論文の日本語訳タイトル
・Citations:被引用数
・Authors:論文の著者（3名まで表示）
・Link:原著論文のURL

2)過去の履歴を閲覧することができます。
　過去の履歴はhistory.txtにタイムスタンプやタイトル、リンクが出力されます。

実行方法

py .\test.py

ソースコード

検索ジャンルは以下のjsonファイルに記載しています。

keywords.json

{
    "keywords": {
        "AI": ["機械学習", "人工知能", "生成AI", "マルチモーダルAI", "モデル最適化", "TinyML", "ディープフェイク技術", "AI倫理", "ヘルスケアにおけるAI", "気候変動におけるAI"],
        "Data Science": ["データサイエンス", "ビッグデータ", "生成AI", "ディープフェイク技術", "TinyML", "データプライバシー", "予測分析", "データビジュアライゼーション", "AI駆動の意思決定", "リアルタイムデータ処理"],
        "Biology": ["ゲノミクス", "CRISPR遺伝子編集", "合成生物学", "神経科学", "バイオテクノロジー", "エピジェネティクス", "分子生物学", "バイオインフォマティクス", "免疫療法", "細胞生物学"],
        "Physics": ["量子コンピューティング", "ダークマター", "ニュートリノ物理学", "重力波", "素粒子物理学", "弦理論", "宇宙論", "天体物理学", "超伝導", "量子もつれ"],
        "Chemistry": ["ナノテクノロジー", "グリーンケミストリー", "生化学", "材料科学", "製薬", "触媒", "有機化学", "無機化学", "高分子科学", "化学工学"],
        "Earth Science": ["気候変動", "地熱エネルギー", "自然災害", "海洋学", "プレートテクトニクス", "土壌科学", "水文学", "環境地質学", "鉱物資源", "地震予測"],
        "Astronomy": ["系外惑星", "ブラックホール", "宇宙マイクロ波背景放射", "ダークエネルギー", "星の進化", "銀河形成", "超新星", "中性子星", "重力レンズ効果", "宇宙論"],
        "Environmental Science": ["持続可能性", "保全生物学", "汚染制御", "生態系サービス", "気候変動", "生物多様性", "環境政策", "再生可能エネルギー", "廃棄物管理", "環境影響評価"],
        "Geology": ["火山学", "古生物学", "鉱物学", "地震予測", "地熱エネルギー", "堆積学", "構造地質学", "地形学", "岩石学", "テクトニクス"],
        "Oceanography": ["海洋生物学", "海洋酸性化", "深海探査", "気候変動", "海洋生態系", "海流", "生物地球化学的循環", "漁業科学", "沿岸プロセス", "熱水噴出口"],
        "Meteorology": ["気候モデリング", "天気予報", "大気化学", "竜巻", "ハリケーン", "気候変動", "空気品質", "降水パターン", "嵐の追跡", "気候変動"],
        "Materials Science": ["ナノ材料", "バイオマテリアル", "複合材料", "スマートマテリアル", "エネルギー貯蔵", "ウェアラブル技術", "自己修復材料", "3Dプリンティング", "メタマテリアル", "先端セラミックス"]
    }
}

検索結果のリンクをクリックした論文の履歴（タイムスタンプ、タイトル、URL）はhistory.txtに出力されます。

history.txt

2024-12-14 14:33:20	[PDF][PDF] PROPOSAL DETAILS	https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=6cf35ec34efa592f83e3a1b748aea14957fc784a
2024-12-14 14:35:36	Better liked than right: Trustworthiness and expertise as factors in credibility	https://www.osti.gov/etdeweb/biblio/8497791
2024-12-14 14:39:20	Efficient visible light nitrogen fixation with BiOBr nanosheets of oxygen vacancies on the exposed {001} facets	https://arxiv.org/abs/1502.05698
2024-12-14 14:41:21	The number N of galactic civilizations must be either very large or very small	https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/JA085iA11p05909
2024-12-14 14:48:48	[HTML][HTML] Nanotechnology: the future medicine	https://journals.sagepub.com/doi/abs/10.1177/0960327115603588
2024-12-14 14:57:36	[PDF][PDF] PROPOSAL DETAILS	https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=61a4ba18fbcb8be873b18a4a68e0b0167d90c32a

test.py

import tkinter as tk
from tkinter import scrolledtext, Spinbox, StringVar, OptionMenu
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
import webbrowser
import json
from datetime import datetime

# Load the per-genre search keywords from a JSON file
def load_keywords_from_json(file_path):
    """Return the ``keywords`` mapping stored in the JSON file at ``file_path``.

    The file is read as UTF-8. If the top-level object has no ``keywords``
    key, an empty dict is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    parsed = json.loads(raw_text)
    return parsed.get('keywords', {})

# Translate text with the shared translator instance
def translate_text(text, src='ja', dest='en'):
    """Translate ``text`` from ``src`` to ``dest`` and return the result.

    Args:
        text: The string to translate.
        src: Source language code. Defaults to ``'ja'`` so existing callers
            keep translating Japanese keywords into English.
        dest: Destination language code. Defaults to ``'en'``.

    Returns:
        The translated string. Empty or whitespace-only input is returned
        unchanged without calling the translator.
    """
    # Nothing to translate; skip the translation call entirely.
    if not text or not text.strip():
        return text
    translated = translator.translate(text, src=src, dest=dest)
    return translated.text

def search_google_scholar(query, num_pages, start_year, end_year):
    """Scrape Google Scholar result pages for ``query``.

    Args:
        query: Search string sent as the ``q`` parameter.
        num_pages: Number of result pages to fetch; page ``i`` is requested
            with ``start = i * 10``.
        start_year: Lower publication-year bound (``as_ylo``).
        end_year: Upper publication-year bound (``as_yhi``).

    Returns:
        A list of ``(title, translated_title, link, citation, authors_list)``
        tuples. ``translated_title`` is the title translated from English to
        Japanese, ``citation`` is the text of the "Cited by N" link (or
        ``'Cited by 0'`` when the result has none) and ``authors_list`` holds
        at most three author names. Results whose title has no hyperlink are
        skipped because there is no URL to show.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    base_url = 'https://scholar.google.com/scholar'
    params = {
        'q': query,
        'hl': 'en',
        'as_ylo': start_year,
        'as_yhi': end_year
    }
    results = []

    for page in range(num_pages):
        params['start'] = page * 10
        # A timeout keeps the GUI from hanging forever on a stalled connection.
        response = requests.get(base_url, params=params, timeout=10)
        # Surface HTTP errors instead of silently parsing an error page
        # into an empty result list.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for result in soup.find_all('div', class_='gs_ri'):
            title_tag = result.find('h3', class_='gs_rt')
            if title_tag is None:
                continue
            title = title_tag.text

            # Take the link from the title itself; the first <a> of the whole
            # result is not guaranteed to be the title link.
            anchor = title_tag.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            link = anchor['href']

            # Results are requested with hl=en, so translate English -> Japanese.
            translated_title = translator.translate(title, src='en', dest='ja').text

            details_tag = result.find('div', class_='gs_a')
            details = details_tag.text if details_tag is not None else ''
            # Normalise non-breaking spaces so the ' - ' separator is found,
            # and use partition() so a missing separator cannot raise.
            authors = details.replace('\xa0', ' ').partition(' - ')[0]
            # Keep only the first three authors.
            authors_list = [name.strip() for name in authors.split(',') if name.strip()][:3]

            # Locate the citation link by its text rather than by position;
            # the footer links vary between results.
            citation = 'Cited by 0'
            for footer_link in result.find_all('a'):
                link_text = footer_link.text.strip()
                if link_text.startswith('Cited by'):
                    citation = link_text
                    break

            results.append((title, translated_title, link, citation, authors_list))

    return results

def on_search():
    """Run a search from the current form values and render the results.

    Reads the keyword, page count and year range from the input widgets,
    translates the keyword with ``translate_text``, calls
    ``search_google_scholar`` and writes one entry per result into
    ``text_area``. Each entry's "Link" label is clickable: clicking opens the
    URL in the browser and appends the paper to the history file.
    """
    keyword_jp = keyword_spinbox.get()
    query = translate_text(keyword_jp)
    num_pages = int(pages_spinbox.get())
    start_year = int(start_year_spinbox.get())
    end_year = int(end_year_spinbox.get())

    # Report request failures in the result area instead of leaving stale
    # results on screen with only a console traceback.
    error_message = None
    try:
        results = search_google_scholar(query, num_pages, start_year, end_year)
    except requests.RequestException as exc:
        results = []
        error_message = f'Search failed: {exc}\n'

    text_area.config(state=tk.NORMAL)
    text_area.delete(1.0, tk.END)

    if error_message is not None:
        text_area.insert(tk.END, error_message)

    for original_title, translated_title, link, citation, authors in results:
        authors_str = ", ".join(authors)
        text_area.insert(tk.END, f'Original: {original_title}\nTranslation: {translated_title}\nCitations: {citation}\nAuthors: {authors_str}\n')

        # Add the hyperlink; the URL doubles as the tag name
        text_area.insert(tk.END, "Link", link)
        text_area.insert(tk.END, f': {link}\n\n')
        text_area.insert(tk.END, '-'*80 + '\n')  # separator line
        text_area.tag_config(link, foreground="blue", underline=True)

        # Bind both url and title as defaults so each handler keeps the values
        # of its own result; a plain closure would see the last loop iteration.
        def open_and_record(event, url=link, title=original_title):
            webbrowser.open_new(url)
            save_history(title, url)

        text_area.tag_bind(link, "<Button-1>", open_and_record)

    text_area.config(state=tk.DISABLED)

def save_history(title, url):
    """Append one tab-separated ``timestamp, title, url`` record to history.txt.

    The history file uses tabs as field separators and one line per record,
    so whitespace runs (including tabs and newlines) inside ``title`` are
    collapsed to single spaces and whitespace inside ``url`` is removed before
    writing. This keeps every record parseable as exactly three fields.

    Args:
        title: Title of the paper whose link was opened.
        url: URL that was opened.
    """
    clean_title = " ".join(title.split())
    clean_url = "".join(url.split())
    with open("history.txt", "a", encoding='utf-8') as file:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        file.write(f"{timestamp}\t{clean_title}\t{clean_url}\n")

def show_history():
    """Open a window listing the records stored in history.txt.

    Each well-formed line (three tab-separated fields: timestamp, title, URL)
    is shown with the URL rendered as a clickable link that opens in the
    browser. Lines that do not have exactly three fields are shown verbatim.
    A missing history file results in an empty window.
    """
    try:
        with open("history.txt", "r", encoding='utf-8') as file:
            history_data = file.readlines()
    except FileNotFoundError:
        history_data = []

    history_window = tk.Toplevel(root)
    history_window.title("検索履歴")
    history_text = scrolledtext.ScrolledText(history_window, wrap=tk.WORD, width=120, height=30, font=font)
    history_text.grid(row=0, column=0, padx=10, pady=10)

    for entry in history_data:
        # readlines() keeps the line terminator; strip it so the URL field
        # (and the tag named after it) does not end with a newline.
        parts = entry.rstrip('\n').split('\t')
        if len(parts) == 3:
            timestamp, title, url = parts
            history_text.insert(tk.END, f'{timestamp}\t{title}\t', 'normal')
            history_text.insert(tk.END, f'{url}\n', (url, 'hyperlink'))

            # The URL doubles as the tag name; url is bound as a default so
            # each handler opens its own entry's link.
            history_text.tag_config(url, foreground="blue", underline=True)
            history_text.tag_bind(url, "<Button-1>", lambda e, url=url: webbrowser.open_new(url))
        else:
            history_text.insert(tk.END, entry)

def update_keywords(*args):
    """Refresh the keyword spinbox to match the currently selected genre.

    Used as the genre menu's ``command`` callback; any positional arguments
    passed by the caller are ignored. The keyword variable is set to the
    genre's first keyword, or to an empty string when the genre has none.
    """
    selected_genre = genre_var.get()
    genre_keywords = keywords_by_genre.get(selected_genre, [])
    if genre_keywords:
        initial_keyword = genre_keywords[0]
    else:
        initial_keyword = ""
    keyword_var.set(initial_keyword)
    keyword_spinbox["values"] = genre_keywords

def open_google_scholar():
    """Open the Google Scholar home page via the ``webbrowser`` module."""
    scholar_home = "https://scholar.google.com/"
    webbrowser.open(scholar_home)

# Create the translator instance used by the translation helpers
translator = Translator()

# GUI setup
root = tk.Tk()
root.title("Google Scholar Scraper")

# Initial window size (width x height)
root.geometry("1200x600")

# Custom font and styling
font = ("Helvetica", 12)
bg_color = "#f0f0f0"
button_color = "#4caf50"
button_font = ("Helvetica", 12, "bold")

root.configure(bg=bg_color)

# Load the keywords from the JSON file
keywords_by_genre = load_keywords_from_json('keywords.json')
genres = list(keywords_by_genre.keys())

# Upper bound for the publication-year inputs; taken from the clock so the
# range does not go stale as a hard-coded year would.
current_year = datetime.now().year

# Genre selection (option menu)
tk.Label(root, text="ジャンル:", font=font, bg=bg_color).grid(row=0, column=0, padx=10, pady=10, sticky="e")
genre_var = StringVar(root)
genre_var.set(genres[0])  # start with the first genre selected
genre_menu = OptionMenu(root, genre_var, *genres, command=update_keywords)
genre_menu.config(font=font)
genre_menu.grid(row=0, column=1, padx=10, pady=10, sticky="w")

# Keyword selection (spinbox)
tk.Label(root, text="検索キーワード:", font=font, bg=bg_color).grid(row=1, column=0, padx=10, pady=10, sticky="e")
keyword_var = StringVar(root)
keywords = keywords_by_genre[genres[0]]
keyword_var.set(keywords[0])
keyword_spinbox = Spinbox(root, values=keywords, textvariable=keyword_var, font=font)
keyword_spinbox.grid(row=1, column=1, padx=10, pady=10, sticky="w")

# Number of result pages (spinbox)
tk.Label(root, text="ページ数:", font=font, bg=bg_color).grid(row=2, column=0, padx=10, pady=10, sticky="e")
pages_spinbox = Spinbox(root, from_=1, to=3, font=font)
pages_spinbox.grid(row=2, column=1, padx=10, pady=10, sticky="w")

# Publication year range (spinboxes)
tk.Label(root, text="発行年（開始）:", font=font, bg=bg_color).grid(row=3, column=0, padx=10, pady=10, sticky="e")
start_year_spinbox = Spinbox(root, from_=1980, to=current_year, font=font)
start_year_spinbox.grid(row=3, column=1, padx=10, pady=10, sticky="w")

tk.Label(root, text="発行年（終了）:", font=font, bg=bg_color).grid(row=4, column=0, padx=10, pady=10, sticky="e")
end_year_spinbox = Spinbox(root, from_=1980, to=current_year, font=font)
end_year_spinbox.grid(row=4, column=1, padx=10, pady=10, sticky="w")
# A spinbox starts at its from_ value; without this the default search range
# would be 1980-1980. Start the end year at the current year instead.
end_year_spinbox.delete(0, tk.END)
end_year_spinbox.insert(0, str(current_year))

# Search button
search_button = tk.Button(root, text="検索", command=on_search, bg=button_color, fg="white", font=button_font)
search_button.grid(row=5, column=0, columnspan=2, pady=10)

# Google Scholar link button
gs_link_button = tk.Button(root, text="Open Google Scholar", command=open_google_scholar, bg=button_color, fg="white", font=button_font)
gs_link_button.grid(row=6, column=0, columnspan=2, pady=10)

# History button
history_button = tk.Button(root, text="履歴を見る", command=show_history, bg=button_color, fg="white", font=button_font)
history_button.grid(row=7, column=0, columnspan=2, pady=10)

# Result display area (kept read-only except while results are being written)
text_area = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=120, height=30, font=font)
text_area.grid(row=8, column=0, columnspan=2, padx=10, pady=10)
text_area.config(state=tk.DISABLED)

# Start the main loop
root.mainloop()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up