LoginSignup
4
5

More than 5 years have passed since last update.

LeetCodeの問題と模範解答をScrapingしてJupyter Notebookにまとめてみた(模範解答 Toggle Button付き)

Last updated at Posted at 2018-10-08

はじめに

最近、プログラミング力強化&面接対策として、LeetCodeの問題を解き始めた。模範解答としてこのRepositoryを利用させて頂いている。

Jupyter Notebook上で問題を解いていたのだが、LeetCode、模範解答Repository、Jupyter Notebook、この3つをブラウザ上で何度も行き来するのが非常に面倒だと感じ、以下の発想に至った。

問題と模範解答をScrapingして、Jupyter Notebookにまとめて、更に模範解答のCellをToggleできるボタンを配置すれば、作業が捗るのではないか?

結果

まず結果からお見せします。最終的にこんな感じのNotebookができあがりました。これで「問題の理解 → 解く → 解答確認」のサイクルがスムーズに行える!
demo.gif

Let's 実装

1. Scraping

LeetCodeはseleniumで、模範解答RepositoryはrequestsでScrapingした。

2. Jupyter Notebookの動的編集

nbformatというライブラリを使えば可能(参考記事)。以下のサンプルコードのようなイメージで、Scrapingした問題と模範解答を、markdown_cellとcode_cellに挿入していく。

create_notebook.py
import nbformat as nbf

# Build a fresh v4 notebook with three cells per problem: the problem
# statement (markdown), the model solution (code) and an empty cell to
# work in. `problems` and `solutions` are the texts scraped beforehand.
nb = nbf.v4.new_notebook()

for problem, solution in zip(problems, solutions):
    nb["cells"].extend([
        nbf.v4.new_markdown_cell(problem),
        nbf.v4.new_code_cell(solution),
        nbf.v4.new_code_cell("# Your solution"),
    ])

# Notebook files are UTF-8 JSON; set the encoding explicitly so the output
# does not depend on the platform's default locale.
with open("output.ipynb", "w", encoding="utf-8") as f:
    nbf.write(nb, f)

3. CellのToggle

Stackoverflowに既に回答が投稿されていた。
https://stackoverflow.com/a/52664156/6943581

「CellのToggleを行うリンク(ボタンに変更可)」を返す関数を定義しておいて、ToggleさせたいCellでその関数を呼び出せば、Toggle機能を実現できる。

48UMz.gif

from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    """Return an HTML snippet with a link that shows/hides a code cell's input.

    The snippet defines a JavaScript function that toggles the ``div.input``
    element of the targeted cell, plus a link that calls that function.

    Args:
        for_next: When falsy, the link toggles the currently selected code
            cell. When truthy, it toggles the cell after it, and the input of
            the current cell is hidden as soon as the script runs.

    Returns:
        An ``IPython.display.HTML`` object wrapping the script and the link.
    """
    selected = """$('div.cell.code_cell.rendered.selected')"""

    if for_next:
        selector = selected + '.next()'
        label = 'Toggle show/hide' + ' next cell'
        # Only used in "next cell" mode: hide the code that emitted the link.
        hide_self_js = selected + '.find("div.input").hide();'
    else:
        selector = selected
        label = 'Toggle show/hide'
        hide_self_js = ''

    # Random suffix so each call defines a differently named JS function
    # (presumably to avoid clashes between several toggles in one notebook).
    func_name = 'code_toggle_{}'.format(random.randint(1, 2**64))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}
            {js_hide_current}
        </script>
        <a href="javascript:{f_name}()">{toggle_text}</a>
    """
    markup = template.format(
        f_name=func_name,
        cell_selector=selector,
        js_hide_current=hide_self_js,
        toggle_text=label,
    )
    return HTML(markup)

コード

全部まとめると以下のようなコードになります。少し長いですが載せておきます。

import re
import csv
import pandas as pd
import sys
import os
import nbformat
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options


# Single Chrome WebDriver session shared by the scraping functions below.
# NOTE: it is started as soon as this module is executed; main() calls
# driver.quit() when it finishes.
driver = webdriver.Chrome()


def print_error(e):
    """Print a one-line summary of the exception currently being handled.

    The summary holds the exception type, the base name of the source file
    and the line number taken from the active traceback, followed by *e*.
    The traceback comes from ``sys.exc_info()``, so this must be called from
    inside an ``except`` block.

    Args:
        e: The caught exception (printed as the message part).
    """
    err_type, _, tb = sys.exc_info()
    source_path = tb.tb_frame.f_code.co_filename
    source_name = os.path.basename(source_path)
    print(err_type, source_name, tb.tb_lineno, e)


def create_new_notebook(fpath):
    """Create the output notebook at *fpath* as a copy of ``base.ipynb``.

    ``base.ipynb`` is read as notebook format version 4 from the current
    working directory and written out unchanged to *fpath*, overwriting any
    existing file.

    Args:
        fpath: Path of the notebook file to create.
    """
    nb = nbformat.read('base.ipynb', 4)
    # Notebook files are UTF-8 JSON; an explicit encoding keeps the write
    # independent of the platform's default locale.
    with open(fpath, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)


def update_notebook(fpath, problem, sol):
    """Add one problem (statement, solution, answer cell) to a notebook.

    Three cells are inserted just before the notebook's last cell, in this
    order: a markdown cell with *problem*, a code cell with *sol* followed
    by a ``hide_toggle(...)`` call, and a code cell holding the
    ``# Your solution`` placeholder. The file is then rewritten in place.

    Args:
        fpath: Path of an existing version-4 notebook.
        problem: Markdown/HTML text of the problem statement.
        sol: Source code of the model solution.
    """
    nb = nbformat.read(fpath, 4)
    cells = nb['cells']
    # insert(-1, ...) places each new cell before the final cell, so the
    # notebook's last cell stays last.
    cells.insert(-1, nbformat.v4.new_markdown_cell(problem))
    # The appended call is executed inside the notebook; hide_toggle must be
    # defined there (presumably by base.ipynb -- confirm).
    cells.insert(-1, nbformat.v4.new_code_cell(sol + '\n\nhide_toggle("Toggle the solution")'))
    cells.insert(-1, nbformat.v4.new_code_cell('# Your solution'))
    # Explicit UTF-8 so non-ASCII problem text is written correctly
    # regardless of the platform's default encoding.
    with open(fpath, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)


def scrape_problems():
    """Scrape the LeetCode problem list into a CSV file and a DataFrame.

    Opens the problem-set page with the shared ``driver``, selects the
    ``all`` entry of the ``select.form-control`` drop-down, and parses the
    ``div.question-list-table`` table from the page source. The first and
    last column of every row are skipped, and so is the last body row.

    Each record holds the stripped cell texts, except that the ``solution``
    column stores the absolute solution URL (or ``''`` when the cell has no
    link). Two columns are appended: ``problem_url`` (title link, ending in
    ``/description``) and ``locked`` (number of ``fa-lock`` icons in the
    title cell).

    Returns:
        pandas.DataFrame: One row per problem; columns are the lower-cased
        table headers plus ``problem_url`` and ``locked``.

    Side effects:
        Writes the same records to ``prolems.csv`` in the working directory
        (file name kept as-is for compatibility with existing output).
    """
    top_url = 'https://leetcode.com'
    page_size_selector = 'select.form-control'

    driver.get(top_url + '/problemset/all')
    WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, page_size_selector)))
    # find_element(By..., ...) works on Selenium 3 and 4; the older
    # find_element_by_css_selector helper was removed in Selenium 4.
    dropdown = driver.find_element(By.CSS_SELECTOR, page_size_selector)
    Select(dropdown).select_by_visible_text('all')

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('div', {'class': 'question-list-table'})
    header = [x.text.lower() for x in table.select('thead th')[1:-1]] + ['problem_url', 'locked']
    title_idx = header.index('title')
    solution_idx = header.index('solution')

    rows = []
    with open('prolems.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(header)
        for tr in table.select('tbody tr')[:-1]:
            row = []
            # Reset per row so a row without a title link never inherits the
            # previous row's values (or hits an unbound name on the first row).
            url, locked = '', 0
            for i, td in enumerate(tr.find_all('td')[1:-1]):
                link = td.find('a')
                if i == solution_idx:
                    row.append(top_url + link['href'] if link is not None else '')
                    continue
                row.append(td.text.strip())
                if i == title_idx:
                    if link is not None:
                        url = top_url + link['href']
                        if not url.endswith('/description'):
                            url += '/description'
                    locked = len(td.find_all('i', {'class': 'fa-lock'}))
            row.extend([url, locked])
            rows.append(row)
            writer.writerow(row)
    return pd.DataFrame(rows, columns=header)


def scrape_description(url):
    """Return the inner HTML of the problem description found at *url*.

    Loads the page with the shared ``driver`` and waits up to 3 seconds for
    an element with class ``question-content``. If the wait succeeds, the
    description is taken from the ``div`` whose class matches
    ``^question-description``. If the wait raises, the ``div`` matching
    ``content-wrapper`` is used instead.

    Args:
        url: Address of the problem's description page.

    Returns:
        str: HTML contents of the first ``div`` nested inside the matched
        container.
    """
    driver.get(url)

    locator = (By.CLASS_NAME, 'question-content')
    class_regex = r'^question-description'
    try:
        WebDriverWait(driver, 3).until(EC.presence_of_element_located(locator))
    except Exception:
        # Expected element did not appear: fall back to the other layout.
        class_regex = r'content-wrapper'

    soup = BeautifulSoup(driver.page_source, 'lxml')
    container = soup.find('div', {'class': re.compile(class_regex)})
    return container.find('div').decode_contents()


def format_description(desc):
    """Rewrite ``<pre>`` blocks of a description as ``<div class="example">``.

    After the tags are swapped, a newline directly in front of any
    ``</div>`` is removed.

    Args:
        desc: HTML text of a problem description.

    Returns:
        str: The converted HTML.
    """
    substitutions = (
        ('<pre>', '<div class="example">'),
        ('</pre>', '</div>'),
        ('\n</div>', '</div>'),
    )
    formatted = desc
    for old, new in substitutions:
        formatted = formatted.replace(old, new)
    return formatted


def scrape_solution(title='two-sum', timeout=10):
    """Download the model solution for a problem from the kamyu104 repository.

    Args:
        title: Problem slug used as the file name (``<title>.py``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        str: The solution source, or ``''`` when the response status is not
        200 or the request fails (the failure is reported via
        ``print_error``).
    """
    url = 'https://raw.githubusercontent.com/kamyu104/LeetCode/master/Python/{}.py'.format(title)
    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts alike.
        print_error(e)
        return ''
    return r.text if r.status_code == 200 else ''


def format_solution(sol):
    """Strip descriptive header comments from a solution's source code.

    Lines that start with ``#`` in the first column are removed unless they
    contain ``Time:`` or ``Space:``. Indented comments are left alone. The
    line ``from __future__ import print_function`` is dropped as well.

    Args:
        sol: Source code of a model solution.

    Returns:
        str: The cleaned source code.
    """
    comment_line = re.compile(r'(?m)^# *((?!(Time:|Space:)).)*$\n?')
    without_comments = comment_line.sub('', sol)
    return without_comments.replace('from __future__ import print_function\n', '')


def main():
    """Build ``leetcode_easy.ipynb`` from the scraped "Easy" problems.

    Creates the notebook from the base template, scrapes the problem list,
    and for every unlocked problem of the chosen difficulty appends its
    description and model solution to the notebook. Any exception is
    reported through ``print_error``. The shared browser session is always
    closed, including when creating the notebook fails.
    """
    difficulty = 'Easy'
    fpath = 'leetcode_{}.ipynb'.format(difficulty.lower())
    try:
        # Inside the try block so that a failure here (e.g. a missing
        # base.ipynb) still reaches driver.quit() below.
        create_new_notebook(fpath)
        df = scrape_problems()
        df = df[df['difficulty'] == difficulty]
        cols = ['title', 'problem_url', 'difficulty', 'locked']
        for index, (title, problem_url, level, locked) in df[cols].iterrows():
            if locked:
                continue
            desc = scrape_description(problem_url)
            desc = format_description(desc)
            sol = scrape_solution(title.lower().replace(' ', '-'))
            sol = format_solution(sol)
            # Horizontal rule, then a linked heading, then the description.
            heading = '---\n## [{}. {} ({})]({})'.format(
                index + 1, title, level.capitalize(), problem_url)
            problem = '\n'.join([heading, desc])
            update_notebook(fpath, problem, sol)
            print(index, title, problem_url)
    except Exception as e:
        print_error(e)
    finally:
        driver.quit()


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

得られた知見

  • Jupyter Notebookを動的に編集する方法:非常にお手軽で驚いた。是非、有効活用していきたい。
  • CellのToggle機能:教育資料等を作る際に役立ちそう。

最後に

GitHubにあげたので、もし興味がありましたら、遊んでみてください!
https://github.com/harupy/leetcode-nb

4
5
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
5