Pythonでなろう系小説をテキストで自動ダウンロードしてみた

Posted at 2024-10-04

~~会社で暇だったので作った。~~

めんどくさそうだなと思ったけど、BeautifulSoup4っていうのを使ったら割と簡単に実装できた。

narou_downloader_bs.py

import os
import requests
from bs4 import BeautifulSoup

# Site root; the relative hrefs scraped from the pages are appended to this.
BASE_URL = 'https://ncode.syosetu.com'
# Change this to the index page of the novel you want to download.
# (The identifier's spelling is kept as-is because the script refers to it.)
DONWLOAD_URL = 'https://ncode.syosetu.com/n8440fe/'


def fetch_url(url, timeout=30):
    """Send a GET request to ``url`` with a desktop-Chrome User-Agent header.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so the body is not the page the caller expects to parse.
        requests.exceptions.Timeout: The server did not respond in time.
    """
    # Presumably the site rejects the default python-requests User-Agent;
    # verify before removing this header.
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    headers = {'User-Agent': ua}
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly here instead of letting callers parse an error page.
    res.raise_for_status()
    return res

# ---------------- main ----------------
url = DONWLOAD_URL
sublist = []

# Walk the (possibly paginated) table of contents and collect the link tag
# of every episode into ``sublist``.
while True:
    res = fetch_url(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Novel title, read from the page's <title> element.
    title_text = soup.find('title').get_text()

    # Episode link tags listed on this page of the index.
    sublist.extend(soup.select('.p-eplist__sublist .p-eplist__subtitle'))

    # NOTE: when a novel has many episodes the index shows a "next" pager item.
    # It carries an href only while there is a following page.
    next_link = soup.select_one('.c-pager__item--next')
    link = next_link.get('href') if next_link is not None else None
    if link is None:
        break

    # Follow the "next" link on the following iteration.
    url = f'{BASE_URL}{link}'

# Create the folder that will hold the novel's text files.
# Path separators and characters reserved on common file systems are replaced
# with '_' so the title always maps to a single, valid directory name; the
# cleaned title is stored back so later path building uses the same name.
title_text = title_text.translate(
    str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
).strip()
# exist_ok=True makes re-running the script against the same novel a no-op here.
os.makedirs(f'./{title_text}', exist_ok=True)

# Download every collected episode and save its body as a text file.
sub_len = len(sublist)
# Path separators and characters reserved on common file systems; an episode
# title containing one of them would otherwise make open() fail.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

for i, sub in enumerate(sublist, start=1):
    sub_title = sub.text.replace('\n', '')
    link = sub.get('href')

    res = fetch_url(f'{BASE_URL}{link}')
    soup = BeautifulSoup(res.text, 'html.parser')
    body = soup.select_one('.p-novel__body')
    if body is None:
        # The page has no body element (layout change, error page, ...):
        # report it and keep going rather than crash on ``None.text``.
        print(f'{sub_title} : body not found, skipped ({i}/{sub_len})')
        continue
    sub_body_text = body.text

    # Write the episode text; the context manager closes the file even if
    # the write raises.
    file_name = sub_title.translate(unsafe_chars)
    with open(f'./{title_text}/{file_name}.txt', 'w', encoding='UTF-8') as f:
        f.write(sub_body_text)

    # Progress: episode title and position in the list.
    print(f'{sub_title} ({i}/{sub_len})')

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up