More than 1 year has passed since last update.

Pythonでブログのコメント自動収集

Posted at 2023-06-17

はじめに

Pythonによるブログのコメント収集の自動化をしました。
以前の記事で紹介した本文抽出との違いは、
アメブロのコメントがJavaScriptで動的に作成されるコンテンツであり、
htmlで読み込んでもデータを取得できないことです。

そこで、Seleniumというツールを使って、ブラウザを制御してWebページの操作をしながら情報を収集しました。

コード

必要なライブラリの読み込み

#ファイル操作
import os
import pandas as pd
import shutil
import datetime
from itertools import zip_longest

#パース解析
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time

#JavaScript対応
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

ブログのURLの取得

全記事から取得するパターン

これまでの記事でも使っています。
https://qiita.com/yaanai/items/9cc6b44b4c22850aa3bb

#URLの取得
page = 1
cnt=1
next_flg = True


#URLを格納するリスト
urls = []


while next_flg:
    list_url = 'https://ameblo.jp/pero0921/entrylist-' + str(page) + '.html'
    response = requests.get(list_url)
    html= response.text
    soup = BeautifulSoup(html, "html.parser")

    h2s = soup.find_all('h2', attrs={'data-uranus-component':'entryItemTitle'})
    
    for h2 in h2s:
        url = h2.find('a').get('href')
        urls.append("https://ameblo.jp"+url)
        print(url)
        
    time.sleep(3)    
    #次ページが存在するか確認
    page_class = len(soup.find('a', attrs={'data-uranus-component':'paginationNext'}).get('class'))
    if page_class != 3:
        next_flg = False
        break

    page += 1

収集したい記事のURLのリストから選ぶパターン

コピーして、pd.read_clipboard()で読み込んでリストへ渡すこともできる。

urls=pd.read_clipboard()
urls=urls.values.tolist()
urls=urls[0]

Seleniumの起動

# ブラウザを起動する
driver = webdriver.Edge(executable_path = r"your/folder/path/of/webdriver")

#データを格納するリスト
data=[]

コメント用の正規表現

#コメント取得用の正規表現
comment_regex=re.compile(r'<div data-uranus-component="commentText">(.*?)</div>')
commenter_regex=re.compile(r'target="_blank">(.*?)</a>')

URLごとに処理

#各URLで処理
for url in urls:

WebDriverWait：ページの読み込み(最大5s)

    try:
        driver.get(url)
        WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located)

JavaScriptの読み込み

ページをスクロールしてJavaScriptを実行させています。

        #スクロールしてコメントを読みこむ
        driver.maximize_window()
        # ページの高さを取得
        scroll_height = driver.execute_script("return document.documentElement.scrollHeight")

        # スクロール間隔と待機時間を設定
        scroll_interval = 500  # ページをスクロールするピクセル数
        wait_time = 0.5  # スクロール間隔の待機時間（秒）

        # スクロール処理
        while scroll_height > 0:
            # ページをスクロール
            driver.execute_script(f"window.scrollTo(0, {scroll_height});")

            # 待機時間
            time.sleep(wait_time)

            # ページの高さを更新
            scroll_height -= scroll_interval

htmlの解析

        # HTMLを文字コードをUTF-8に変換してから取得します。
        html = driver.page_source.encode('utf-8')
        # BeautifulSoupで扱えるようにパースします
        soup = BeautifulSoup(html, "html.parser")
        
        Date = soup.find("time",class_="skin-textQuiet").text
        Title = soup.find("a",class_="skinArticleTitle").text
        print(Title+"取り込み中")
        
        #コメント取得
        comment_source=soup.find("ul",class_="is-collapsed")
        comment_list=comment_source.find_all("li")
        Comments_list=re.findall(comment_regex,', '.join(map(str, comment_list)).replace("<br/>","").replace("<br>",""))
        Commenters_list=re.findall(commenter_regex,', '.join(map(str, comment_list)).replace("<br/>","").replace("<br>",""))
        
        for comment, commenter in zip_longest(Comments_list, Commenters_list):
            data.append([Date,Title,comment,commenter])
            
    except:
        print(str(url)+" Error")

Excelへの書き込み

# データをExcelファイルに書き込む
df = pd.DataFrame(data, columns=['Date','Title',"comments","commenter"])
df.to_excel('blog_data_comment.xlsx', index=False)

全体のコード


#ファイル操作
import os
import pandas as pd
import shutil
import datetime
from itertools import zip_longest

#パース解析
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time

#JavaScript対応
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

#ファイルの格納先
os.chdir(r"D:\Pydoc\MyPythonScripts\webScrapying\ブログ解析")

#コメント取得用の正規表現
comment_regex=re.compile(r'<div data-uranus-component="commentText">(.*?)</div>')
commenter_regex=re.compile(r'target="_blank">(.*?)</a>')

#URLの取得
page = 1
cnt=1
next_flg = True


#URLを格納するリスト
urls = []


while next_flg:
    list_url = 'https://ameblo.jp/pero0921/entrylist-' + str(page) + '.html'
    response = requests.get(list_url)
    html= response.text
    soup = BeautifulSoup(html, "html.parser")

    h2s = soup.find_all('h2', attrs={'data-uranus-component':'entryItemTitle'})
    
    for h2 in h2s:
        url = h2.find('a').get('href')
        urls.append("https://ameblo.jp"+url)
        print(url)
        
    time.sleep(3)    
    #次ページが存在するか確認
    page_class = len(soup.find('a', attrs={'data-uranus-component':'paginationNext'}).get('class'))
    if page_class != 3:
        next_flg = False
        break

    page += 1


# ブラウザを起動する
driver = webdriver.Edge(executable_path = r"your/folder/path/of/webdriver")

#データを格納するリスト
data=[]

#各URLで処理
for url in urls:
    try:
        driver.get(url)
        WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located)    
        #スクロールしてコメントを読みこむ
        driver.maximize_window()
        # ページの高さを取得
        scroll_height = driver.execute_script("return document.documentElement.scrollHeight")

        # スクロール間隔と待機時間を設定
        scroll_interval = 500  # ページをスクロールするピクセル数
        wait_time = 0.5  # スクロール間隔の待機時間（秒）

        # スクロール処理
        while scroll_height > 0:
            # ページをスクロール
            driver.execute_script(f"window.scrollTo(0, {scroll_height});")

            # 待機時間
            time.sleep(wait_time)

            # ページの高さを更新
            scroll_height -= scroll_interval


        
        # HTMLを文字コードをUTF-8に変換してから取得します。
        html = driver.page_source.encode('utf-8')
        # BeautifulSoupで扱えるようにパースします
        soup = BeautifulSoup(html, "html.parser")
        
        Date = soup.find("time",class_="skin-textQuiet").text
        Title = soup.find("a",class_="skinArticleTitle").text
        print(Title+"取り込み中")
        
        #コメント取得
        comment_source=soup.find("ul",class_="is-collapsed")
        comment_list=comment_source.find_all("li")
        Comments_list=re.findall(comment_regex,', '.join(map(str, comment_list)).replace("<br/>","").replace("<br>",""))
        Commenters_list=re.findall(commenter_regex,', '.join(map(str, comment_list)).replace("<br/>","").replace("<br>",""))
        
        for comment, commenter in zip_longest(Comments_list, Commenters_list):
            data.append([Date,Title,comment,commenter])
            
    except:
        print(str(url)+" Error")

# ブラウザを終了する
driver.quit()
        
# データをExcelファイルに書き込む
df = pd.DataFrame(data, columns=['Date','Title',"comments","commenter"])
df.to_excel('blog_data_comment.xlsx', index=False)

実行画面

ページが次々と読み込まれていきます。
進捗が分かるように、取り込み中の記事を表示するようにしています。
1記事あたり30秒だとして、500記事くらいだと、４時間くらい

実行結果

記事が作成された時刻、記事のタイトル、コメントとコメントした人の情報がExcelに出力されています。

Excel関数でlen()でコメントの文字数取得していましたが、以下のコードを追加してもいいですね。

# commentsの文字数の列を追加
df['Comments Length'] = df['comments'].apply(lambda x: len(x))

解析結果

Power BIで解析した結果
WordCloudで集計した結果
BarChartRaceで可視化した結果

代表的なブログのコメントの可視化方法

ChatGPTに聞いてみたら、結構的を射た答えが返ってきました。

コメントのトレンド: 折れ線グラフやエリアチャートを作成して、時間の経過に伴うコメント数のトレンドを視覚化します。人気のあるブログ投稿やエンゲージメントが高い期間を特定するのに役立ちます。
コメントの感情分析: Power BIの組み込み機能または外部の感情分析ツールを活用して、コメントの感情分析を実施します。積み上げ棒グラフやドーナツチャートを使用して、感情の分布を視覚化します。
トップコメント投稿者: コメント数に基づいてトップの貢献者を示すテーブルや棒グラフを作成して、最も活動的なコメント投稿者を特定します。
ワードクラウド: コメント内で最も頻繁に使用される単語をハイライトするワードクラウドの可視化を生成します。ブログで議論されている主要なトピックやテーマを把握するのに役立ちます。
コメントの長さの分布: ヒストグラムやボックスプロットを使用して、コメントの長さの分布を分析します。コメントの長さのパターンを特定し、外れ値を見つけるのに役立ちます。

感情分析は技術的に興味があるけど、リテラシーが問われる分野ですね。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up