PythonとSeleniumで始めるWebページ差分検出：基本実装から実践的な応用まで

Posted at 2024-11-17

はじめに

Webサイトの更新を自動的に検出したい、UIの変更を監視したい、といった要件を満たすために、PythonとSeleniumを使ったWebページの差分検出システムを検討してみました。

この記事では、基本的な実装から始めて、より複雑なWebサイトにも対応できる発展的な実装まで段階的に解説していきます。

必要な環境

まずは必要なライブラリをインストールしましょう：

# Selenium: drives the browser (provides the `selenium` imports used below)
pip install selenium
# webdriver-manager: provides ChromeDriverManager, used to install ChromeDriver
pip install webdriver-manager
# opencv-python: provides the `cv2` module used for image comparison
pip install opencv-python
# NumPy: array type used when stitching screenshots together
pip install numpy

基本実装：シンプルなWebページ差分検出

最も基本的な実装から見ていきましょう。この実装では：

スクリーンショットの取得
画像の比較
差分の可視化
を行います。

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import cv2
import numpy as np
from datetime import datetime
import os
import time

class WebPageDiffDetector:
    """Detect visual differences between screenshots of web pages.

    Screenshots are taken with headless Chrome through Selenium and compared
    with OpenCV. All files live below ``output_dir``: captures are written to
    ``screenshots/`` and a ``diffs/`` directory is created for result images.
    """

    def __init__(self, output_dir="diff_results"):
        """Store the output directory and create it with its subdirectories.

        Args:
            output_dir: Root directory for screenshots and diff images.
        """
        self.output_dir = output_dir
        self._create_output_dirs()

    def _create_output_dirs(self):
        """Create the output directory tree; existing directories are kept."""
        os.makedirs(self.output_dir, exist_ok=True)
        for subdir in ("screenshots", "diffs"):
            os.makedirs(os.path.join(self.output_dir, subdir), exist_ok=True)

    def setup_driver(self):
        """Create a headless Chrome WebDriver with a 1920x1080 window.

        Returns:
            A new ``webdriver.Chrome`` instance. The caller is responsible
            for calling ``quit()`` on it.
        """
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--window-size=1920,1080')
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    def capture_screenshot(self, url):
        """Open ``url`` in a fresh browser and save a screenshot of it.

        Args:
            url: Address of the page to capture.

        Returns:
            Path of the PNG written below ``<output_dir>/screenshots``.
        """
        driver = self.setup_driver()
        try:
            driver.get(url)
            # Fixed one-second pause between navigation and capture.
            time.sleep(1)
            # Microseconds are part of the name so that two captures taken
            # within the same second do not overwrite each other.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            filepath = os.path.join(
                self.output_dir,
                "screenshots",
                f"screenshot_{timestamp}.png"
            )
            driver.save_screenshot(filepath)
            return filepath
        finally:
            # Always shut the browser down, even when the capture failed.
            driver.quit()

    @staticmethod
    def _load_image(path):
        """Read an image with OpenCV, raising instead of returning ``None``.

        Raises:
            ValueError: If OpenCV cannot read ``path``.
        """
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise ValueError(f"Could not read image file: {path}")
        return image

    def compare_images(self, image1_path, image2_path, *,
                       threshold=30, min_area=100):
        """Compare two images and outline the regions that differ.

        Args:
            image1_path: Path of the reference image.
            image2_path: Path of the image to compare against the reference.
            threshold: Grayscale difference (0-255) above which a pixel is
                treated as changed.
            min_area: Contours whose area is not larger than this value are
                treated as noise and left out of the statistics.

        Returns:
            A tuple ``(result, count, total_area)``: ``result`` is a copy of
            the second image with every detected contour outlined in red
            (including those below ``min_area``), ``count`` is the number of
            contours larger than ``min_area`` and ``total_area`` is the sum
            of their areas.

        Raises:
            ValueError: If either image cannot be read.
        """
        img1 = self._load_image(image1_path)
        img2 = self._load_image(image2_path)

        # Images of different size are both resized to the smaller common
        # width and height so they can be compared pixel by pixel.
        if img1.shape != img2.shape:
            height = min(img1.shape[0], img2.shape[0])
            width = min(img1.shape[1], img2.shape[1])
            img1 = cv2.resize(img1, (width, height))
            img2 = cv2.resize(img2, (width, height))

        # Absolute grayscale difference, binarised with the given threshold.
        gray1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
        gray2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
        diff = cv2.absdiff(gray1, gray2)
        _, thresh = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)

        # Outline the outer contour of every changed region on the new image.
        contours, _ = cv2.findContours(
            thresh,
            cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_SIMPLE
        )
        result = img2.copy()
        cv2.drawContours(result, contours, -1, (0, 0, 255), 2)

        # Statistics only count regions larger than min_area (noise removal).
        areas = (cv2.contourArea(contour) for contour in contours)
        diff_areas = [area for area in areas if area > min_area]

        return result, len(diff_areas), sum(diff_areas)

基本実装の使用例

def test_basic_diff():
    """Run an end-to-end demonstration of the basic diff detection.

    Two small HTML pages that differ in the box colour and in one paragraph
    are written to the current working directory, captured with
    ``WebPageDiffDetector``, compared, and the annotated result image is
    saved below the detector's ``diffs`` directory. A summary is printed and
    the temporary HTML files are removed afterwards.
    """
    # Local import: only this demo needs pathlib.
    from pathlib import Path

    detector = WebPageDiffDetector()

    # Two test pages that differ in the box colour and the last paragraph.
    html1 = """
    <html>
        <head>
            <meta charset="utf-8">
            <style>
                body { font-family: Arial, sans-serif; padding: 20px; }
                .box { width: 200px; height: 200px; background: blue; margin: 20px; }
            </style>
        </head>
        <body>
            <h1>テストページ 1</h1>
            <div class="box"></div>
            <p>これは変更されない段落です。</p>
            <p id="change">この段落は変更されます。</p>
        </body>
    </html>
    """

    html2 = """
    <html>
        <head>
            <meta charset="utf-8">
            <style>
                body { font-family: Arial, sans-serif; padding: 20px; }
                .box { width: 200px; height: 200px; background: red; margin: 20px; }
            </style>
        </head>
        <body>
            <h1>テストページ 1</h1>
            <div class="box"></div>
            <p>これは変更されない段落です。</p>
            <p id="change">この段落は変更されました！</p>
        </body>
    </html>
    """

    # Absolute paths are fixed up front so that URL building and cleanup
    # always refer to the same files.
    base_dir = Path.cwd()
    page1 = base_dir / "test1.html"
    page2 = base_dir / "test2.html"

    try:
        page1.write_text(html1, encoding='utf-8')
        page2.write_text(html2, encoding='utf-8')

        # as_uri() yields a well-formed file:// URL on every platform and
        # percent-encodes spaces and other special characters in the path.
        file1_url = page1.as_uri()
        file2_url = page2.as_uri()

        print("\nテスト開始:")
        print("1. スクリーンショット取得中...")
        screenshot1 = detector.capture_screenshot(file1_url)
        # Pause so the two timestamp-based screenshot names cannot collide.
        time.sleep(1)
        screenshot2 = detector.capture_screenshot(file2_url)

        print("2. 差分検出実行中...")
        result_img, diff_count, diff_area = detector.compare_images(screenshot1, screenshot2)

        # Save the annotated result image.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_path = os.path.join(detector.output_dir, "diffs", f"diff_{timestamp}.png")
        cv2.imwrite(result_path, result_img)

        print("\n検出結果:")
        print(f"- 検出された差分の数: {diff_count}")
        print(f"- 差分の合計面積: {diff_area:.1f} ピクセル")
        print(f"- 結果画像: {result_path}")

        # Rate the change by its total area; report "no change" explicitly
        # instead of labelling an empty result as a small change.
        if diff_count == 0:
            print("※ 変更は検出されませんでした")
        elif diff_area > 50000:
            print("※ 大きな変更が検出されました")
        elif diff_area > 10000:
            print("※ 中程度の変更が検出されました")
        else:
            print("※ 小規模な変更が検出されました")

    finally:
        # Remove each temporary page on its own so that a failure on one
        # file does not leave the other one behind.
        for page in (page1, page2):
            try:
                page.unlink()
            except FileNotFoundError:
                pass  # the file was never created
            except OSError as e:
                print(f"ファイル削除時のエラー: {e}")

# Run the demo defined above.
test_basic_diff()

次のようなテスト用のページを比較します。

標準出力の例:

検出結果:
- 検出された差分の数: 1
- 差分の合計面積: 39601.0 ピクセル
- 結果画像: diff_results\diffs\diff_20241116_142914.png
※ 中程度の変更が検出されました

差分抽出の例:

発展実装：複雑なWebサイトへの対応

より複雑なWebサイトに対応するため、以下の機能を追加した発展実装を紹介します：

スクロール対応の全画面スクリーンショット
広告領域など特定エリアの除外機能
より柔軟な差分検出設定

class AdvancedWebPageDiffDetector(WebPageDiffDetector):
    """Diff detector with full-page capture and excluded areas.

    Extends ``WebPageDiffDetector`` with a scrolling screenshot that covers
    the whole page and with rectangular areas that are ignored when two
    images are compared.
    """

    def __init__(self, output_dir="diff_results"):
        """Initialise the base detector and start with no excluded areas.

        Args:
            output_dir: Root directory for screenshots and diff images.
        """
        super().__init__(output_dir)
        self.excluded_areas = []

    def add_excluded_area(self, x, y, width, height):
        """Register a rectangle to be ignored by ``compare_images``.

        Args:
            x: Left edge of the rectangle, in image pixels.
            y: Top edge of the rectangle, in image pixels.
            width: Width of the rectangle in pixels.
            height: Height of the rectangle in pixels.
        """
        self.excluded_areas.append({
            'x': x, 'y': y,
            'width': width, 'height': height
        })

    def compare_images(self, image1_path, image2_path, **kwargs):
        """Compare two images while ignoring the registered excluded areas.

        Without excluded areas this is exactly the base implementation.
        Otherwise the pixels of the second image are copied over the first
        image inside every excluded rectangle, so those regions contribute
        no difference, and the base comparison runs on the adjusted copies.
        Rectangle coordinates refer to the images after both have been
        brought to their common size.

        Args:
            image1_path: Path of the reference image.
            image2_path: Path of the image to compare against the reference.
            **kwargs: Passed through unchanged to the base implementation.

        Returns:
            Whatever the base ``compare_images`` returns.

        Raises:
            OSError: If a temporary adjusted image cannot be written.
        """
        if not self.excluded_areas:
            return super().compare_images(image1_path, image2_path, **kwargs)

        img1 = cv2.imread(image1_path)
        img2 = cv2.imread(image2_path)
        if img1 is None or img2 is None:
            # Unreadable input: let the base implementation report it.
            return super().compare_images(image1_path, image2_path, **kwargs)

        # Same size unification as the base class, done here first so the
        # rectangles can be applied to both images at identical coordinates.
        if img1.shape != img2.shape:
            height = min(img1.shape[0], img2.shape[0])
            width = min(img1.shape[1], img2.shape[1])
            img1 = cv2.resize(img1, (width, height))
            img2 = cv2.resize(img2, (width, height))

        for area in self.excluded_areas:
            # Clamp to non-negative, ordered bounds; a negative slice index
            # would otherwise count from the opposite edge of the image.
            left = max(0, int(area['x']))
            top = max(0, int(area['y']))
            right = max(left, int(area['x'] + area['width']))
            bottom = max(top, int(area['y'] + area['height']))
            img1[top:bottom, left:right] = img2[top:bottom, left:right]

        # The base class reads from files, so the adjusted images are written
        # next to the other temporary screenshots and removed afterwards.
        masked_paths = [
            os.path.join(
                self.output_dir, "screenshots", f"temp_masked_{index}.png"
            )
            for index in (1, 2)
        ]
        try:
            for path, image in zip(masked_paths, (img1, img2)):
                if not cv2.imwrite(path, image):
                    raise OSError(f"Could not write temporary image: {path}")
            return super().compare_images(
                masked_paths[0], masked_paths[1], **kwargs
            )
        finally:
            self._remove_files(masked_paths)

    def capture_full_page_screenshot(self, url):
        """Capture the whole page by scrolling and stitching screenshots.

        The page is scrolled one viewport at a time. The browser cannot
        scroll past the end of the page, so the last capture usually repeats
        part of the previous one; the repeated rows are measured from the
        real scroll position and cropped away when the pieces are merged.

        Args:
            url: Address of the page to capture.

        Returns:
            Path of the merged PNG below ``<output_dir>/screenshots``.

        Raises:
            RuntimeError: If the browser reports no positive viewport height.
        """
        driver = self.setup_driver()
        try:
            driver.get(url)

            # Full document height and the height of one viewport.
            total_height = driver.execute_script(
                "return Math.max("
                "document.body.scrollHeight, "
                "document.documentElement.scrollHeight"
                ");"
            )
            viewport_height = driver.execute_script(
                "return window.innerHeight"
            )
            if not viewport_height or viewport_height <= 0:
                # A zero step would make the loop below run forever.
                raise RuntimeError(
                    "Could not determine a positive viewport height"
                )

            screenshots = []
            top_skip_ratios = []
            current_position = 0

            try:
                while current_position < total_height:
                    driver.execute_script(
                        f"window.scrollTo(0, {current_position});"
                    )
                    time.sleep(0.5)

                    # Where the page really ended up; this is less than the
                    # requested position once the end of the page is reached.
                    scrolled_to = driver.execute_script(
                        "return window.pageYOffset;"
                    ) or 0

                    temp_path = os.path.join(
                        self.output_dir,
                        "screenshots",
                        f"temp_{current_position}.png"
                    )
                    driver.save_screenshot(temp_path)
                    screenshots.append(temp_path)

                    # Share of this capture (from its top) that was already
                    # covered by the previous captures.
                    overlap = max(0, current_position - scrolled_to)
                    top_skip_ratios.append(overlap / viewport_height)

                    current_position += viewport_height
            except Exception:
                # Do not leave partial captures behind when the loop fails.
                self._remove_files(screenshots)
                raise

            return self._merge_screenshots(screenshots, top_skip_ratios)

        finally:
            driver.quit()

    @staticmethod
    def _remove_files(paths):
        """Delete the given files, ignoring ones that do not exist."""
        for path in paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def _merge_screenshots(self, screenshot_paths, top_skip_ratios=None):
        """Stack screenshots vertically into one image and save it.

        Args:
            screenshot_paths: Paths of the partial screenshots, top to
                bottom. The files are deleted once they have been read.
            top_skip_ratios: Optional list, one entry per screenshot, giving
                the fraction of that screenshot's height to drop from its
                top before stacking. ``None`` keeps every screenshot whole.

        Returns:
            Path of the merged PNG below ``<output_dir>/screenshots``.

        Raises:
            ValueError: If there is nothing to merge, the two lists differ
                in length, or a screenshot cannot be read.
        """
        if not screenshot_paths:
            raise ValueError("No screenshots to merge")
        if top_skip_ratios is None:
            top_skip_ratios = [0.0] * len(screenshot_paths)
        elif len(top_skip_ratios) != len(screenshot_paths):
            raise ValueError(
                "top_skip_ratios must have one entry per screenshot"
            )

        try:
            strips = []
            for path, ratio in zip(screenshot_paths, top_skip_ratios):
                img = cv2.imread(path)
                if img is None:
                    # cv2.imread returns None when the file cannot be read.
                    raise ValueError(f"Could not read screenshot: {path}")
                rows = img.shape[0]
                skip_rows = min(rows, max(0, int(round(ratio * rows))))
                if skip_rows < rows:
                    strips.append(img[skip_rows:])
        finally:
            # The temporary files are no longer needed, whatever happened.
            self._remove_files(screenshot_paths)

        if not strips:
            raise ValueError("No screenshot content left to merge")

        total_height = sum(strip.shape[0] for strip in strips)
        max_width = max(strip.shape[1] for strip in strips)

        # Black canvas; narrower strips are left-aligned on it.
        merged = np.zeros(
            (total_height, max_width, 3),
            dtype=np.uint8
        )

        current_y = 0
        for strip in strips:
            height, width = strip.shape[:2]
            merged[current_y:current_y + height, :width] = strip
            current_y += height

        # Save the merged image.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_path = os.path.join(
            self.output_dir,
            "screenshots",
            f"full_{timestamp}.png"
        )
        cv2.imwrite(result_path, merged)
        return result_path

実践的な使用方法と注意点

基本的なWebサイトの場合

シンプルなWebサイトであれば、基本実装で十分です：

# Capture the same page twice, one minute apart, and compare the captures.
detector = WebPageDiffDetector()
screenshot1 = detector.capture_screenshot("http://example.com")
time.sleep(60)  # capture again one minute later
screenshot2 = detector.capture_screenshot("http://example.com")
# result: second image with the differences outlined;
# count / area: number and total area of the detected difference regions.
result, count, area = detector.compare_images(screenshot1, screenshot2)

複雑なWebサイトの場合

以下のようなケースでは発展実装の使用を推奨します：

長いスクロールページの場合

# Capture the whole page by scrolling; the return value is the path of the
# merged screenshot.
detector = AdvancedWebPageDiffDetector()
screenshot1 = detector.capture_full_page_screenshot("http://example.com")

広告領域を除外したい場合

detector = AdvancedWebPageDiffDetector()
detector.add_excluded_area(100, 200, 300, 250)  # exclude the ad area (x=100, y=200, width=300, height=250)

注意が必要なケース

以下のような場合は、さらなる改造が必要になる可能性があります：

動的コンテンツの多いサイト
- アニメーションや自動更新される要素がある場合
- 解決策：待機時間の調整やJavaScript実行の制御が必要
レスポンシブデザインのサイト
- 画面サイズによってレイアウトが大きく変わる場合
- 解決策：固定のビューポートサイズを設定
複雑なインタラクションが必要なサイト
- ログインが必要
- モーダルウィンドウの処理が必要
- 解決策：Seleniumの高度な操作機能の実装
非同期読み込みコンテンツ
- 遅延読み込みされる画像やコンテンツがある場合
- 解決策：明示的な待機処理の実装

まとめ

この記事では、PythonとSeleniumを使用したWebページ差分検出システムの実装方法を紹介しました。基本実装から発展実装まで、段階的に機能を追加していく方法を解説しました。

実際の運用では、対象となるWebサイトの特性に応じて適切な実装を選択し、必要に応じてカスタマイズを行うことが重要です。また、より複雑なサイトに対応する場合は、本記事で紹介した発展実装をベースに、さらなる機能追加を検討してください。