GitHubのPull RequestのFiles ChangedをMarkdown形式Diffに整形するPythonコード

Posted at 2025-08-03

今回のコードでできること

GitHubのPull RequestのFiles Changedを

このようなMarkdown形式Diffに整形

動機

生成AIを利用したレビュー時、必要な差分のみをテキスト化して渡したい
Files Changed画面上の差分を、差分箇所がわかる形で複数ファイル分まとめて取得したい

使い方

Files Changed画面で「名前を付けて保存」を行う（保存前に、折りたたまれている差分は「Load diff」で展開しておく）
"GitDiffToMd.py"ファイルを作成してエディタでソースコードを貼り付ける
"GitDiffToMd.py"をクリックして、指示に従って保存したHTMLファイルを指定する
"diff.md"にテキスト化した結果が入っているので、生成AIへの質問等で活用する

ソースコード（Pythonファイル）

import os
from html import escape
from html.parser import HTMLParser

class CopilotDiffEntryExtractor(HTMLParser):
    """Collect the markup of each ``<copilot-diff-entry>`` element that names a file.

    Only elements carrying a ``data-file-path`` attribute are captured.  While
    inside such an element every start tag, end tag, self-closing tag and text
    node is re-serialized; when the element closes, the joined markup and the
    element's attribute dict are appended to ``target_entries`` as a
    ``(html, attrs)`` tuple.

    The parser hands text and attribute values over already entity-decoded, so
    they are escaped again on output.  Without that, source text such as
    ``List<String>`` would be read as markup when the captured HTML is parsed
    a second time and would disappear from the result.
    """

    def __init__(self):
        super().__init__()
        # True while inside a <copilot-diff-entry> that has data-file-path.
        self.in_target = False
        # Completed entries: list of (serialized_html, attribute_dict).
        self.target_entries = []
        # Markup fragments of the entry currently being captured.
        self.current_entry = []
        # Attributes of the entry currently being captured.
        self.attrs = {}

    @staticmethod
    def _render_tag(tag, attrs, self_closing=False):
        """Serialize a start (or self-closing) tag from its parsed name and attributes."""
        parts = [tag]
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. ``hidden``): emit the bare name
                # instead of the misleading ``name="None"``.
                parts.append(name)
            else:
                parts.append(f'{name}="{escape(value, quote=True)}"')
        return f"<{' '.join(parts)}{'/' if self_closing else ''}>"

    def handle_starttag(self, tag, attrs):
        """Start capturing at a qualifying entry tag and record tags while capturing."""
        if tag == "copilot-diff-entry":
            attr_dict = dict(attrs)
            if "data-file-path" in attr_dict:
                self.in_target = True
                self.current_entry = []
                self.attrs = attr_dict

        if self.in_target:
            self.current_entry.append(self._render_tag(tag, attrs))

    def handle_endtag(self, tag):
        """Record end tags while capturing; finish the entry when its element closes."""
        if not self.in_target:
            # An end tag outside a captured entry (including the end of a
            # <copilot-diff-entry> without data-file-path) must not produce
            # an empty entry.
            return
        self.current_entry.append(f"</{tag}>")
        if tag == "copilot-diff-entry":
            self.target_entries.append(("".join(self.current_entry), self.attrs))
            self.in_target = False
            self.current_entry = []
            self.attrs = {}

    def handle_data(self, data):
        """Record text while capturing, re-escaping ``&``, ``<`` and ``>``."""
        if self.in_target:
            self.current_entry.append(escape(data, quote=False))

    def handle_startendtag(self, tag, attrs):
        """Record self-closing tags (``<br/>`` style) while capturing."""
        if self.in_target:
            self.current_entry.append(self._render_tag(tag, attrs, self_closing=True))

class GithubDiffHTMLParser(HTMLParser):
    """Convert diff table cells into unified-diff style text lines.

    Every ``<td>`` found inside a ``<tr>`` is classified by its CSS classes when
    the cell closes:

    * ``blob-code-addition`` -> the cell text prefixed with ``+``
    * ``blob-code-deletion`` -> the cell text prefixed with ``-``
    * ``blob-code-context`` or ``blob-code-inner`` -> prefixed with a space

    Cells with none of these classes are ignored.  The resulting lines are
    collected, in document order, in ``lines``.
    """

    def __init__(self):
        super().__init__()
        # True between <tr> and </tr>.
        self.in_tr = False
        # True between a <td> (inside a row) and its </td>.
        self.in_td = False
        # CSS classes of the cell currently being read.
        self.td_class = []
        # Text accumulated for the cell currently being read.
        self.current_code = ""
        # Output diff lines.
        self.lines = []

    def handle_starttag(self, tag, attrs):
        """Track entry into rows and cells and remember the cell's classes."""
        if tag == "tr":
            self.in_tr = True
        elif tag == "td" and self.in_tr:
            self.in_td = True
            self.td_class = []
            for name, value in attrs:
                if name == "class":
                    # A bare ``class`` attribute is reported with value None;
                    # treat it as "no classes" instead of crashing on split().
                    self.td_class = (value or "").split()

    def handle_endtag(self, tag):
        """Emit a diff line when a classified cell closes; reset per-cell state."""
        if tag == "tr":
            self.in_tr = False
        elif tag == "td":
            if self.in_td:
                # A cell is one diff line: drop line breaks inside its text.
                code = self.current_code.replace('\n', '').replace('\r', '')
                if "blob-code-addition" in self.td_class:
                    self.lines.append(f"+{code}")
                elif "blob-code-deletion" in self.td_class:
                    self.lines.append(f"-{code}")
                elif "blob-code-context" in self.td_class or "blob-code-inner" in self.td_class:
                    self.lines.append(f" {code}")
                # Text of unclassified cells is discarded here.
                self.current_code = ""
                self.td_class = []
            self.in_td = False

    def handle_data(self, data):
        """Accumulate text, including text of nested elements, for the open cell."""
        if self.in_td:
            self.current_code += data

def github_diff_html_to_markdown_diff(html: str) -> str:
    """Return the diff lines found in *html* wrapped in a Markdown ``diff`` code fence.

    The HTML is fed to ``GithubDiffHTMLParser``; the lines it collects are
    joined with newlines and placed between the opening and closing fence.
    """
    diff_parser = GithubDiffHTMLParser()
    diff_parser.feed(html)
    fenced_body = "\n".join(diff_parser.lines)
    return f"```diff\n{fenced_body}\n```"

def main():
    """Interactively convert a saved HTML page into ``diff.md``.

    Prompts for the path of an HTML file, extracts every captured
    ``<copilot-diff-entry>`` from it, and writes one Markdown section per entry
    (a ``## <data-file-path>`` heading followed by a fenced diff) to
    ``diff.md`` in the current working directory.  The generated Markdown and
    the number of entries are printed, then the function waits for Enter.

    If the given path is not an existing file, an error message is written to
    ``diff.md`` and printed, and the function returns without converting.
    """
    # diff.md is always created in the current working directory.
    output_file = os.path.join(os.getcwd(), "diff.md")

    # Pasted paths may carry surrounding whitespace or quotes; strip them so
    # the existence check below sees the real path.
    html_path = input("HTMLファイルを入力してください: ").strip().strip("\"'")

    if not os.path.isfile(html_path):
        with open(output_file, "w", encoding="utf-8") as file_output:
            file_output.write("指定されたHTMLファイルは存在しません。\n")
        print("指定されたHTMLファイルは存在しません。")
        return

    with open(html_path, encoding="utf-8") as f:
        html = f.read()

    # Pull out the per-file entries (markup plus attributes).
    entry_extractor = CopilotDiffEntryExtractor()
    entry_extractor.feed(html)
    file_entries = entry_extractor.target_entries

    # Build the whole document in memory so it can be both saved and shown
    # without reading the file back.
    sections = []
    for entry_html, attrs in file_entries:
        data_file_path = attrs.get("data-file-path", "")
        sections.append(f"## {data_file_path}\n")
        sections.append(github_diff_html_to_markdown_diff(entry_html))
        sections.append("\n\n")
    contents = "".join(sections)

    with open(output_file, "w", encoding="utf-8") as file_output:
        file_output.write(contents)

    # Show the generated Markdown and a summary.
    print(contents)
    print(f"{len(file_entries)}個のdiffを保存")

    # Wait for Enter before returning.
    input("続行するには何かキーを押してください...")

# Run the interactive converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up