0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

GitHubのリポジトリの構造とコードをひとつのテキストに集約する

Posted at

概要

GitHubのリポジトリの構造とコードを一つのテキストに集約して生成AIに渡したい。
以前はUithubというサービスを使っていたが、2025年11月16日時点では使えなくなっている。
今回はUithubに似たような機能を再開発する。

今回のコードでできること

URLと読み込む拡張子を編集して実行すると
image.png

ツリー構造と
image.png

拡張子に対応したファイルの中身をひとつのテキストにまとめる。
image.png

動機

  • 使えなくなったUithubの代用品を作って、利用したい

ソースコード(Pythonファイル)

import requests
from urllib.parse import urlparse

def fetch_tree(owner, repo, sha, token=None, timeout=30):
    """
    Fetch a repository's git tree recursively via the GitHub REST API.

    Args:
        owner: Repository owner, used as the first path segment of the API URL.
        repo: Repository name.
        sha: SHA of the tree/commit to list.
        token: Optional GitHub token; when given it is sent in the
            ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The value stored under the ``"tree"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the request exceeds ``timeout`` seconds.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{sha}?recursive=1"

    headers = {"Authorization": f"token {token}"} if token else {}

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    payload = r.json()

    # The API sets "truncated" when the recursive listing exceeded its size
    # limit; surface that instead of silently returning a partial tree.
    if payload.get("truncated"):
        import warnings

        warnings.warn(
            f"GitHub returned a truncated tree for {owner}/{repo}; "
            "the listing is incomplete."
        )

    return payload["tree"]


def parse_github_url(url):
    """
    Split a GitHub repository URL into ``(owner, repo, branch, path)``.

    Accepted shapes:
        * ``https://github.com/OWNER/REPO`` (optionally ending in ``.git``)
          -> branch defaults to ``"main"`` and path is ``""``.
        * ``https://github.com/OWNER/REPO/tree/BRANCH[/SUB/PATH]``

    Only the single path segment after ``/tree/`` is taken as the branch, so
    branch names that themselves contain ``/`` cannot be expressed.

    Args:
        url: The GitHub URL to parse.

    Returns:
        A 4-tuple ``(owner, repo, branch, path)`` of strings.

    Raises:
        ValueError: If the URL has fewer than two path segments, an empty
            owner or repository name, or any other unsupported layout.
    """
    p = urlparse(url)
    parts = p.path.strip("/").split("/")

    if len(parts) < 2:
        raise ValueError("URLが不正です")

    owner = parts[0]
    repo = parts[1]

    # Clone-style URLs end in ".git"; that suffix is not part of the
    # repository name expected by the API.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]

    if not owner or not repo:
        raise ValueError("URLが不正です")

    # No /tree/... component: fall back to the "main" branch.
    if len(parts) == 2:
        return owner, repo, "main", ""

    # /tree/BRANCH[/path...] component present.
    if len(parts) >= 4 and parts[2] == "tree":
        branch = parts[3]
        path = "/".join(parts[4:])
        return owner, repo, branch, path

    raise ValueError("URL形式が不正です(例: https://github.com/user/repo/tree/main/path )")


def print_tree(tree_items, base_path=""):
    """
    Print a flat list of tree entries as an indented ``tree``-style listing.

    Args:
        tree_items: Iterable of dicts, each with a ``"path"`` (slash-separated)
            and a ``"type"`` key; entries whose type is ``"tree"`` are
            descended into.
        base_path: Directory path whose children form the top level of the
            listing; ``""`` means the repository root.
    """
    from collections import defaultdict

    # Group every entry under its parent directory path ("" for the root).
    children = defaultdict(list)
    for item in tree_items:
        parent = item["path"].rpartition("/")[0]
        children[parent].append(item)

    def walk(path="", indent=""):
        items = sorted(children.get(path, []), key=lambda x: x["path"])
        for i, item in enumerate(items):
            last = (i == len(items) - 1)
            branch = "└── " if last else "├── "

            name = item["path"].rpartition("/")[2]
            print(indent + branch + name)

            if item["type"] == "tree":
                # A non-last directory keeps a vertical guide so that its
                # children are indented one level and later siblings still
                # visually connect to the parent.
                next_indent = indent + ("    " if last else "│   ")
                walk(item["path"], next_indent)

    walk(base_path)


def print_tree_to_buffer(tree_items, base_path="", buf=None):
    """
    Render a flat list of tree entries as ``tree``-style lines into a list.

    Args:
        tree_items: Iterable of dicts, each with a ``"path"`` (slash-separated)
            and a ``"type"`` key; entries whose type is ``"tree"`` are
            descended into.
        base_path: Directory path whose children form the top level of the
            listing; ``""`` means the repository root.
        buf: List to append the rendered lines to. A new list is created
            when omitted.

    Returns:
        The list that received the lines (``buf`` itself when one was given).
    """
    from collections import defaultdict

    if buf is None:
        buf = []

    # Group every entry under its parent directory path ("" for the root).
    children = defaultdict(list)
    for item in tree_items:
        parent = item["path"].rpartition("/")[0]
        children[parent].append(item)

    def walk(path="", indent=""):
        items = sorted(children.get(path, []), key=lambda x: x["path"])
        for i, item in enumerate(items):
            last = (i == len(items) - 1)
            branch = "└── " if last else "├── "
            name = item["path"].rpartition("/")[2]
            buf.append(indent + branch + name)

            if item["type"] == "tree":
                # A non-last directory keeps a vertical guide so that its
                # children are indented one level deeper than the directory.
                next_indent = indent + ("    " if last else "│   ")
                walk(item["path"], next_indent)

    walk(base_path)
    return buf


def github_tree_from_url(url, token=None, out_file="output.md", allow_exts=None, timeout=30):
    """
    Dump a GitHub repository's tree listing and file contents into one file.

    The URL is parsed, the branch is resolved to a SHA, the recursive tree is
    fetched, and the result is written to ``out_file``: first the tree
    listing, then each file's content with line numbers.

    Args:
        url: GitHub repository URL, optionally with ``/tree/BRANCH/PATH``.
        token: Optional GitHub token, sent with the API requests and the raw
            file downloads.
        out_file: Path of the UTF-8 text file to write.
        allow_exts: Extensions whose file contents should be included, with
            or without a leading dot (``[".py", "java"]``). ``None`` means
            every file is downloaded. Files with other extensions are listed
            but their contents are skipped.
        timeout: Seconds to wait for each HTTP request issued here.

    Raises:
        ValueError: If ``url`` cannot be parsed.
        requests.HTTPError: If resolving the branch or fetching the tree
            fails. Failures for individual files are written into the output
            instead of being raised.
    """
    from urllib.parse import quote

    owner, repo, branch, path = parse_github_url(url)
    headers = {"Authorization": f"token {token}"} if token else {}

    # Normalise the filter once so both "py" and ".py" are accepted.
    if allow_exts is None:
        allowed = None
    else:
        allowed = {
            e if (not e or e.startswith(".")) else "." + e
            for e in allow_exts
        }

    buf = []   # output lines

    # Resolve the branch name to a SHA.
    url_ref = f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch}"
    r = requests.get(url_ref, headers=headers, timeout=timeout)
    r.raise_for_status()
    sha = r.json()["object"]["sha"]

    # Fetch the recursive tree.
    tree = fetch_tree(owner, repo, sha, token=token)

    # Restrict to the requested sub-path. Compare on a directory boundary so
    # that "src" does not also match "src2/...".
    if path:
        prefix = path + "/"
        tree = [
            t for t in tree
            if t["path"] == path or t["path"].startswith(prefix)
        ]
        buf.append(path)
        print_tree_to_buffer(tree, base_path=path, buf=buf)
    else:
        buf.append(f"{repo}/")
        print_tree_to_buffer(tree, buf=buf)

    buf.append("\n=== File List ===")

    for item in tree:
        if item["type"] != "blob":
            continue

        file_path = item["path"]
        # Derive the extension from the file name only; a dot in a directory
        # name must not be mistaken for an extension separator.
        file_name = file_path.rpartition("/")[2]
        ext = "." + file_name.rpartition(".")[2] if "." in file_name else ""

        # Extension filter: list the file but do not download it.
        if allowed is not None and ext not in allowed:
            buf.append("\n------------------------------")
            buf.append(f"{file_path}(※中身取得しない:拡張子 {ext})")
            buf.append("------------------------------")
            continue

        # Percent-encode the path so names containing '#', '?' or spaces
        # still address the right file ('/' is left as-is by quote()).
        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{quote(file_path)}"

        buf.append("\n------------------------------")
        buf.append(file_path)
        buf.append("------------------------------")

        try:
            resp = requests.get(raw_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException as e:
            # Best effort: record the failure and continue with other files.
            buf.append(f"[ERROR] ファイル内容取得に失敗: {e}")
            continue

        for i, line in enumerate(resp.text.splitlines(), start=1):
            buf.append(f"{i:<4}| {line}")

    # Write everything out in one go.
    with open(out_file, "w", encoding="utf-8") as f:
        f.write("\n".join(buf))

    print(f"✔ 出力完了: {out_file}")

# --------------------------------------------
# Settings to edit before running:
#   url        - repository (or /tree/BRANCH/PATH) to dump
#   allow_exts - extensions whose file contents are included
# --------------------------------------------
url = "https://github.com/microsoft/markitdown"
allow_exts = [".py", ".java"]
github_tree_from_url(url, allow_exts=allow_exts)
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?