I previously used OpenRouter to build a sort of AI agent (it had simple function calling as a feature).
However, once I actually implemented it, OpenRouter turned out to be more of a black box than expected: there was no way to see exactly what text was being passed to the LLM.
So this time the goal was to render the prompt entirely on the script side using the Harmony Response Format, and only then call the LLM. Put another way, I implemented the equivalent of a chat_template myself.
As a result, I ended up using the following (a rough sketch of the rendered format follows the list):
- gpt-oss (20b), for its compatibility with the Harmony Response Format
- ollama (raw mode), because it lets me send raw text as-is
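For context, what the script ultimately sends to the model is a single Harmony-formatted text string built from special tags. The snippet below is only an illustrative sketch (the real system section rendered by openai_harmony contains more fields than shown here), but it conveys the shape the parsing code below relies on:

# Illustrative only: a simplified Harmony-style prompt. The actual output of
# render_conversation_for_completion() includes a richer system header.
prompt = (
    "<|start|>system<|message|>You are a helpful assistant.<|end|>"
    "<|start|>user<|message|>Hello!<|end|>"
    "<|start|>assistant"
)
# The model is then expected to continue with something like:
#   <|channel|>analysis<|message|>...reasoning...<|end|>
#   <|start|>assistant<|channel|>final<|message|>...the answer shown to the user...
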
Code
import argparse
import re
import sys
from typing import Any, Dict, Tuple
import httpx
from openai_harmony import (
    load_harmony_encoding,
    HarmonyEncodingName,
    Role,
    Message,
    Conversation,
    SystemContent,
    DeveloperContent,
)

OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "gpt-oss:latest" # hard-coded

# --- Harmony completion parsing ---
# gpt-oss may emit Harmony-tagged outputs like:
#   <|channel|>analysis<|message|>...<|end|><|start|>assistant<|channel|>final<|message|>...
# (the final message may also be terminated by <|return|> instead of <|end|>).
# We want to show only the final text as `ai>` while keeping the raw completion for debugging.
_FINAL_RE = re.compile(
    r"<\|start\|>assistant<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|<\|end\|>|$)",
    re.DOTALL,
)
_ANALYSIS_RE = re.compile(
    r"<\|channel\|>analysis<\|message\|>(.*?)(?:<\|end\|><\|start\|>assistant<\|channel\|>final<\|message\|>|$)",
    re.DOTALL,
)

def parse_harmony_assistant_text(text: str) -> Tuple[str, str]:
    """Return (final, analysis). If tags are absent, final=text, analysis=''."""
    final_m = _FINAL_RE.search(text)
    analysis_m = _ANALYSIS_RE.search(text)
    final = final_m.group(1).strip() if final_m else text.strip()
    analysis = analysis_m.group(1).strip() if analysis_m else ""
    return final, analysis

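# Illustrative example of the parsing above: a completion such as
#   "<|channel|>analysis<|message|>think...<|end|>"
#   "<|start|>assistant<|channel|>final<|message|>Hi!"
# yields final="Hi!" and analysis="think...".
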
def _system_content_from_text(text: str) -> SystemContent:
    sc = SystemContent.new()
    with_instructions = getattr(sc, "with_instructions", None)
    if callable(with_instructions) and text:
        return with_instructions(text)
    return sc


def _developer_content_from_text(text: str) -> DeveloperContent:
    dc = DeveloperContent.new()
    with_instructions = getattr(dc, "with_instructions", None)
    if callable(with_instructions) and text:
        return with_instructions(text)
    return dc

def build_prompt(history, enc) -> str:
    messages = []
    for msg in history:
        role = msg.get("role")
        content = msg.get("content", "")
        if role == "system":
            messages.append(
                Message.from_role_and_content(
                    Role.SYSTEM, _system_content_from_text(content))
            )
        elif role == "developer":
            messages.append(
                Message.from_role_and_content(
                    Role.DEVELOPER, _developer_content_from_text(content))
            )
        elif role == "user":
            messages.append(Message.from_role_and_content(Role.USER, content))
        elif role == "assistant":
            messages.append(Message.from_role_and_content(
                Role.ASSISTANT, content))
        else:
            messages.append(Message.from_role_and_content(
                Role.USER, str(content)))
    convo = Conversation.from_messages(messages)
    rendered = enc.render_conversation_for_completion(convo, Role.ASSISTANT)
    if isinstance(rendered, str):
        return rendered
    # If token ids are returned, try to decode via encoding hooks.
    decode = getattr(enc, "decode", None)
    if callable(decode):
        return decode(rendered)
    encoding = getattr(enc, "encoding", None)
    if encoding is not None:
        encoding_decode = getattr(encoding, "decode", None)
        if callable(encoding_decode):
            return encoding_decode(rendered)
    raise TypeError(
        "Harmony rendering returned unsupported type; can't convert to prompt string.")

def ollama_generate(prompt: str, max_tokens: int, temperature: float, top_p: float) -> str:
    url = f"{OLLAMA_BASE_URL}/api/generate"
    # raw=True disables Ollama prompt templating; you provide the *full* prompt.
    # In raw mode Ollama doesn't return "context", so history is fully yours.
    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "raw": True,
        "stream": False,
        "options": {
            "num_predict": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
        },
    }
    with httpx.Client(timeout=600.0) as client:
        r = client.post(url, json=payload)
        r.raise_for_status()
        data = r.json()
    # /api/generate returns {"response": "...", ...}
    return data.get("response", "")

def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Ollama chat using openai/harmony (raw prompt mode).")
    p.add_argument("--max-tokens", type=int, default=256,
                   help="Maximum tokens to generate.")
    p.add_argument("--temperature", type=float, default=0.4,
                   help="Sampling temperature.")
    p.add_argument("--top-p", type=float, default=1.0,
                   help="Top-p nucleus sampling.")
    p.add_argument(
        "--debug",
        action="store_true",
        help="Print the Harmony-rendered prompt and the raw Harmony completion.",
    )
    return p.parse_args()

def main() -> None:
    args = parse_args()
    # Harmony (gpt-oss) formatting rules.
    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    history = [
        {"role": "system", "content": "You are a helpful assistant."},
        # e.g. to add a developer instruction:
        # {"role": "developer", "content": "Talk like a pirate!"},
    ]
    print(f"ollama chat (model={MODEL_NAME}) (type 'exit' to quit)")

    while True:
        try:
            user_input = input("you> ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nbye")
            return
        if not user_input:
            continue
        if user_input.lower() in {"exit", "quit"}:
            print("bye")
            return
        history.append({"role": "user", "content": user_input})
        prompt = build_prompt(history, enc)
        if args.debug:
            print("\n--- HARMONY PROMPT (BEGIN) ---")
            print(prompt)
            print("--- HARMONY PROMPT (END) ---\n")
        try:
            resp = ollama_generate(
                prompt=prompt,
                max_tokens=args.max_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
            )
        except httpx.HTTPError as e:
            print(f"error: {e}", file=sys.stderr)
            # rollback last user turn on failure
            history.pop()
            continue
        final, analysis = parse_harmony_assistant_text(resp)
        if args.debug:
            print("\n--- HARMONY COMPLETION (BEGIN) ---")
            print(resp)
            print("--- HARMONY COMPLETION (END) ---\n")
            if analysis:
                print("--- PARSED ANALYSIS (BEGIN) ---")
                print(analysis)
                print("--- PARSED ANALYSIS (END) ---\n")
        history.append({"role": "assistant", "content": final})
        print(f"ai> {final}\n")

if __name__ == "__main__":
    main()
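
To try this out: install the dependencies (httpx and, assuming the PyPI package name matches the import, openai-harmony), make sure the gpt-oss model is available locally (e.g. `ollama pull gpt-oss`), and run the script with `--debug` to see both the fully rendered Harmony prompt and the raw completion before parsing.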