I previously used OpenRouter to build a sort of AI agent (it had simple function calling as a feature).
However, once I actually implemented it, OpenRouter turned out to be more of a black box than expected: there was no way to see exactly what text was being passed to the LLM.
So this time the goal was to render the prompt entirely on the script side using the Harmony Response Format, and only then call the LLM. Put another way, I implemented the equivalent of a chat_template myself.
As a result, I ended up using the following (a rough sketch of the rendered format follows the list):
- gpt-oss (20b), for its compatibility with the Harmony Response Format
- ollama (raw mode), because it lets me send raw text as-is
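For context, what the script ultimately sends to the model is a single Harmony-formatted text string built from special tags. The snippet below is only an illustrative sketch (the real system section rendered by openai_harmony contains more fields than shown here), but it conveys the shape the parsing code below relies on:

# Illustrative only: a simplified Harmony-style prompt. The actual output of
# render_conversation_for_completion() includes a richer system header.
prompt = (
    "<|start|>system<|message|>You are a helpful assistant.<|end|>"
    "<|start|>user<|message|>Hello!<|end|>"
    "<|start|>assistant"
)
# The model is then expected to continue with something like:
#   <|channel|>analysis<|message|>...reasoning...<|end|>
#   <|start|>assistant<|channel|>final<|message|>...the answer shown to the user...
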
Code
import argparse
import re
import sys
from typing import Any, Dict, Tuple
import httpx
from openai_harmony import (
    load_harmony_encoding,
    HarmonyEncodingName,
    Role,
    Message,
    Conversation,
    SystemContent,
    DeveloperContent,
)

OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "gpt-oss:latest" # hard-coded

# --- Harmony completion parsing ---
# gpt-oss may emit Harmony-tagged outputs like:
#   <|channel|>analysis<|message|>...<|end|><|start|>assistant<|channel|>final<|message|>...
# (the final message may also be terminated by <|return|> instead of <|end|>).
# We want to show only the final text as `ai>` while keeping the raw completion for debugging.
_FINAL_RE = re.compile(
    r"<\|start\|>assistant<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|<\|end\|>|$)",
    re.DOTALL,
)
_ANALYSIS_RE = re.compile(
    r"<\|channel\|>analysis<\|message\|>(.*?)(?:<\|end\|><\|start\|>assistant<\|channel\|>final<\|message\|>|$)",
    re.DOTALL,
)

def parse_harmony_assistant_text(text: str) -> Tuple[str, str]:
    """Return (final, analysis). If tags are absent, final=text, analysis=''."""
    final_m = _FINAL_RE.search(text)
    analysis_m = _ANALYSIS_RE.search(text)
    final = final_m.group(1).strip() if final_m else text.strip()
    analysis = analysis_m.group(1).strip() if analysis_m else ""
    return final, analysis

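# Illustrative example of the parsing above: a completion such as
#   "<|channel|>analysis<|message|>think...<|end|>"
#   "<|start|>assistant<|channel|>final<|message|>Hi!"
# yields final="Hi!" and analysis="think...".
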
def _system_content_from_text(text: str) -> SystemContent:
    sc = SystemContent.new()
    with_instructions = getattr(sc, "with_instructions", None)
    if callable(with_instructions) and text:
        return with_instructions(text)
    return sc


def _developer_content_from_text(text: str) -> DeveloperContent:
    dc = DeveloperContent.new()
    with_instructions = getattr(dc, "with_instructions", None)
    if callable(with_instructions) and text:
        return with_instructions(text)
    return dc

def build_prompt(history, enc) -> str:
    messages = []
    for msg in history:
        role = msg.get("role")
        content = msg.get("content", "")
        if role == "system":
            messages.append(
                Message.from_role_and_content(
                    Role.SYSTEM, _system_content_from_text(content))
            )
        elif role == "developer":
            messages.append(
                Message.from_role_and_content(
                    Role.DEVELOPER, _developer_content_from_text(content))
            )
        elif role == "user":
            messages.append(Message.from_role_and_content(Role.USER, content))
        elif role == "assistant":
            messages.append(Message.from_role_and_content(
                Role.ASSISTANT, content))
        else:
            messages.append(Message.from_role_and_content(
                Role.USER, str(content)))
    convo = Conversation.from_messages(messages)
    rendered = enc.render_conversation_for_completion(convo, Role.ASSISTANT)
    if isinstance(rendered, str):
        return rendered
    # If token ids are returned, try to decode via encoding hooks.
    decode = getattr(enc, "decode", None)
    if callable(decode):
        return decode(rendered)
    encoding = getattr(enc, "encoding", None)
    if encoding is not None:
        encoding_decode = getattr(encoding, "decode", None)
        if callable(encoding_decode):
            return encoding_decode(rendered)
    raise TypeError(
        "Harmony rendering returned unsupported type; can't convert to prompt string.")

def ollama_generate(prompt: str, max_tokens: int, temperature: float, top_p: float) -> str:
    url = f"{OLLAMA_BASE_URL}/api/generate"
    # raw=True disables Ollama prompt templating; you provide the *full* prompt.
    # In raw mode Ollama doesn't return "context", so history is fully yours.
    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "raw": True,
        "stream": False,
        "options": {
            "num_predict": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
        },
    }
    with httpx.Client(timeout=600.0) as client:
        r = client.post(url, json=payload)
        r.raise_for_status()
        data = r.json()
    # /api/generate returns {"response": "...", ...}
    return data.get("response", "")

def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Ollama chat using openai/harmony (raw prompt mode).")
    p.add_argument("--max-tokens", type=int, default=256,
                   help="Maximum tokens to generate.")
    p.add_argument("--temperature", type=float, default=0.4,
                   help="Sampling temperature.")
    p.add_argument("--top-p", type=float, default=1.0,
                   help="Top-p nucleus sampling.")
    p.add_argument(
        "--debug",
        action="store_true",
        help="Print the Harmony-rendered prompt and the raw Harmony completion.",
    )
    return p.parse_args()

def main() -> None:
    args = parse_args()
    # Harmony (gpt-oss) formatting rules.
    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    history = [
        {"role": "system", "content": "You are a helpful assistant."},
        # e.g. to add a developer instruction:
        # {"role": "developer", "content": "Talk like a pirate!"},
    ]
    print(f"ollama chat (model={MODEL_NAME}) (type 'exit' to quit)")

    while True:
        try:
            user_input = input("you> ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nbye")
            return
        if not user_input:
            continue
        if user_input.lower() in {"exit", "quit"}:
            print("bye")
            return
        history.append({"role": "user", "content": user_input})
        prompt = build_prompt(history, enc)
        if args.debug:
            print("\n--- HARMONY PROMPT (BEGIN) ---")
            print(prompt)
            print("--- HARMONY PROMPT (END) ---\n")
        try:
            resp = ollama_generate(
                prompt=prompt,
                max_tokens=args.max_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
            )
        except httpx.HTTPError as e:
            print(f"error: {e}", file=sys.stderr)
            # rollback last user turn on failure
            history.pop()
            continue
        final, analysis = parse_harmony_assistant_text(resp)
        if args.debug:
            print("\n--- HARMONY COMPLETION (BEGIN) ---")
            print(resp)
            print("--- HARMONY COMPLETION (END) ---\n")
            if analysis:
                print("--- PARSED ANALYSIS (BEGIN) ---")
                print(analysis)
                print("--- PARSED ANALYSIS (END) ---\n")
        history.append({"role": "assistant", "content": final})
        print(f"ai> {final}\n")

if __name__ == "__main__":
    main()
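
To try this out: install the dependencies (httpx and, assuming the PyPI package name matches the import, openai-harmony), make sure the gpt-oss model is available locally (e.g. `ollama pull gpt-oss`), and run the script with `--debug` to see both the fully rendered Harmony prompt and the raw completion before parsing.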