[備忘録] Google Gemini TTS (プレビュー版) で料理オーダー情報を音声化してみた

Last updated at 2025-05-27 / Posted at 2025-05-27

はじめに

レストランやカフェでの注文確認や厨房への指示を音声で自動化できたら便利ですよね。Google Gemini 2.5 Flash TTS (プレビュー版) を使って、料理のオーダー情報から自然な音声を生成するシステムを作ってみました。

使用技術

Google Gemini 2.5 Flash (TTS機能) プレビュー版
Python
wave モジュール（音声ファイル保存用）

前提条件

Google AI Studio のAPIキーが必要

Google Colabでの実行

Google Colabを使用する場合は、以下のコードでセットアップできます：


# Configure the API key (using Colab's Secrets feature is recommended)
import os
from google.colab import userdata

# Read the key from the Colab secret named 'GEMINI_API_KEY'
api_key = userdata.get('GEMINI_API_KEY')

# Alternatively, set it directly (not recommended)
# api_key = "your-api-key-here"

# Export the key as an environment variable; generate_speech reads it
# back with os.getenv('GEMINI_API_KEY').
os.environ['GEMINI_API_KEY'] = api_key

基本実装

1. 基本的な音声生成関数

from google import genai
from google.genai import types
import wave
import os

def create_wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio bytes to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Path of the WAV file to write.
        pcm_data: Raw PCM frames as a bytes-like object.
        channels: Number of audio channels.
        rate: Frame (sampling) rate.
        sample_width: Sample width in bytes.
    """
    # Tuple layout expected by setparams:
    # (nchannels, sampwidth, framerate, nframes, comptype, compname).
    # nframes starts at 0; the header is fixed up when the frames are written.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as wav_out:
        wav_out.setparams(header)
        wav_out.writeframes(pcm_data)

def generate_speech(text, voice_name='Kore', output_file='output.wav'):
    """Synthesize ``text`` with Gemini TTS and save the audio as a WAV file.

    Args:
        text: Text (optionally including a style instruction) to read aloud.
        voice_name: Name of the prebuilt voice to use.
        output_file: Path of the WAV file to write.

    Returns:
        ``output_file``, after the audio has been written to it.

    Raises:
        RuntimeError: If the response carries no inline audio data.
    """
    client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))

    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        )
    )

    # Candidates, content, parts and inline_data can each be missing on the
    # response object, so walk them defensively instead of indexing blindly;
    # otherwise an empty response surfaces as an opaque IndexError /
    # AttributeError far from its cause.
    audio_data = None
    candidates = response.candidates or []
    content = candidates[0].content if candidates else None
    if content is not None:
        for part in content.parts or []:
            inline = getattr(part, "inline_data", None)
            if inline is not None and inline.data:
                audio_data = inline.data
                break

    if audio_data is None:
        raise RuntimeError(
            "Gemini TTS response did not contain any audio data "
            f"(voice_name={voice_name!r})"
        )

    create_wave_file(output_file, audio_data)
    print(f"音声ファイルを保存しました: {output_file}")
    return output_file

2. 料理オーダー情報の構造化

from dataclasses import dataclass
from typing import List, Optional
from datetime import datetime

@dataclass
class OrderItem:
    """One line of a customer's order."""
    # Dish name, read aloud as-is in the announcements.
    name: str
    # Number of units ordered; announced as "<quantity>個".
    quantity: int
    # NOTE(review): the sample orders in this file total `price` without
    # multiplying by quantity — confirm whether this is a unit price or a
    # line total.
    price: int
    # Option labels; when present they are joined with ", " in announcements.
    options: Optional[List[str]] = None
    # Free-text request; when set it is read out right after the item.
    special_request: Optional[str] = None

@dataclass
class CustomerOrder:
    """An order placed at one table, with the data needed for announcements."""
    # Identifier; spoken in the kitchen announcement and embedded in the
    # generated audio file name.
    order_id: str
    # Table number; spoken in every announcement type.
    table_number: int
    # Customer name; omitted from announcements when None or empty.
    customer_name: Optional[str]
    # Ordered items, announced in list order.
    items: List[OrderItem]
    # Amount read out as "<total_amount>円" in the confirmation; stored as
    # given, not computed from `items`.
    total_amount: int
    # Time the order was placed.
    order_time: datetime
    # Order status; defaults to "received".
    status: str = "received"

# Sample order used by the examples.
# NOTE(review): total_amount (2100) equals the sum of the `price` fields
# without multiplying by quantity — confirm that this is intended.
sample_order = CustomerOrder(
    order_id="ORDER_001",
    table_number=5,
    customer_name="田中様",
    items=[
        OrderItem(name="ハンバーガーセット", quantity=2, price=1200, options=["チーズ追加", "ポテトLサイズ"]),
        OrderItem(name="コーヒー", quantity=2, price=400),
        OrderItem(name="チーズケーキ", quantity=1, price=500, special_request="温めてください"),
    ],
    total_amount=2100,
    order_time=datetime.now(),
)

3. オーダー情報から音声テキストを生成

def create_order_announcement(order: CustomerOrder, announcement_type: str = "confirmation"):
    """Build the text to be spoken for an order.

    Args:
        order: Order whose table, customer, items and total are read out.
        announcement_type: One of ``"confirmation"`` (read-back for the
            customer), ``"kitchen"`` (instructions for the kitchen) or
            ``"ready"`` (notice that the food is ready).

    Returns:
        The announcement text as a single string.

    Raises:
        ValueError: If ``announcement_type`` is not a supported value.
    """
    # Fragments are collected here and joined once at the end.
    parts = []

    if announcement_type == "confirmation":
        # Order read-back addressed to the customer.
        parts.append("ご注文を確認いたします。")
        if order.customer_name:
            parts.append(f"{order.customer_name}、")
        parts.append(f"テーブル{order.table_number}番のお客様。")

        for item in order.items:
            parts.append(f"{item.name}を{item.quantity}個、")
            if item.options:
                parts.append(f"オプション：{', '.join(item.options)}、")
            if item.special_request:
                parts.append(f"ご要望：{item.special_request}、")

        parts.append(f"合計金額は{order.total_amount}円です。ありがとうございます。")

    elif announcement_type == "kitchen":
        # Instructions addressed to the kitchen.
        parts.append(
            f"厨房への連絡です。テーブル{order.table_number}番、オーダー番号{order.order_id}。"
        )

        for item in order.items:
            parts.append(f"{item.name}、{item.quantity}個。")
            if item.options:
                parts.append(f"オプション：{', '.join(item.options)}。")
            if item.special_request:
                parts.append(f"特別な要望：{item.special_request}。")

        parts.append("以上です。")

    elif announcement_type == "ready":
        # Notice that the food is ready to be served.
        parts.append(f"お待たせいたしました。テーブル{order.table_number}番")
        if order.customer_name:
            parts.append(f"、{order.customer_name}")
        parts.append("のお客様。ご注文のお料理が完成いたしました。お席までお持ちいたします。")

    else:
        # Fail with a clear message instead of hitting an unbound local at
        # the return statement.
        raise ValueError(
            f"Unsupported announcement_type: {announcement_type!r} "
            "(expected 'confirmation', 'kitchen' or 'ready')"
        )

    return "".join(parts)

def generate_order_audio(order: CustomerOrder, announcement_type: str = "confirmation", voice_name: str = 'Kore'):
    """Generate a WAV file announcing ``order``.

    Args:
        order: Order to announce.
        announcement_type: Kind of announcement, passed through to
            create_order_announcement.
        voice_name: Prebuilt voice passed to generate_speech. Defaults to
            'Kore', the voice this function previously hard-coded.

    Returns:
        The value returned by generate_speech for the file
        ``order_<order_id>_<announcement_type>.wav``.
    """
    text = create_order_announcement(order, announcement_type)
    filename = f"order_{order.order_id}_{announcement_type}.wav"
    return generate_speech(text, voice_name=voice_name, output_file=filename)

実際の使用例

基本的な使い方

# Generate the order-confirmation audio
confirmation_file = generate_order_audio(sample_order, "confirmation")

# Generate the kitchen-instruction audio
kitchen_file = generate_order_audio(sample_order, "kitchen")

# Generate the "your order is ready" audio
ready_file = generate_order_audio(sample_order, "ready")

# Play the audio. play_audio is defined in the "音声の再生" section further
# down and must be defined before this line runs; it displays an IPython
# Audio player and does not detect the runtime environment.
play_audio(confirmation_file)

Google Colabでの実行例

# 1. Define all functions (copy the code above, plus play_audio from the
#    "音声の再生" section below)

# 2. Build a sample order to generate audio from
sample_order = CustomerOrder(
    order_id="ORDER_001",
    table_number=5,
    customer_name="田中様",
    items=[
        OrderItem(name="ハンバーガーセット", quantity=2, price=1200),
        OrderItem(name="コーヒー", quantity=1, price=400)
    ],
    total_amount=1600,
    order_time=datetime.now()
)

# 3. Generate the audio files and play them
print("注文確認音声を生成中...")
confirmation_file = generate_order_audio(sample_order, "confirmation")
play_audio(confirmation_file)

print("厨房指示音声を生成中...")
kitchen_file = generate_order_audio(sample_order, "kitchen")
play_audio(kitchen_file)

音声の再生

from IPython.display import Audio, display

def play_audio(filename):
    """Display an autoplaying audio player for ``filename``."""
    player = Audio(filename, autoplay=True)
    display(player)

def create_audio_player(filename):
    """Build, without displaying it, an audio player for ``filename``."""
    player = Audio(filename)
    return player

# Usage example: play the file immediately
play_audio(confirmation_file)

# Or build a player and display it explicitly (no autoplay)
audio_player = create_audio_player(confirmation_file)
display(audio_player)

結構自然な感じで文章を読んでくれる。音声作成まで若干時間はかかるが、その時間が許容できるケースでは活用できる。資格勉強のドキュメントの音声化などを自動化したいときにはいいかもしれない。

カスタマイズのポイント

音声の種類と口調の調整

# Examples of available voice names (usable as voice_name in generate_speech)
voices = ['Kore', 'Charon', 'Fenrir', 'Puck']

def create_cheerful_announcement(order: CustomerOrder):
    """Generate the confirmation announcement read in a bright, cheerful tone.

    The confirmation text for ``order`` is prefixed with a Japanese
    instruction asking for an upbeat delivery, then passed to
    generate_speech with the 'Kore' voice. Returns whatever generate_speech
    returns.
    """
    confirmation_text = create_order_announcement(order, "confirmation")
    return generate_speech(
        f"以下のテキストを明るく元気な口調で読み上げてください：{confirmation_text}",
        voice_name='Kore',
    )

まとめ

Google Gemini TTSを使って、レストランの注文システムを想定した音声機能の組み込みを試してみました。
注文確認や店内アナウンス、多言語対応など、実際の現場でも役立つシーンが多く、アクセシビリティ向上にもつながります。

今回の例は一例ですが、皆さんの日常にも応用できるユースケースがきっと見つかるはずです。音声技術を活かして、より自然で親しみやすい体験をぜひ試してみてください。

NotebookLMで音声ファイルを作成できるとは思いますが、今回はAPI経由でも生成できるのかという観点で試してみました。

参考リンク

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up