「わざわざWhisperとGPT-4とFunction callingでエアコンを消す」音声入力編

Last updated at 2024-03-07 / Posted at 2024-03-07

はじめに

前にこんな記事を書きました。

内容としてはWhisperで音声ファイルの内容をテキスト化→GPT-4でテキストを具体的な指示に加工→Function callingで実行する処理決定という流れのサンプルを紹介した、というものです。

この記事では拾ってきた音声ファイルを元にサンプルを動かしていましたが、今回は音声入力編ということで自分の声で使えるようにしてみました。

具体的な流れとしては、前回の流れの前に「PyAudioとwaveを使ってマイクからの音声を.wavファイル化する」というステップが入るだけです。
これ以降の内容に興味がある方は前記事を見ていただけると幸いです。

今回のサンプルはM1 Macbookで動作させています。マイクに関してはMacbook内蔵のもの（デフォルトのもの）を使っているためサンプル上では明示的に設定していません。

また、サンプルのスクリプトの中身も前の記事の内容をベースにしています。
以下が追記部分です。

1. 追加で必要なモジュールのimport

import pyaudio
import wave
import threading
import os

2. 録音開始、録音、録音停止処理の追加

# Recording parameters
FORMAT = pyaudio.paInt16  # sample format: used to open the stream and to derive the WAV sample width
CHANNELS = 1  # channel count: used for the input stream and the WAV header
RATE = 44100  # sampling rate: used for the input stream and the WAV header
CHUNK = 1024  # frames per buffer, and the number of frames read per stream.read() call
WAVE_OUTPUT_FILENAME = "./audio/output.wav"  # path the recorded audio is written to

# Raw audio chunks appended by record_audio() and joined when the WAV file is written
frames = []
# Flag polled by record_audio(); set by start_recording() and cleared by stop_recording()
recording = False

def record_audio():
    """Capture microphone audio into the global ``frames`` list.

    Opens an input stream with the module-level FORMAT / CHANNELS / RATE /
    CHUNK settings (no input device is specified, so PyAudio picks one) and
    appends every chunk read from it to ``frames`` for as long as the global
    ``recording`` flag is true. Meant to be run in a background thread by
    ``start_recording()``.

    The stream and the PyAudio instance are released even when opening or
    reading the stream raises, so a failure does not leave the device held.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            # The flag is only read here, so no ``global`` declaration is needed
            while recording:
                # exception_on_overflow=False: keep recording instead of
                # raising when the input buffer overflows
                data = stream.read(CHUNK, exception_on_overflow=False)
                frames.append(data)
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

# Handle of the background recording thread; None while no recording is running
_recording_thread = None

def start_recording():
    """Start recording in a background thread.

    Sets the global ``recording`` flag, resets ``frames`` so that a new
    recording starts empty, and launches ``record_audio`` in a thread whose
    handle is kept so that ``stop_recording()`` can wait for it.
    """
    global recording, frames, _recording_thread
    recording = True
    frames = []  # clear the frames before a new recording starts
    _recording_thread = threading.Thread(target=record_audio)
    _recording_thread.start()

def stop_recording():
    """Stop recording and wait until the recording thread has finished.

    Clearing the flag only asks ``record_audio`` to leave its loop; joining
    the thread guarantees that the last chunk has been appended to ``frames``
    and the audio device has been released before the caller goes on to save
    the data. Calling this without a running recording is a no-op apart from
    clearing the flag.
    """
    global recording, _recording_thread
    recording = False
    if _recording_thread is not None:
        _recording_thread.join()
        _recording_thread = None

3. 音声入力開始からファイル保存までの処理の追加
エンターキーを押すと録音が開始され、もう一度押すと停止します。

if __name__ == '__main__':
    print("-- 0. 音声入力の受付 --")
    # Create the output directory if needed. The check on the directory part
    # avoids calling makedirs("") when the output path has no directory.
    output_directory = os.path.dirname(WAVE_OUTPUT_FILENAME)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    # Record: the first Enter starts the recording, the second one stops it
    input("エンターキーを押して録音を開始してください...")
    start_recording()
    print("【録音中】録音を停止するにはもう一度エンターキーを押してください...")
    input()  # block until Enter is pressed, then stop recording
    stop_recording()
    print("録音終了。ファイルを保存しています...")
    # Save the audio file; the context manager closes it even if a write fails
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        # Module-level helper: no extra PyAudio instance is created (and left
        # unterminated) just to look up the sample width
        wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("ファイルが保存されました: {}".format(WAVE_OUTPUT_FILENAME))
    print("")

全ての処理を含んだサンプルスクリプトも置いておきます↓。長いので閉じてます。

　極力サンプルとしてコードをシンプルに保つためエラーハンドリングをしていませんし、実行できる関数も「エアコンを消す」ものだけです。必要な場合は適宜追加・修正してください。

　サンプルスクリプト（全体版）

import requests
import pyaudio
import wave
import threading
import os

# OpenAI API settings.
# The key is taken from the OPENAI_API_KEY environment variable when it is set
# (and non-empty), so a real secret does not have to be written into this
# file; otherwise the placeholder below is used and must be replaced.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "your-api-key"
# Endpoint used for speech-to-text (Whisper)
WHISPER_ENDPOINT = "https://api.openai.com/v1/audio/transcriptions"
# Endpoint used for text rewriting and function selection (chat completions)
CHAT_ENDPOINT = "https://api.openai.com/v1/chat/completions"

# Recording parameters
FORMAT = pyaudio.paInt16  # sample format: used to open the stream and to derive the WAV sample width
CHANNELS = 1  # channel count: used for the input stream and the WAV header
RATE = 44100  # sampling rate: used for the input stream and the WAV header
CHUNK = 1024  # frames per buffer, and the number of frames read per stream.read() call
WAVE_OUTPUT_FILENAME = "./audio/output.wav"  # path the recorded audio is written to and later uploaded from

# Raw audio chunks appended by record_audio() and joined when the WAV file is written
frames = []
# Flag polled by record_audio(); set by start_recording() and cleared by stop_recording()
recording = False

def record_audio():
    """Capture microphone audio into the global ``frames`` list.

    Opens an input stream with the module-level FORMAT / CHANNELS / RATE /
    CHUNK settings (no input device is specified, so PyAudio picks one) and
    appends every chunk read from it to ``frames`` for as long as the global
    ``recording`` flag is true. Meant to be run in a background thread by
    ``start_recording()``.

    The stream and the PyAudio instance are released even when opening or
    reading the stream raises, so a failure does not leave the device held.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            # The flag is only read here, so no ``global`` declaration is needed
            while recording:
                # exception_on_overflow=False: keep recording instead of
                # raising when the input buffer overflows
                data = stream.read(CHUNK, exception_on_overflow=False)
                frames.append(data)
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

# Handle of the background recording thread; None while no recording is running
_recording_thread = None

def start_recording():
    """Start recording in a background thread.

    Sets the global ``recording`` flag, resets ``frames`` so that a new
    recording starts empty, and launches ``record_audio`` in a thread whose
    handle is kept so that ``stop_recording()`` can wait for it.
    """
    global recording, frames, _recording_thread
    recording = True
    frames = []  # clear the frames before a new recording starts
    _recording_thread = threading.Thread(target=record_audio)
    _recording_thread.start()

def stop_recording():
    """Stop recording and wait until the recording thread has finished.

    Clearing the flag only asks ``record_audio`` to leave its loop; joining
    the thread guarantees that the last chunk has been appended to ``frames``
    and the audio device has been released before the caller goes on to save
    the data. Calling this without a running recording is a no-op apart from
    clearing the flag.
    """
    global recording, _recording_thread
    recording = False
    if _recording_thread is not None:
        _recording_thread.join()
        _recording_thread = None

def create_transcription(audio_file):
    """Upload an audio file to the Whisper endpoint and return the parsed reply.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The response body decoded from JSON. The caller reads the
        transcribed text from its "text" entry.
    """
    headers = {
        "Authorization": "Bearer " + OPENAI_API_KEY
    }
    # Keep the file open only for the duration of the upload so that the
    # handle is always closed, even when the request raises
    with open(audio_file, "rb") as audio:
        files = {
            # Send the uploaded file's own name rather than a fixed sample name
            "file": (os.path.basename(audio_file), audio),
            "model": (None, "whisper-1")
        }
        response = requests.post(WHISPER_ENDPOINT, headers=headers, files=files)
    return response.json()

def transform_transcription(transcription):
    """Ask the chat model to rewrite the transcribed text as a more concrete instruction.

    Args:
        transcription: Parsed transcription response; its "text" entry is
            appended to the user prompt.

    Returns:
        The chat completions response decoded from JSON.
    """
    auth_headers = {"Authorization": "Bearer " + OPENAI_API_KEY}
    system_message = {
        "role": "system",
        "content": "与えられたテキストを指示通りに加工してください。",
    }
    user_message = {
        "role": "user",
        "content": "テキストに動詞が入っていた場合、より具体的な指示に変換してください: " + transcription["text"],
    }
    payload = {"model": "gpt-4", "messages": [system_message, user_message]}
    reply = requests.post(CHAT_ENDPOINT, headers=auth_headers, json=payload)
    return reply.json()

### Candidate functions offered to the model ###
def operate_air_conditioner():
    """Report that the air conditioner was turned off.

    No device is contacted in this sample: the function only prints a success
    line. The commented sketch below is a fictional illustration of what a
    real request could look like.
    """
    # Fictional example, for illustration only:
    # sample_endpoint = "https://sample-smart-air-conditionar.com"
    # headers = {
    #     "Authorization": "xxxxyyyyzzz"
    # }
    # body = {
    #     "operation": "turn-off"
    # }
    # response = requests.post(sample_endpoint, headers=headers, json=body)
    # return response.json()
    result_message = "success: Air conditioner is turned off."
    print(result_message)

def operate_floor_heating():
    """Report that the floor heating was turned off.

    As with the air conditioner, no device is contacted; the illustrative
    request sketch is omitted here and only a success line is printed.
    """
    result_message = "success: floor N, floor heating is turned off."
    print(result_message)

### Function that lets the model pick the operation ###
def select_function(instruction):
    """Ask the chat model which appliance function matches the instruction.

    The request offers two functions (air conditioner and floor heating) and
    sets "function_call" to "auto".

    Args:
        instruction: Instruction text sent as the user message.

    Returns:
        The chat completions response decoded from JSON.
    """
    available_functions = [
        {"name": "operate_air_conditioner", "description": "エアコンの操作を行う"},
        {"name": "operate_floor_heating", "description": "床暖房の操作を行う"},
    ]
    conversation = [
        {
            "role": "system",
            "content": "あなたはスマート家電のオペレーターです。様々な家電に関する操作指示を受けます。",
        },
        {"role": "user", "content": instruction},
    ]
    payload = {
        "model": "gpt-4",
        "messages": conversation,
        "functions": available_functions,
        "function_call": "auto",
    }
    reply = requests.post(
        CHAT_ENDPOINT,
        headers={"Authorization": "Bearer " + OPENAI_API_KEY},
        json=payload,
    )
    return reply.json()

if __name__ == '__main__':
    print("-- 0. 音声入力の受付 --")
    # Create the output directory if needed. The check on the directory part
    # avoids calling makedirs("") when the output path has no directory.
    output_directory = os.path.dirname(WAVE_OUTPUT_FILENAME)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    # Record: the first Enter starts the recording, the second one stops it
    input("エンターキーを押して録音を開始してください...")
    start_recording()
    print("【録音中】録音を停止するにはもう一度エンターキーを押してください...")
    input()  # block until Enter is pressed, then stop recording
    stop_recording()
    print("録音終了。ファイルを保存しています...")
    # Save the audio file; the context manager closes it even if a write fails
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        # Module-level helper: no extra PyAudio instance is created (and left
        # unterminated) just to look up the sample width
        wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("ファイルが保存されました: {}".format(WAVE_OUTPUT_FILENAME))
    print("")
    # Transcribe the recorded audio
    transcription = create_transcription(WAVE_OUTPUT_FILENAME)
    print("-- 1. 音声をテキスト化 --")
    print(transcription)
    print("")
    # Rewrite the transcription as a more concrete instruction
    transformed_transcription = transform_transcription(transcription)
    instruction = transformed_transcription["choices"][0]["message"]["content"]
    print("-- 2. テキストを加工 --")
    print(instruction)
    print("")
    # Let the model choose the function to run
    selected_result = select_function(instruction)
    # With "function_call": "auto" the model may answer in plain text, in
    # which case the message carries no "function_call" entry
    function_call = selected_result["choices"][0]["message"].get("function_call") or {}
    selected_function = function_call.get("name")
    print("-- 3. 処理を選択 --")
    print("selected function: " + str(selected_function))
    print("details:")
    print(selected_result)
    print("")
    # Run the selected function
    print("-- 4. 処理を実行 --")
    if selected_function == "operate_air_conditioner":
        operate_air_conditioner()
    else:
        # Only the air conditioner operation is executed by this sample;
        # say so instead of finishing silently
        print("skipped: no executable function was selected.")

動作結果

私が「冷えてきたので、エアコンを消してください。」と喋った時の例です。

-- 0. 音声入力の受付 --
エンターキーを押して録音を開始してください...
【録音中】録音を停止するにはもう一度エンターキーを押してください...

録音終了。ファイルを保存しています...
ファイルが保存されました: ./audio/output.wav

-- 1. 音声をテキスト化 --
{'text': '冷えてきたので、エアコンを消してください。'}

-- 2. テキストを加工 --
寒さを感じ始めたので、エアコンの電源を切ってください。

-- 3. 処理を選択 --
selected function: operate_air_conditioner
details:
{'id': 'chatcmpl-xxxxxxxxyyyyyyyyzzzzzzz', 'object': 'chat.completion', 'created': 1707974787, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': None, 'function_call': {'name': 'operate_air_conditioner', 'arguments': '{\n  "action": "turn off"\n}'}}, 'logprobs': None, 'finish_reason': 'function_call'}], 'usage': {'prompt_tokens': 134, 'completion_tokens': 19, 'total_tokens': 153}, 'system_fingerprint': None}

-- 4. 処理を実行 --
success: Air conditioner is turned off.

参考

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up