More than 1 year has passed since last update.

【Windows】【Python】OpenCVでAudio読み込みお試し

Last updated at 2023-12-11 / Posted at 2023-12-09

この記事はOpenCV Advent Calendar 2023の10日目の記事です。

2年前だったか、3年前だったか、OpenCVにAudioIOサポート（試験的機能？）が追加されましたね。
たしか、将来的に、画像と音声などを扱うマルチモーダルAIの分野が伸びることを予想して追加された。みたいな背景だったような。
※色々うろ覚えで書いているので、間違っていたらご指摘ください🙇🏻

ドキュメントが全然無かったり、Windowsしか対応していなかったりしますが、ちょっと触ってみます👀
この投稿は、OpenCV 4.8.1.78 で試しています。

マイク入力

映像入力でおなじみの VideoCapture を利用します。
アレやコレやパラメータとサンプリングレートを指定してopen()します。
open()の第1引数がデバイス番号ですね。

# Set up microphone capture.
# The capture options are listed as (property, value) pairs and flattened
# into the 1-D [prop, value, prop, value, ...] array handed to open().
sampling_rate = 44100
capture_settings = (
    (cv2.CAP_PROP_AUDIO_STREAM, 0),
    (cv2.CAP_PROP_VIDEO_STREAM, -1),
    (cv2.CAP_PROP_AUDIO_DATA_DEPTH, cv2.CV_32F),
    (cv2.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, sampling_rate),
)
params = np.asarray([item for pair in capture_settings for item in pair])

cap = cv2.VideoCapture()
cap.open(0, cv2.CAP_ANY, params)  # first argument is the device number
audio_base_index = int(cap.get(cv2.CAP_PROP_AUDIO_BASE_INDEX))

grab()で次フレームのデータをバッファに入れて、retrieve()でバッファから取得します。

# Keep going while the capture is open and grab() buffers a next frame.
while cap.isOpened() and cap.grab():
    # Fetch the buffered audio frame; stop as soon as retrieval fails.
    ret, frame = cap.retrieve(np.asarray([]), audio_base_index)
    if not ret:
        break

上の2つとともに、かなり雑なデバッグ描画をくっつけたのが以下。
※波形もcv2.line()で描画しているので若干重いです。ちゃんとしたプログラム作るときには使用しないほうが良いです🦔

from collections import deque

import cv2
import numpy as np


def draw_audio_waveform(
        sampling_data,
        width=854,
        height=240,
        thickness=2,
        bg_color=(255, 255, 255),
        plot_color=(255, 0, 0),
):
    """Render audio samples as a line plot and return it as an image.

    Samples are plotted one per pixel column on a canvas that is as wide as
    the number of samples; the canvas is then resized to the requested
    output size, so any number of samples fits into ``width`` pixels.

    Args:
        sampling_data: Sized sequence of numeric samples. Each sample is
            multiplied by ``height`` and offset from the vertical centre,
            so positive values are drawn below the centre line.
        width: Width of the returned image in pixels.
        height: Height of the returned image in pixels.
        thickness: Line thickness passed to ``cv2.line``.
        bg_color: 3-component background colour (0-255 per channel).
        plot_color: 3-component waveform colour (0-255 per channel).

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``. With no samples this is a
        plain background image of shape ``(height, width, 3)``.
    """
    original_width = len(sampling_data)
    center_y = height // 2

    # Nothing to plot: return a plain background instead of handing a
    # zero-width canvas to cv2.resize.
    if original_width == 0:
        return np.full((height, width, 3), bg_color, dtype=np.uint8)

    # One column per sample. uint8 matches the 0-255 colour values (the
    # previous float64 canvas only looked right because display saturates).
    image = np.full((height, original_width, 3), bg_color, dtype=np.uint8)

    prev_value = None
    for index, point in enumerate(sampling_data):
        value = center_y + int(point * height)
        if prev_value is not None:
            cv2.line(
                image,
                (index - 1, prev_value),
                (index, value),
                plot_color,
                thickness=thickness,
                lineType=cv2.LINE_8,
            )
        prev_value = value

    # Scale the sample-wide canvas to the requested output size.
    return cv2.resize(image, (width, height))


# Prepare microphone input.
# params is a flat [property, value, ...] list: audio stream 0, video stream
# -1, 32-bit float samples, and the requested sampling rate.
cap = cv2.VideoCapture()
sampling_rate = 44100
params = np.asarray([
    cv2.CAP_PROP_AUDIO_STREAM, 0, cv2.CAP_PROP_VIDEO_STREAM, -1,
    cv2.CAP_PROP_AUDIO_DATA_DEPTH, cv2.CV_32F,
    cv2.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, sampling_rate,
])
cap.open(0, cv2.CAP_ANY, params)  # first argument is the device number
audio_base_index = int(cap.get(cv2.CAP_PROP_AUDIO_BASE_INDEX))

input_audio = deque([], maxlen=sampling_rate * 5)  # 5-second buffer
try:
    while cap.isOpened():
        # grab() buffers the next frame; retrieve() reads it back.
        if not cap.grab():
            break

        # Retrieve the audio frame
        frame = np.asarray([])
        ret, frame = cap.retrieve(frame, audio_base_index)
        if not ret:
            break

        # Append the samples in frame[0]
        # (presumably the frame is shaped (1, n_samples) -- TODO confirm).
        input_audio.extend(frame[0])

        # Debug drawing: plot only the newest 500 samples
        debug_frame = list(input_audio)[-500:]
        debug_image = draw_audio_waveform(
            debug_frame,
            width=len(debug_frame),
        )
        cv2.imshow('test', debug_image)
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break
finally:
    # Release the capture device even if the loop exits with an exception.
    cap.release()
    cv2.destroyAllWindows()

上記スクリプトを動かすと、マイク入力に問題無ければ、以下のような表示がされると思います。

オーディオファイル入力

open()の第1引数にデバイス番号ではなく、ファイルパスを指定します👀

# Prepare audio file input.
# Same parameter list as the microphone case; only the first argument of
# open() changes from a device number to a file path.
audio_file_path = 'sample.wav'
cap = cv2.VideoCapture()
sampling_rate = 44100  # named like the other snippets (was samplingRate)
params = np.asarray([
    cv2.CAP_PROP_AUDIO_STREAM, 0, cv2.CAP_PROP_VIDEO_STREAM, -1,
    cv2.CAP_PROP_AUDIO_DATA_DEPTH, cv2.CV_32F,
    cv2.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, sampling_rate,
])
cap.open(audio_file_path, cv2.CAP_ANY, params)

Webカメラ入力 & マイク入力

これ使い方自信ないのですが、、、
それぞれ VideoCapture で開いて使用する感じ。。。かなー🤔
~~何かもっとスマートな設計にしてる気がするので、冬休みに時間あればOpenCVのソースコード追って、もう少しパラメータ指定確認します👀~~
2023/12/10追記：Webカメラ入力とマイク入力は、そもそも本質的には別デバイスなので、おそらく同じVideoCaptureからは取得できない

from collections import deque

import cv2
import numpy as np


def draw_audio_waveform(
        sampling_data,
        width=854,
        height=240,
        thickness=2,
        bg_color=(255, 255, 255),
        plot_color=(255, 0, 0),
):
    """Render audio samples as a line plot and return it as an image.

    Samples are plotted one per pixel column on a canvas that is as wide as
    the number of samples; the canvas is then resized to the requested
    output size, so any number of samples fits into ``width`` pixels.

    Args:
        sampling_data: Sized sequence of numeric samples. Each sample is
            multiplied by ``height`` and offset from the vertical centre,
            so positive values are drawn below the centre line.
        width: Width of the returned image in pixels.
        height: Height of the returned image in pixels.
        thickness: Line thickness passed to ``cv2.line``.
        bg_color: 3-component background colour (0-255 per channel).
        plot_color: 3-component waveform colour (0-255 per channel).

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``. With no samples this is a
        plain background image of shape ``(height, width, 3)``.
    """
    original_width = len(sampling_data)
    center_y = height // 2

    # Nothing to plot: return a plain background instead of handing a
    # zero-width canvas to cv2.resize.
    if original_width == 0:
        return np.full((height, width, 3), bg_color, dtype=np.uint8)

    # One column per sample. uint8 matches the 0-255 colour values (the
    # previous float64 canvas only looked right because display saturates).
    image = np.full((height, original_width, 3), bg_color, dtype=np.uint8)

    prev_value = None
    for index, point in enumerate(sampling_data):
        value = center_y + int(point * height)
        if prev_value is not None:
            cv2.line(
                image,
                (index - 1, prev_value),
                (index, value),
                plot_color,
                thickness=thickness,
                lineType=cv2.LINE_8,
            )
        prev_value = value

    # Scale the sample-wide canvas to the requested output size.
    return cv2.resize(image, (width, height))


# Prepare microphone input.
# params is a flat [property, value, ...] list: audio stream 0, video stream
# -1, 32-bit float samples, and the requested sampling rate.
audio_cap = cv2.VideoCapture()
sampling_rate = 16000
params = np.asarray([
    cv2.CAP_PROP_AUDIO_STREAM, 0, cv2.CAP_PROP_VIDEO_STREAM, -1,
    cv2.CAP_PROP_AUDIO_DATA_DEPTH, cv2.CV_32F,
    cv2.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, sampling_rate,
])
audio_cap.open(0, cv2.CAP_ANY, params)
audio_base_index = int(audio_cap.get(cv2.CAP_PROP_AUDIO_BASE_INDEX))

# Prepare the webcam (opened as a separate VideoCapture)
webcam_cap = cv2.VideoCapture(0)

input_audio = deque([], maxlen=sampling_rate * 5)  # 5-second buffer
try:
    while audio_cap.isOpened():
        if not audio_cap.grab():
            break

        # Retrieve the audio frame
        frame = np.asarray([])
        ret, frame = audio_cap.retrieve(frame, audio_base_index)
        if not ret:
            break

        # Append the samples in frame[0]
        # (presumably the frame is shaped (1, n_samples) -- TODO confirm).
        input_audio.extend(frame[0])

        # Read the webcam image
        ret, webcam_image = webcam_cap.read()
        if not ret:
            break

        # Debug drawing: plot only the newest 500 samples
        debug_frame = list(input_audio)[-500:]
        debug_image = draw_audio_waveform(
            debug_frame,
            width=len(debug_frame),
        )
        cv2.imshow('audio', debug_image)
        cv2.imshow('webcam', webcam_image)  # window title typo 'wecam' fixed
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break
finally:
    # Release both devices even if the loop exits with an exception.
    audio_cap.release()
    webcam_cap.release()
    cv2.destroyAllWindows()

2023/12/10追記：動画ファイル入力(画像 & 音声)

Pythonサンプルばかり探して目に入っていませんでしたが、熊太郎さんがC++のサンプルのありかを教えてくれました🦔

Pythonでも同様の使い方で、動画ファイルから映像フレームと音声チャンネル、それぞれが取得できました👀

from collections import deque

import cv2
import numpy as np


def draw_audio_waveform(
        sampling_data,
        width=854,
        height=240,
        thickness=2,
        bg_color=(255, 255, 255),
        plot_color=(255, 0, 0),
):
    """Render audio samples as a line plot and return it as an image.

    Samples are plotted one per pixel column on a canvas that is as wide as
    the number of samples; the canvas is then resized to the requested
    output size, so any number of samples fits into ``width`` pixels.

    Args:
        sampling_data: Sized sequence of numeric samples. Each sample is
            multiplied by ``height`` and offset from the vertical centre,
            so positive values are drawn below the centre line.
        width: Width of the returned image in pixels.
        height: Height of the returned image in pixels.
        thickness: Line thickness passed to ``cv2.line``.
        bg_color: 3-component background colour (0-255 per channel).
        plot_color: 3-component waveform colour (0-255 per channel).

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``. With no samples this is a
        plain background image of shape ``(height, width, 3)``.
    """
    original_width = len(sampling_data)
    center_y = height // 2

    # Nothing to plot: return a plain background instead of handing a
    # zero-width canvas to cv2.resize.
    if original_width == 0:
        return np.full((height, width, 3), bg_color, dtype=np.uint8)

    # One column per sample. uint8 matches the 0-255 colour values (the
    # previous float64 canvas only looked right because display saturates).
    image = np.full((height, original_width, 3), bg_color, dtype=np.uint8)

    prev_value = None
    for index, point in enumerate(sampling_data):
        value = center_y + int(point * height)
        if prev_value is not None:
            cv2.line(
                image,
                (index - 1, prev_value),
                (index, value),
                plot_color,
                thickness=thickness,
                lineType=cv2.LINE_8,
            )
        prev_value = value

    # Scale the sample-wide canvas to the requested output size.
    return cv2.resize(image, (width, height))


# Prepare video file input.
# Unlike the audio-only examples, the video stream index is 0 here so that
# both image frames and audio are read from the same VideoCapture.
audio_file_path = 'sample.mp4'
cap = cv2.VideoCapture()
sampling_rate = 44100
params = np.asarray([
    cv2.CAP_PROP_AUDIO_STREAM, 0, cv2.CAP_PROP_VIDEO_STREAM, 0,
    cv2.CAP_PROP_AUDIO_DATA_DEPTH, cv2.CV_32F,
    cv2.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, sampling_rate,
])
cap.open(audio_file_path, cv2.CAP_ANY, params)

audio_base_index = int(cap.get(cv2.CAP_PROP_AUDIO_BASE_INDEX))
audio_total_channels = int(cap.get(cv2.CAP_PROP_AUDIO_TOTAL_CHANNELS))

# One 5-second sample buffer per audio channel
input_audio_list = [
    deque([], maxlen=sampling_rate * 5)
    for _ in range(audio_total_channels)
]

try:
    while cap.isOpened():
        if not cap.grab():
            break

        # Retrieve the video frame
        ret, image = cap.retrieve()
        if not ret:
            break

        # Retrieve the audio frame of every channel
        for index in range(audio_total_channels):
            audio_frame = np.asarray([])
            ret, audio_frame = cap.retrieve(
                audio_frame, audio_base_index + index)
            # Skip a channel with no data for this grab instead of
            # indexing into a missing/empty frame.
            if not ret or audio_frame is None or len(audio_frame) == 0:
                continue
            input_audio_list[index].extend(audio_frame[0])

        # Debug drawing: newest 500 samples of each channel
        for index, channel_audio in enumerate(input_audio_list):
            debug_frame = list(channel_audio)[-500:]
            if not debug_frame:
                continue  # nothing buffered for this channel yet
            debug_image = draw_audio_waveform(
                debug_frame,
                width=len(debug_frame),
            )
            cv2.imshow('channel:' + str(index + 1), debug_image)
        cv2.imshow('image', image)
        key = cv2.waitKey(1)
        if key == 27:  # ESC
            break
finally:
    # Release the file handle even if the loop exits with an exception.
    cap.release()
    cv2.destroyAllWindows()

以上。

OpenCV Advent Calendar 2023

Day 10

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up