More than 3 years have passed since last update.

エロ動画のシーンを音声から分析するシリーズ 4: 転移学習を行う（声とその他に分けるモデルを作る）

Last updated at 2022-05-22 · Posted at 2022-05-22

前回のあらすじ

前回は転移学習用のデータセットを作った

今回は、そのデータセットから転移学習を行う

ちなみに、このシリーズの一貫した目的は「あえぎ声」と「会話」を分離すること

軽く計画

以下のようなディレクトリ構成になっている

dataset
- speech
- moan
- env_sound

speech と moan のデータセットを合わせて voice クラス、 env_sound を other クラスとして学習して、 voice_and_other_model として保存する

早速コード書く

import sys
import ffmpeg
import csv
from scipy.io import wavfile
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import datetime
import math
import shutil
from pytimeparse.timeparse import timeparse
import tensorflow_io as tfio
import glob

# Load the pretrained YAMNet model from TensorFlow Hub.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

# Build a (filename, class) dataset: clips under speech/ and moan/ form the
# "voice" class (id 0) and clips under env_sound/ form the "other" class (id 1).
class_names = ['voice', 'other']
class_name_dict = {name: index for index, name in enumerate(class_names)}
voice_wav_filenames = [
    path
    for pattern in ('dataset/speech/*.wav', 'dataset/moan/*.wav')
    for path in glob.glob(pattern)
]
other_wav_filenames = glob.glob('dataset/env_sound/*.wav')
filenames = voice_wav_filenames + other_wav_filenames
# Number of WAV files (not the number of training examples produced later).
dataset_size = len(filenames)
# One integer label per file, in the same order as `filenames`.
classes = np.concatenate([
    np.full(len(voice_wav_filenames), 0),
    np.full(len(other_wav_filenames), 1),
])
main_ds = tf.data.Dataset.from_tensor_slices((filenames, classes))

# Read a WAV file and convert it to a 16 kHz mono waveform.
@tf.function
def load_wav_16k_mono(filename):
    """Decode the WAV file at `filename` and resample it to 16 kHz mono.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The waveform with its channel axis removed, resampled from the
        file's own sample rate to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)
    # desired_channels=1 asks the decoder for a single channel.
    samples, source_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(samples, axis=-1)
    # The resampler is given the input rate as int64.
    source_rate = tf.cast(source_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=source_rate, rate_out=16000)

def load_wav_for_map(filename, class_id):
    """Adapt `load_wav_16k_mono` to (filename, class_id) dataset elements.

    Args:
        filename: Path of the WAV file to load.
        class_id: Label attached to the file; passed through unchanged.

    Returns:
        A (waveform, class_id) pair.
    """
    return load_wav_16k_mono(filename), class_id


# Replace each filename in the dataset with its decoded waveform.
main_ds = main_ds.map(load_wav_for_map)

# Apply YAMNet to every clip and emit one labelled example per embedding.
def extract_embedding(wav_data, class_id):
    """Run YAMNet on a waveform and label each resulting embedding.

    Args:
        wav_data: Waveform to feed to `yamnet_model`.
        class_id: Label of the clip the waveform came from.

    Returns:
        A (embeddings, labels) pair where `labels` repeats `class_id` once
        per row of `embeddings`.
    """
    # YAMNet returns (scores, embeddings, spectrogram); only the embeddings
    # are used for transfer learning.
    _, frame_embeddings, _ = yamnet_model(wav_data)
    frame_count = tf.shape(frame_embeddings)[0]
    frame_labels = tf.repeat(class_id, frame_count)
    return frame_embeddings, frame_labels


# unbatch() flattens the per-clip groups into individual (embedding, label) pairs.
main_ds = main_ds.map(extract_embedding).unbatch()

# Split into training, validation and test sets.
#
# After unbatch() every dataset element is a single embedding, not a file, so
# the split sizes and the shuffle buffer must be based on the number of
# embeddings rather than on the number of WAV files.
embedding_ds = main_ds.cache()
# One full pass counts the embeddings and also populates the cache.
num_examples = int(embedding_ds.reduce(np.int64(0), lambda count, _: count + 1).numpy())
if num_examples == 0:
    raise ValueError('No embeddings were produced; check that dataset/ contains WAV files.')

# A buffer covering the whole dataset gives a uniform shuffle. The order must
# be identical on every iteration (fixed seed, no reshuffling); otherwise the
# take()/skip() pipelines below would each see a different permutation and
# the three splits would overlap.
cached_ds = embedding_ds.shuffle(num_examples, seed=0, reshuffle_each_iteration=False)
train_size = int(0.7 * num_examples)
val_size = int(0.15 * num_examples)
# The test set receives whatever remains after the other two splits.
test_size = num_examples - train_size - val_size

# NOTE(review): the split is per embedding, so frames of one file can end up
# in different splits; splitting by file would avoid that leakage.
train_ds = cached_ds.take(train_size)
remain_ds = cached_ds.skip(train_size)
val_ds = remain_ds.take(val_size)
test_ds = remain_ds.skip(val_size)

# Only the training set is reshuffled between epochs.
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

# Classification head for transfer learning: it takes a 1024-value embedding
# and outputs one logit per entry of class_names.
my_model = tf.keras.Sequential([
    # The shape must be a tuple; `(1024)` without the trailing comma is just
    # the integer 1024.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32, name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    # No activation here: the loss below is configured with from_logits=True.
    tf.keras.layers.Dense(len(class_names)),
], name='my_model')
my_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=['accuracy'],
)

# Stop early when the validation loss has not improved for 3 epochs and roll
# back to the best weights. Validation loss is monitored (not training loss)
# so that the restored weights are the ones that did best on held-out data.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Training. `callbacks` is passed as a list, which is the documented form.
history = my_model.fit(train_ds, epochs=30, validation_data=val_ds, callbacks=[callback])

# Evaluation on the held-out test set.
loss, accuracy = my_model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Save the trained classifier.
my_model.save('voice_and_other_model')

結果

いい感じで学習できた

次は

「声」として分類された音声を、さらに「会話」と「あえぎ声」に分けるモデルを作る

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up