More than 3 years have passed since last update.

撮影した動画から取れ高っぽいところを自動で見つけたい

Posted at 2020-05-10

やりたいこと

実況動画を撮影する際に、数時間にもおよぶ動画データの中から撮れ高を探す必要があって辛いのでなんとかしたい

アプローチ

取れ高 = 盛り上がってるポイント = 音量が大きい部分という仮説に基づき、音量が大きい部分を抽出した
実際のデータをみてみると、後半は（つかれて？）全体的な声量が下がっていくという傾向がみられたので、音量の絶対値ではなく極大値（ピーク）を使うようにした
- 下記のコードでは、最大値ベースと極大値ベースを cut_type で切り替えられるようにしています。

ソース

入力

voice.wav: 音声だけのデータ (背景音を含まない)
full.mp3: 音声 + 背景音
full.mp4: 動画(音声を含んでいても含んでいなくても良い)

出力 (実際に使用する際は、下記の二つのファイルをffmpegや動画編集ソフトで結合してください)

cut.wav: 音声データ
cut.mp4: 音声抜きの動画

cut_movie.py

import datetime
import os

import cv2
import librosa
import numpy as np
import scipy


# Helper that formats seconds for readable console output. Purely cosmetic.
def pretty_print_sec(sec):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        sec: Duration in seconds. Any fractional part is dropped via ``int``.

    Returns:
        A string such as ``"01:02:03"``. Every field is padded to at least
        two digits; the hour field grows beyond two digits when needed
        instead of being cut down to its last two characters.
    """
    total_sec = int(sec)

    hours, remainder = divmod(total_sec, 3600)
    minutes, seconds = divmod(remainder, 60)

    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)


# Used to check whether a given time (in seconds) is inside a cut range.
def is_in(tuple_list, val):
    """Return True when ``val`` lies inside any inclusive range.

    Args:
        tuple_list: Iterable of sequences whose first two items are the
            lower and upper bound of a range.
        val: Value to look up.

    Returns:
        True if ``bounds[0] <= val <= bounds[1]`` holds for at least one
        entry, otherwise False.
    """
    return any(bounds[0] <= val <= bounds[1] for bounds in tuple_list)


# Use this one to cut based on absolute loudness (percentile threshold).
def cut_by_max_rms(rms, percentile, frame_sec=30):
    """Select runs of frames whose RMS is at or above a percentile threshold.

    Consecutive frames that pass the threshold are merged into a single
    range.

    Args:
        rms: Two-dimensional RMS data; only the first row (``rms[0]``) is
            used, one value per analysis frame.
        percentile: Percentile (0-100) of the frame values used as the
            threshold. Frames with a value ``>=`` the threshold are selected.
        frame_sec: Duration of one frame in seconds, used to convert frame
            indices to times. Defaults to 30.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples of floats. Empty when
        there are no frames.
    """
    frames = np.asarray(rms[0])
    if frames.size == 0:
        # np.percentile has no meaningful answer for an empty array.
        return []

    threshold = np.percentile(frames, percentile)
    cut_list = []

    # ``start`` is None while we are outside a selected run.
    start = None
    for i, val in enumerate(frames):
        is_loud = val >= threshold
        if is_loud and start is None:
            start = float(i) * frame_sec
        elif not is_loud and start is not None:
            cut_list.append((start, float(i) * frame_sec))
            start = None

    # A run that is still open at the end extends to the end of the last frame.
    if start is not None:
        cut_list.append((start, float(frames.size) * frame_sec))

    return cut_list


# Use this one to cut based on local maxima (peaks) of the loudness.
def cut_by_local_max_rms(rms, max_frame_num, frame_sec=30):
    """Select frames that are local RMS peaks, keeping at most a given count.

    The peak search starts by comparing each frame with its direct
    neighbours and widens the comparison window (``order``) one step at a
    time until no more than ``max_frame_num`` peaks remain.

    Args:
        rms: Two-dimensional RMS data; only the first row (``rms[0]``) is
            used, one value per analysis frame.
        max_frame_num: Maximum number of peak frames to return. A value of
            zero or less yields an empty list.
        frame_sec: Duration of one frame in seconds, used to convert frame
            indices to times. Defaults to 30.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples, one per selected frame.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not make
    # ``scipy.signal`` available on every SciPy version.
    from scipy.signal import argrelmax

    if max_frame_num <= 0:
        # A unique interior maximum stays a peak for every order, so the
        # search below could never get down to zero peaks.
        return []

    frames = np.asarray(rms[0])

    order = 1
    while True:
        peaks = argrelmax(frames, order=order)[0]

        # ``<=`` so that exactly ``max_frame_num`` peaks is accepted.
        if len(peaks) <= max_frame_num:
            break

        order += 1

    return [
        (int(peak) * frame_sec, (int(peak) + 1) * frame_sec) for peak in peaks
    ]


# Decide which parts to cut out.
# The selection is based on loudness.
def decide_cut_frames(cut_type, voice_file):
    """Pick the time ranges to extract, based on the loudness of the voice track.

    Args:
        cut_type: ``"local_max"`` to select loudness peaks, or ``"max"`` to
            select the loudest frames by percentile.
        voice_file: Path of the audio file to analyse.

    Returns:
        The list of ``(start_sec, end_sec)`` tuples produced by the chosen
        strategy.

    Raises:
        ValueError: If ``cut_type`` is not one of the supported values.
    """
    # Fail before the (slow) audio load when the strategy name is wrong.
    if cut_type not in ("local_max", "max"):
        raise ValueError(
            "unknown cut_type: {!r} (expected 'local_max' or 'max')".format(
                cut_type
            )
        )

    # Load the audio used for deciding the cut points.
    # A sample rate of 8000 is used to keep the load as light as possible.
    y_voice, sr_voice = librosa.load(voice_file, sr=8000, mono=True)

    # Measure the loudness in 30-second units.
    # NOTE(review): with center=True librosa centres frame i on sample
    # i * hop_length, while the cut functions map frame i to
    # [i * 30, (i + 1) * 30] seconds - confirm whether this half-frame
    # offset is intended.
    rms = librosa.feature.rms(
        y=y_voice,
        frame_length=sr_voice * 30,
        hop_length=sr_voice * 30,
        center=True,
        pad_mode="reflect",
    )

    if cut_type == "local_max":
        # Cut at loudness peaks, passing 20 as the frame limit.
        return cut_by_local_max_rms(rms, 20)

    # cut_type == "max": cut the loudest 5% of frames, i.e. those at or
    # above the 95th percentile.
    return cut_by_max_rms(rms, 95)


# Cut the selected ranges out of the movie.
def cut_movie(cut_list, movie_file, output_movie_file):
    """Write the frames inside the given time ranges to a new video file.

    Each written frame gets its source timestamp drawn near the bottom-left
    corner. The output has no audio track.

    Args:
        cut_list: Iterable of ``(start_sec, end_sec)`` ranges to extract.
        movie_file: Path of the source video.
        output_movie_file: Path of the video to write. An existing file at
            this path is removed first.

    Raises:
        IOError: If the source video cannot be opened.
        ValueError: If the source video reports a non-positive frame rate.
    """
    movie = cv2.VideoCapture(movie_file)
    if not movie.isOpened():
        movie.release()
        raise IOError("could not open movie file: {}".format(movie_file))

    out = None
    try:
        fps = movie.get(cv2.CAP_PROP_FPS)
        height = int(movie.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = int(movie.get(cv2.CAP_PROP_FRAME_WIDTH))
        print(fps, width, height)

        if fps <= 0:
            # The frame index -> seconds conversion below divides by fps.
            raise ValueError(
                "invalid frame rate {} reported for {}".format(fps, movie_file)
            )

        # Output format.
        # This probably differs between operating systems, so take care.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")

        # Remove an existing output first; writing over it causes an error.
        if os.path.exists(output_movie_file):
            os.remove(output_movie_file)

        # Pass the frame rate unrounded: truncating e.g. 29.97 to 29 would
        # stretch the output and make it drift from the extracted audio.
        out = cv2.VideoWriter(output_movie_file, fourcc, fps, (width, height))

        font = cv2.FONT_HERSHEY_SIMPLEX
        text_origin = (10, int(height * 0.9))

        for start, end in cut_list:
            i = start * fps
            # Seek to the start of the range (position is in milliseconds).
            movie.set(cv2.CAP_PROP_POS_MSEC, start * 1000)

            # Read frame by frame from start and stop once end is passed.
            while movie.isOpened():
                sec = float(i / fps)
                if sec % 60 == 0:
                    print(
                        pretty_print_sec(sec),
                        datetime.datetime.now(),
                        flush=True,
                    )

                ret, frame = movie.read()
                if not ret:
                    break

                # Overlay the current timestamp on the frame.
                cv2.putText(
                    frame,
                    pretty_print_sec(sec),
                    text_origin,
                    font,
                    1,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )

                if is_in(cut_list, sec):
                    out.write(frame)

                i += 1
                if sec > end:
                    break
    finally:
        # Release the handles even when reading or writing fails.
        movie.release()
        if out is not None:
            out.release()


# Cut the selected ranges out of the audio.
def cut_audio(cut_list, voice_file, output_audio_file):
    """Write the samples inside the given time ranges to a WAV file.

    Args:
        cut_list: Iterable of ``(start_sec, end_sec)`` ranges to extract.
            Ranges that run past the end of the audio are clamped.
        voice_file: Path of the source audio.
        output_audio_file: Path of the WAV file to write.
    """
    # ``librosa.output.write_wav`` no longer exists in librosa >= 0.8, so the
    # file is written with SciPy instead.
    from scipy.io import wavfile

    # sr must be None, otherwise the audio is resampled to 22050 Hz.
    y_full, sr_full = librosa.load(voice_file, sr=None, mono=False)

    # A mono file loads as a 1-D array; treat it as a single channel.
    y_full = np.atleast_2d(y_full)
    total_samples = y_full.shape[1]

    chunks = []
    for start, end in cut_list:
        print(pretty_print_sec(start), datetime.datetime.now(), flush=True)

        # Sample indices of the range, end inclusive, clamped to the data so
        # a range reaching beyond the last sample does not raise IndexError.
        first = min(max(int(start * sr_full), 0), total_samples)
        last = min(int(end * sr_full) + 1, total_samples)
        if last > first:
            chunks.append(y_full[:, first:last])

    if chunks:
        output = np.concatenate(chunks, axis=1)
    else:
        output = y_full[:, :0]

    # scipy expects (samples, channels), librosa uses (channels, samples).
    wavfile.write(
        output_audio_file, int(sr_full), np.ascontiguousarray(output.T)
    )


def main():
    """Run the pipeline: pick the cut ranges, then cut the movie and the audio."""
    # Selection strategy.
    cut_type = "local_max"  # based on local maxima
    # cut_type = "max" # based on absolute loudness

    # Input files.
    voice_file = "voice.wav"  # voice only, extracted from the movie
    audio_file = "full.mp3"  # audio track extracted from the movie
    movie_file = "full.mp4"

    # Output files.
    output_movie_file = "cut.mp4"
    output_audio_file = "cut.wav"

    cut_list = decide_cut_frames(cut_type, voice_file)
    cut_movie(cut_list, movie_file, output_movie_file)
    cut_audio(cut_list, audio_file, output_audio_file)


if __name__ == "__main__":
    main()

備考

音声と(ゲーム音声などの)背景音が分離できている前提のコードになっています。
そのため、すでに配信された動画からの取れ高抽出(いわゆる切り抜き)は、音声と背景音を分離できないため、本コードの対象外となります。
ただ、取れ高部分はどの道大きな音になりがちだとは思うので、一定の効果は得られるかもしれません。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

撮影した動画から取れ高っぽいところを自動で見つけたい

やりたいこと

アプローチ

関連

ソース

備考