More than 5 years have passed since last update.

音声信号のスパース性

Posted at 2019-07-23

#音声信号のスパース性を確認してみた

音声のスパース性を用いたUnderdetermined音源分離で，音源数が3の時に重なりが少ないらしい。

１分程度の発話データがあったので，合っているかわからないけど，やってみた。

マイクの特性の影響か，2kHz以上がほぼゼロに。。。

# -*- coding: utf-8 -*-

import numpy as np
import scipy.signal as sg
import re
import soundfile as sf
import glob
from functools import reduce
import matplotlib.pyplot as plt
import itertools


# 複数話者，複数発話のWavファイル読み込み
w_h = {}
for x in glob.glob("split/*.wav"):
    ary = re.split('[\\\._]', x)
    id1, id2 = ary[1], ary[2]
    wav, fs = sf.read(x)
    w_h.setdefault(id1, []).append([id1, id2, len(wav), wav])

# 各話者の全発話の長さの算出と発話順番をランダムに並び替え
w_h_len = 0
for k, v in w_h.items():
    w_h_len = max(w_h_len, reduce(lambda total, ary: total + ary[2], v, 0))
    w_h[k] = sorted(v, key=lambda x: np.random.rand())

# 各話者の発話を連結。短い話者の発話は繰り返す。端数はゼロでパディング
w_h_con = {}
for k, v in w_h.items():
    total = 0
    con   = np.array([])
    for v_c in itertools.cycle(v):
        total += v_c[2]
        if total > w_h_len:
            # 端数はゼロでパディングして終わり
            con = np.append(con, np.zeros(w_h_len - len(con)))
            w_h_con[k] = con
            break
        con = np.append(con, v_c[3])

# 時間周波数に変換
w_h_stft = {}
for k, x in w_h_con.items():
    w_h_stft[k] = sg.stft(x, fs=fs)

# 最大振幅の1/10以上は，アクティブな時間周波数bin(=1)とする。(無効なbinは0)
f = 0
t = 0
w_h_bm = {}
for k, v in w_h_stft.items():
    f, t, Zxx = v;
    Zxx_abs = np.abs(Zxx)
    minimum = np.sort(Zxx_abs[Zxx_abs > 0])[0]
    #plt.pcolormesh(t, f, (np.abs(Zxx)))
    #plt.colorbar()
    #plt.show()
    m = np.max(np.abs(Zxx))
    bm = np.where(np.abs(Zxx) > m / 10, 1, 0)
    w_h_bm[k] = [f, t, bm]


w_h_bm_list = list(w_h_bm.values())
# 重複する時間周波数binの個数を，全組み合わせ計算
N = 3
table = np.zeros((len(f), N + 1), dtype=int)
for pair in itertools.combinations(range(len(w_h_bm_list)), N):
    w_h_bm_total = reduce(lambda bm1, bm2: bm1 + bm2, map(lambda i: w_h_bm_list[i][2], list(pair)))
    for i in range(len(f)):
        uniq, cnt = np.unique(w_h_bm_total[i], return_counts=True)
        for k, v in zip(uniq, cnt):
            table[i, k] += v

# 重複する時間周波数binの個数の累積を計算
accu = np.zeros((np.max(w_h_bm_total) + 1 + 1, len(w_h_bm_total)))
for i in range(np.max(w_h_bm_total) + 1):
    accu[i + 1] = accu[i] + table[:, i]

# max=100%となるように正規化
accu = accu / accu[-1] * 100


# 4kHz（f=8000）以下を描画
num = len(f)//2
y = np.zeros(num);
for i in range(np.max(w_h_bm_total) + 1):
    plt.fill_between(f[:num], accu[i, :num], accu[i+1, :num])

plt.legend(range(len(table)))
plt.xlabel('Frequency [Hz]')
plt.ylabel('Percentage [%]')
plt.show()

ちなみに，最大振幅の1/10以下を０にして，istftした波形。結構元の波形から変わっている。。。

描画した時のコード。Zはstftの結果。


_t, _w = sg.istft(Z[2], fs=fs)
plt.plot(_t[:8000], _w[:8000])

_t, _w = sg.istft(np.where(np.abs(Z[2]) < np.max(np.abs(Z[2])) / 10, 0, Z[2]), fs=fs)
plt.plot(_t[:8000], _w[:8000])

plt.xlabel('Time [sec]')
plt.ylabel('Signal')
plt.legend(["Original", "Masked"])
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up