Today was all about collecting a dataset. You need data to train a model, but here's the problem: collecting it is an enormous amount of work.
I had been listening through each track in full, writing down the time range every time the singer changed, and attaching a singer ID. After about 20 songs of this I was completely worn out; there was no way I could keep it up.
Then I had an idea: take recordings of the same song sung by different singers, cut out only the sections where someone is actually singing, and within those ranges randomly pull segments from several of the vocal files and layer them together. The annotations fall out automatically along the way, so it kills two birds with one stone.
So here is the Python code I ended up writing.
The directory structure looks like this:
- song_name/
|- 1/
| |- vocals.wav
| |- drums.wav
| |- piano.wav
| |- other.wav
|
|- 2/
| |- vocals.wav
| |- drums.wav
| |- piano.wav
| |- other.wav
|
|- 3/
|- vocals.wav
|- drums.wav
|- piano.wav
|- other.wav
............................
There are up to nine directories like this per song, and all of the stems were extracted with demucs.
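Before running anything on a song folder, it is handy to sanity-check that every numbered folder actually contains the expected stems. A minimal sketch (check_layout and STEMS are just illustrative names; the stem list is the one from the tree above):

import os

STEMS = ['vocals.wav', 'drums.wav', 'piano.wav', 'other.wav']

def check_layout(song_dir):
    """Return the numbered singer folders that contain all expected stems."""
    ok = []
    for name in sorted(os.listdir(song_dir)):
        folder = os.path.join(song_dir, name)
        if not (os.path.isdir(folder) and name.isdigit()):
            continue
        missing = [s for s in STEMS if not os.path.isfile(os.path.join(folder, s))]
        if missing:
            print(f'{name}: missing {missing}')
        else:
            ok.append(name)
    return ok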
The procedure itself is very simple.
1. Use inaSpeechSegmenter to extract the singing segments from vocals.wav (in practice, from the vocals.wav of one singer chosen as the base track).
def extract_singing_parts(wav_file_path):
    """Return the non-noEnergy (i.e. sung) intervals as a list of (start, end) pairs."""
    seg_model = Segmenter(vad_engine='smn', detect_gender=False)
    seg_data = seg_model(wav_file_path)
    # Collect the noEnergy intervals
    noEnergy_intervals = [(start, end) for label, start, end in seg_data if label == 'noEnergy']
    # Everything between two consecutive noEnergy intervals is treated as singing
    singing_parts = []
    for i in range(len(noEnergy_intervals) - 1):
        singing_parts.append((noEnergy_intervals[i][1], noEnergy_intervals[i + 1][0]))
    print("Singing parts: ", singing_parts)
    return singing_parts
======================================================
wav_file_path = os.path.join(input_directory, base_id, 'vocals.wav')
base_dir = os.path.join(input_directory, base_id)
singing_parts = extract_singing_parts(wav_file_path)
The output looks something like this.
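Concretely, it is just a list of (start, end) pairs in seconds, something like the following (illustrative values, borrowed from the base singer's intervals in the annotation listing further down):

Singing parts:  [(10.26, 32.94), (34.0, 47.36), (49.68, 77.22), ...]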
2. Split each of the extracted singing segments further into one or two equal parts.
num_interval = random.randint(1, 2)
selected_intervals = select_intervals_in_range(singing_parts, start_time, end_time, num_interval)
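For example (purely illustrative numbers), a 10-second segment split with num_interval = 2 comes back as two 5-second halves:

select_intervals_in_range([(10.0, 20.0)], 10.0, 20.0, 2)
# -> [(10.0, 15.0), (15.0, 20.0)]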
3. For each segment in selected_intervals, cut that time range out of the chosen singers' vocal files, splice it into the base track, and mix everything back together at the end.
chosen_files_and_srs = random.sample(signals_and_srs, num_files)
for (chosen_file, chosen_sr), file in chosen_files_and_srs:
    start_sample = librosa.time_to_samples(selected_start, sr=chosen_sr)
    end_sample = librosa.time_to_samples(selected_end, sr=chosen_sr)
    cutout = chosen_file[start_sample:end_sample]
    sf.write('temp.wav', cutout, chosen_sr)
    cutout_segment = AudioSegment.from_wav('temp.wav')
    output_start_ms = selected_start * 1000  # convert to milliseconds
    output_end_ms = selected_end * 1000
    output_length_ms = len(output)
    print("Output length: ", output_length_ms)
    pre_segment = output[:output_start_ms]
    post_segment = output[output_end_ms:]
    output = pre_segment + cutout_segment + post_segment
    file_path = os.path.join(input_directory, file)
    singer_id = get_singer_id(file_path)
    singer_ids.append(singer_id)
taken_intervals.append((selected_start, selected_end))
interval_and_ids.append((selected_start, selected_end, singer_ids))
counter += num_interval

output.export(output_wav, format='wav')
print("Taken intervals: ", taken_intervals)
print("Interval and ids: ", interval_and_ids)
overlay_audios(base_dir, output_wav)
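As an aside, the temp.wav round trip above exists only to turn the librosa array into a pydub AudioSegment. If you want to avoid hitting the disk, converting the float samples to 16-bit PCM and building the AudioSegment directly should also work; a rough sketch (array_to_segment is a hypothetical helper, not part of the script, and it assumes librosa.load returned mono float32 audio):

import numpy as np
from pydub import AudioSegment

def array_to_segment(y, sr):
    # Convert mono float32 samples in [-1, 1] to 16-bit PCM and wrap them in an AudioSegment
    pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)
    return AudioSegment(pcm.tobytes(), frame_rate=sr, sample_width=2, channels=1)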
4. Record each interval together with the IDs of the singers singing in it, and write them out to a text file.
for interval in singing_parts:
    start_time, end_time = interval
    if not any([(taken_start <= start_time and taken_end >= end_time) for taken_start, taken_end in taken_intervals]):
        interval_and_ids.append((start_time, end_time, [base_id]))

interval_and_ids.sort(key=lambda x: x[0])
with open(output_text, 'w') as f:
    for interval in interval_and_ids:
        start_time, end_time, singer_ids = interval
        singer_ids_str = ','.join(str(singer_id) for singer_id in singer_ids)
        if (start_time, end_time) not in taken_intervals:
            singer_ids_str = str(base_id)  # untouched intervals keep the base singer's ID
        line = f'{start_time:.3f} {end_time:.3f} {singer_ids_str}\n'
        f.write(line)
With those steps I put together a dataset for diarization. The text file ends up looking like the listing below, and when I listen to the audio it matches, so it probably works.
10.260 32.940 1
34.000 47.360 1
49.680 77.220 1
77.800 82.620 5
83.820 86.040 1,6
83.820 88.260 1
86.040 88.260 1,6
99.480 136.300 1
138.400 152.410 8,5,6
138.400 166.420 1
152.410 166.420 8,5,6
167.420 185.280 1
212.500 226.380 5
212.500 240.260 1
226.380 240.260 5
240.680 245.240 1
246.720 255.140 5,8
255.840 260.600 1
262.360 269.840 1
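Reading this file back in later (for training or evaluation) is simple enough; a minimal sketch (read_annotation is an illustrative helper, not part of the script below):

def read_annotation(path):
    # Each line is "start end id1,id2,..." with times in seconds
    entries = []
    with open(path) as f:
        for line in f:
            start, end, ids = line.split()
            entries.append((float(start), float(end), [int(i) for i in ids.split(',')]))
    return entries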
This time I happened to have recordings of the same song sung solo by different singers, so I used them directly, but nowadays generating the extra voices with something like RVC would also be an option.
Finally, the full code.
import random
import numpy as np
import librosa
from pydub import AudioSegment
import os
from inaSpeechSegmenter import Segmenter
import soundfile as sf
def select_intervals_in_range(intervals, start_time, end_time, num_intervals):
    valid_intervals = [(start, end) for start, end in intervals
                       if start >= start_time and end <= end_time]
    print("Valid intervals: ", valid_intervals)
    selected_intervals = []
    if len(valid_intervals) > 0:
        for start, end in valid_intervals:
            interval_duration = end - start
            split_duration = interval_duration / num_intervals
            split_starts = [start + i * split_duration for i in range(num_intervals)]
            split_intervals = [(split_start, split_start + split_duration) for split_start in split_starts]
            selected_intervals.extend(split_intervals)
    return selected_intervals
def get_singer_id(directory_path):
    directory_name = os.path.basename(directory_path)
    return int(directory_name)

def is_non_overlapping(interval, taken_intervals):
    start, end = interval
    for taken_start, taken_end in taken_intervals:
        if start < taken_end and end > taken_start:
            return False
    return True
def extract_singing_parts(wav_file_path):
    """Return the non-noEnergy (i.e. sung) intervals as a list of (start, end) pairs."""
    seg_model = Segmenter(vad_engine='smn', detect_gender=False)
    seg_data = seg_model(wav_file_path)
    # Collect the noEnergy intervals
    noEnergy_intervals = [(start, end) for label, start, end in seg_data if label == 'noEnergy']
    # Everything between two consecutive noEnergy intervals is treated as singing
    singing_parts = []
    for i in range(len(noEnergy_intervals) - 1):
        singing_parts.append((noEnergy_intervals[i][1], noEnergy_intervals[i + 1][0]))
    print("Singing parts: ", singing_parts)
    return singing_parts
def overlay_audios(base_dir, output_wav):
    # Lay the base singer's accompaniment (drums, piano, other) back over the generated vocal track
    combined = AudioSegment.from_wav(output_wav)
    for file in os.listdir(base_dir):
        if file == 'vocals.wav':
            continue
        audio = AudioSegment.from_wav(os.path.join(base_dir, file))
        combined = combined.overlay(audio)
    output_directory = os.path.dirname(output_wav)
    combined.export(os.path.join(output_directory, 'preseparated.wav'), format='wav')
def generate_audio_text(input_directory, output_wav, output_text):
    os.makedirs(os.path.dirname(output_wav), exist_ok=True)
    os.makedirs(os.path.dirname(output_text), exist_ok=True)
    max_parts = 10
    folder_names = os.listdir(input_directory)
    # (y, sr) from librosa.load, paired with the folder name it came from
    signals_and_srs = [(librosa.load(os.path.join(input_directory, file, "vocals.wav")), file) for file in folder_names]
    # Pick one singer as the base track; its vocals.wav defines the singing intervals
    base_id = random.choice(folder_names)
    wav_file_path = os.path.join(input_directory, base_id, 'vocals.wav')
    base_dir = os.path.join(input_directory, base_id)
    singing_parts = extract_singing_parts(wav_file_path)
    first_file, first_sr = signals_and_srs[0][0]
    print('first_sr_type:', type(first_sr), 'first_sr:', first_sr)
    output = AudioSegment.from_wav(os.path.join(input_directory, base_id, "vocals.wav"))
    counter = 0
    singing_parts_copy = singing_parts.copy()
    taken_intervals = []
    interval_and_ids = []
    num_files = 1
    while counter < max_parts:
        start_time, end_time = random.choice(singing_parts_copy)
        num_interval = random.randint(1, 2)
        selected_intervals = select_intervals_in_range(singing_parts, start_time, end_time, num_interval)
        num_files = (num_files % 3) + 1  # cycle through 1, 2 and 3 chosen vocal files
        chosen_files_and_srs = random.sample(signals_and_srs, num_files)
        for selected_interval in selected_intervals:
            singer_ids = []
            selected_start, selected_end = selected_interval
            selected_start, selected_end = float(selected_start), float(selected_end)
            if not is_non_overlapping((selected_start, selected_end), taken_intervals):
                continue
            for (chosen_file, chosen_sr), file in chosen_files_and_srs:
                start_sample = librosa.time_to_samples(selected_start, sr=chosen_sr)
                end_sample = librosa.time_to_samples(selected_end, sr=chosen_sr)
                cutout = chosen_file[start_sample:end_sample]
                sf.write('temp.wav', cutout, chosen_sr)
                cutout_segment = AudioSegment.from_wav('temp.wav')
                output_start_ms = selected_start * 1000  # convert to milliseconds
                output_end_ms = selected_end * 1000
                output_length_ms = len(output)
                print("Output length: ", output_length_ms)
                pre_segment = output[:output_start_ms]
                post_segment = output[output_end_ms:]
                output = pre_segment + cutout_segment + post_segment
                file_path = os.path.join(input_directory, file)
                singer_id = get_singer_id(file_path)
                singer_ids.append(singer_id)
            taken_intervals.append((selected_start, selected_end))
            interval_and_ids.append((selected_start, selected_end, singer_ids))
        counter += num_interval
    output.export(output_wav, format='wav')
    print("Taken intervals: ", taken_intervals)
    print("Interval and ids: ", interval_and_ids)
    overlay_audios(base_dir, output_wav)
    # Intervals we never touched keep the base singer
    for interval in singing_parts:
        start_time, end_time = interval
        if not any([(taken_start <= start_time and taken_end >= end_time) for taken_start, taken_end in taken_intervals]):
            interval_and_ids.append((start_time, end_time, [base_id]))
    interval_and_ids.sort(key=lambda x: x[0])
    with open(output_text, 'w') as f:
        for interval in interval_and_ids:
            start_time, end_time, singer_ids = interval
            singer_ids_str = ','.join(str(singer_id) for singer_id in singer_ids)
            if (start_time, end_time) not in taken_intervals:
                singer_ids_str = str(base_id)  # untouched intervals keep the base singer's ID
            line = f'{start_time:.3f} {end_time:.3f} {singer_ids_str}\n'
            f.write(line)
audio_folders = '<path to your dataset folder>'  # set this to the directory that holds one sub-folder per song
for audio_folder_name in os.listdir(audio_folders):
    audio_folder = os.path.join(audio_folders, audio_folder_name, 'separated')
    output_wav = os.path.join(audio_folders, audio_folder_name, 'output', 'separated.wav')
    output_text = os.path.join(audio_folders, audio_folder_name, 'output', 'annotation.txt')
    generate_audio_text(audio_folder, output_wav, output_text)
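For reference, going by the paths in this driver loop, each song folder ends up with the numbered singer folders under separated/ and the generated files under output/, roughly like this:
- song_name/
|- separated/
| |- 1/  2/  ...  9/
|- output/
| |- separated.wav     (the generated vocal track)
| |- preseparated.wav  (the vocal track with the base singer's accompaniment overlaid)
| |- annotation.txt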