
Modified 「Python で学ぶ音声合成」 so that mel-spectrogram synthesis is done with a Transformer.

Posted at 2023-07-24

Introduction

As reported in an earlier first article, I modified the program in 「Python で学ぶ音声合成」 (the textbook) that synthesizes mel-spectrograms with Tacotron2, replacing its LSTM with a Transformer. This is the follow-up report. It covers: an outline of the training, the synthesized audio, the synthesized mel-spectrograms, the workaround needed because the stop token could not detect the end of the synthesized audio, and the program itself.

Training overview

Training followed the Tacotron2 training procedure in 「Python で学ぶ音声合成」. The training data comes from the 5,000 utterances of JSUT ver 1.1: 4,700 for train, 200 for development, and 100 for test. Training ran for 2,900 epochs with batch_size = 32; steps_per_epoch is 147 for train and 7 for development. The graph of loss versus steps for train is shown below.

ch96_loss_2900_fig.png

The loss-versus-steps graph for development is also shown below.

ch96_dev_loss_2900_fig.png
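
For reference (a quick check of my own, not from the original article), the steps_per_epoch values above follow directly from the split sizes and the batch size:

import math

batch_size = 32
print(math.ceil(4700 / batch_size))  # 147 train steps per epoch
print(math.ceil(200 / batch_size))   # 7 development steps per epoch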

Synthesized audio

Synthesized (predicted) audio for the following two utterances is presented below.

BASIC5000_4999.wav (original audio)

BASIC5000_5000.wav (original audio)

Synthesized audio, 4999

Synthesized audio, 5000

I think the audio is synthesized with reasonable quality. For the vocoder, I used the WaveNet vocoder from 「Python で学ぶ音声合成」.

Synthesized mel-spectrograms

Mel-spectrograms synthesized for utterance 4999

ch96_fig_4999.png

The top panel is the mel-spectrogram extracted from the original audio. The middle panel is the mel-spectrogram predicted with teacher forcing, taking the prosody-annotated phonemes and the spectrogram shifted back by one frame as input. The bottom panel is the mel-spectrogram synthesized from the prosody-annotated phonemes alone. The first and second panels extend to about 2.5 seconds, while the third extends to 3.5 seconds; that is, the model keeps predicting extra audio beyond the end of the utterance.
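
For clarity, the "spectrogram shifted back by one frame" used as the decoder input for teacher forcing corresponds to the out_feats0 tensor built in the training loop further below; a minimal sketch:

import torch

# out_feats: (batch, frames, mel_dim) ground-truth mel-spectrogram
out_feats = torch.randn(2, 200, 80)

# Shift by one frame so the decoder sees frame t-1 when predicting frame t.
# Frame 0 of the shifted input is all zeros and acts as a "go" frame.
out_feats0 = torch.zeros_like(out_feats)
out_feats0[:, 1:, :] = out_feats[:, :-1, :]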

Mel-spectrograms synthesized for utterance 5000

ch96_fig_5000.png

The first and second panels extend to about 2.5 seconds, while the third extends to roughly 3.2 seconds. Here, too, extra audio is predicted.

Stop token for utterance 5000

ch96_fig2_5000-3.png

Looking at the sigmoid of the stop token for utterance 5000, it stays near 0 up to about frame 250. Since 1 frame = 0.0125 s, 250 frames corresponds to 3.125 seconds.
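
As a small illustration of this conversion (my own sketch; the 0.0125 s frame shift matches the hop_length used in the plotting code below):

sr = 16000
hop_length = int(sr * 0.0125)  # 200 samples per frame

def frames_to_seconds(n_frames, frame_shift=0.0125):
    # 1 frame = 12.5 ms, so 250 frames = 3.125 s
    return n_frames * frame_shift

print(frames_to_seconds(250))  # 3.125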

Stopping before the extra audio

To stop before the extra audio, the mel-spectrogram out_feats_fine is summed over dim = 1,

out_feats_fine5 = torch.sum( out_feats_fine, dim = 1 )

and the result is plotted.

Graph for utterance 4999

ch96_fig_4999-2.png

Around time frame 180, i.e. roughly 2.25 seconds, the values stay below -50 for more than 10 frames.

Graph for utterance 5000

ch96_fig_5000-2.png

Around time frame 180, i.e. roughly 2.25 seconds, the values stay below -50 for more than 10 frames.

Based on this, I decided to stop synthesis once out_feats_fine5 stays below -50 for 10 or more consecutive frames.
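
As an illustrative sketch of this rule (the loop actually used appears later, in the section on computing the end of the synthesized speech), assuming out_feats_fine5 is a 1-D tensor of per-frame sums:

import torch

def find_end_frame(out_feats_fine5, threshold=-50.0, run_length=10):
    # Return the first frame index at which `run_length` consecutive values
    # fall below `threshold`; return None if no such run exists.
    run = 0
    for i, below in enumerate((out_feats_fine5 < threshold).tolist()):
        run = run + 1 if below else 0
        if run >= run_length:
            return i - run_length + 1  # start of the run
    return None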

Program used

The program is built as an extension of 「Python で学ぶ音声合成」 and lives inside the ttslearn-master folder installed with the textbook. Run the recipes/tacotron recipe, then copy that folder as recipes/transtron. Place the program shown below in the notebooks folder as ch96_learning_transtron.ipynb and run it.
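
A minimal sketch of that copy step, assuming it is run from the top of the ttslearn-master folder (using shutil here is my own illustration, not part of the original recipe):

import shutil
from pathlib import Path

src = Path("recipes/tacotron")
dst = Path("recipes/transtron")

# Copy the Tacotron recipe as a new "transtron" recipe (skip if it already exists)
if not dst.exists():
    shutil.copytree(src, dst)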

Contents of ch96_learning_transtron.ipynb

Importing libraries

import torch
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
import numpy as np
from typing import Iterable  # needed by the type annotations in Encoder and Decoder below
torch.set_default_tensor_type('torch.cuda.FloatTensor')

Display settings

import japanize_matplotlib

# Turn off warning display
import warnings
warnings.simplefilter('ignore')

# Change the default font size
plt.rcParams['font.size'] = 14

# Change the default figure size
plt.rcParams['figure.figsize'] = (6,6)

# Show grid lines by default
plt.rcParams['axes.grid'] = True

# Set the numpy print precision
np.set_printoptions(suppress=True, precision=5)

Changing the current directory

import os
from ttslearn.env import is_colab
from os.path import exists

# Move to the recipe directory
cwd = os.getcwd()
if cwd.endswith("notebooks"):
    os.chdir("../recipes/transtron/")
elif is_colab():
    os.chdir("recipes/transtron/")   

LayerNorm and Linear classes

class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


class Linear(nn.Linear):
    def forward(self, x):
        return F.linear(
            x,
            self.weight.to(x.dtype),
            None if self.bias is None else self.bias.to(x.dtype),
        )

MultiHeadAttention class

class MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x,
        xa = None,
        mask = None
    ):
        #print( " size of x:{}".format( x.size() ))
        q = self.query(x)

        if xa is None:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            k = self.key( xa )
            v = self.value( xa )

        wv, qk, w = self.qkv_attention(q, k, v, mask)
        return self.out(wv), qk, w

    def qkv_attention(
        self, q, k, v, mask = None
    ):
        n_batch, n_ctx, n_state = q.shape
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            qk = qk + mask[:n_ctx, :n_ctx]
        qk = qk.float()

        w = F.softmax(qk, dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach(), w

ResidualAttentionBlock class

class ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int, ffn_dim: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        self.cross_attn = MultiHeadAttention(n_state, n_head ) if cross_attention else None
        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None

        #n_mlp = n_state * 4
        n_mlp = ffn_dim
        self.mlp = nn.Sequential(nn.Linear(n_state, n_mlp), nn.ReLU(), nn.Linear(n_mlp, n_state))
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x,
        xa,
        mask = None
    ):
        #x = x + self.attn(self.attn_ln(x), self.attn_ln(x), self.attn_ln(x), attn_mask=mask)[0]
        #x = x + self.attn(self.attn_ln(x), mask=mask)[0]
        xx, _, attn1 = self.attn(self.attn_ln(x), mask=mask)
        x = x + xx
        if self.cross_attn:
            #x = x + self.cross_attn(self.cross_attn_ln(x), xa, mask = None)[0]
            xx, _, attn2 = self.cross_attn(self.cross_attn_ln(x), xa, mask = None)
            x = x + xx
        else:
            attn2 = []
        x = x + self.mlp(self.mlp_ln(x))
        return x, attn1, attn2

Encoder class

class Encoder(nn.Module):
    def __init__(
        self,
        num_vocab=51,
        embed_dim=512,
        conv_layers=3,
        conv_channels=512,
        conv_kernel_size=5,
        enc_hidden_dim = 512,
        num_enc_layers = 8,
        num_heads = 4,
        enc_dropout_rate = 0.1,
        conv_dropout_rate = 0.1,
        input_maxlen = 300,
        ffn_dim = 2048
    ):
        super(Encoder, self).__init__()
        # Character (phoneme) embedding
        self.embed = nn.Embedding(num_vocab, embed_dim, padding_idx=0)
        self.pos_emb = nn.Embedding(input_maxlen, embed_dim)
        # Stacked 1-D convolutions: model local temporal dependencies
        convs = nn.ModuleList()
        for layer in range(conv_layers):
            in_channels = embed_dim if layer == 0 else conv_channels
            out_channels = enc_hidden_dim if layer == conv_layers - 1 else conv_channels
            #print( " in_channels:{}".format( in_channels ))
            #print( " out_channels:{}".format( out_channels ))
            convs += [
                nn.Conv1d(
                    in_channels,
                    out_channels,
                    conv_kernel_size,
                    padding=(conv_kernel_size - 1) // 2,
                    bias=False,  # this bias is unnecessary
                ),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(),
                nn.Dropout(conv_dropout_rate),
            ]
        self.convs = nn.Sequential(*convs)
        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(embed_dim, num_heads, ffn_dim,  cross_attention = False) for _ in range(num_enc_layers)]
        )
        self.input_maxlen = input_maxlen

        self.dropout = nn.Dropout(p=enc_dropout_rate)
        self.num_enc_layers = num_enc_layers
        
    def forward(self, x, in_lens ):
        emb = self.embed(x)
        # Note: the 1-D convolution and the embedding expect inputs of different shapes
        #print( "size of emb:{}".format( emb.size() ))
        out = self.convs(emb.transpose(1, 2)).transpose(1, 2)
        #print( "encoder out:{}".format( out ))
        maxlen = out.size()[1]
        #print( "size of out:{}".format( out.size()))
        positions = torch.arange(0., self.input_maxlen).to(torch.long)  # torch.range is deprecated; arange is equivalent here
        positions = self.pos_emb(positions)[:maxlen,:]
        #print( "size of positions:{}".format( positions.size()))
        x = out + positions
        #print( "0 encoder x:{}".format( x ))
        x = self.dropout( x )
        #print( "1 encoder x:{}".format( x ))
        #for i in range(self.num_enc_layers):
        #    x = self.enc_layers[i](x )
        #print( "2 x:{}".format( x ))
        for block in self.blocks:
            x, attn1, attn2 = block(x, x, mask = None)
        
        return x  # (batch_size, input_seq_len, d_model)

Prenet class

class Prenet(nn.Module):
    def __init__(self, in_dim, dec_hidden_dim, layers=8, hidden_dim=512, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        prenet = nn.ModuleList()
        for layer in range(layers):
            in_dims = in_dim if layer == 0 else hidden_dim
            out_dims = dec_hidden_dim if layer == layers - 1 else hidden_dim
            prenet += [
                nn.Linear(in_dims, out_dims ),
                nn.ReLU(),
                nn.Dropout(dropout) # added by Toshio Uchiyama
            ]
        self.prenet = nn.Sequential(*prenet)        
        
    def forward(self, x):
        for layer in self.prenet:
            # In the original, dropout is applied at both training and inference time
            #x = F.dropout(layer(x), self.dropout, training=True)
            x = layer(x) # original is above
        return x

Decoder class

class Decoder(nn.Module):
    def __init__(
        self,
        decoder_hidden_dim=512,
        out_dim=80,
        layers=8,
        prenet_layers=2,
        prenet_hidden_dim=512,
        prenet_dropout=0.5,
        ffn_dim=2048,
        dropout_rate = 0.1,
        dec_input_maxlen=3000,
        num_heads = 4
    ):
        super().__init__()
        self.out_dim = out_dim
        self.num_heads = num_heads

        # Prenet
        self.prenet = Prenet(out_dim, decoder_hidden_dim,  prenet_layers, prenet_hidden_dim, prenet_dropout)
        #self.prenet = nn.Linear( out_dim, prenet_hidden_dim )
        #self.prenet.eval()

        #  DecoderLayer
        #self.dec_layers = [DecoderLayer(decoder_hidden_dim, num_heads, ffn_dim, dropout_rate) 
        #               for _ in range(layers)]
        
        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(decoder_hidden_dim, num_heads, ffn_dim, cross_attention=True) for _ in range(layers)]
        )
        #self.blocks.eval()
        
        #self.dropout = nn.Dropout(dropout_rate)
        self.pos_emb = nn.Embedding(dec_input_maxlen, decoder_hidden_dim)
        #self.pos_emb.eval()
        
 
        # Projection layers to the outputs
        proj_in_dim = decoder_hidden_dim
        #print( "proj_in_dim:{}".format( proj_in_dim ))
        #print( "out_dim:{}".format( out_dim ))
        self.feat_out = nn.Linear(proj_in_dim, out_dim, bias=False)
        #self.feat_out.eval()
        self.prob_out = nn.Linear(proj_in_dim, 1)
        #self.prob_out.eval()
        
        self.dec_input_maxlen = dec_input_maxlen
        self.layers = layers
        hidden_dim = decoder_hidden_dim


    def forward(self, encoder_outs, in_lens, decoder_targets=None):

        # Pre-Net
        #prenet_out = self.prenet(prev_out)
        #print( " size of decoder_targets:{}".format( decoder_targets.size()))
        #print( "encoder_outs:{}".format(encoder_outs) )
        #print( "decoder_targets:{}".format( decoder_targets))
        prenet_out = self.prenet(decoder_targets)
        #print( "prenet_out:{}".format( prenet_out))
        maxlen = prenet_out.size()[1]
        #print( "size of prenet_out:{}".format( prenet_out.size()))
        positions = torch.arange(0., self.dec_input_maxlen).to(torch.long)  # torch.range is deprecated; arange is equivalent here
        positions = self.pos_emb(positions)[:maxlen,:]
        #print( "positions:{}".format( positions))
        #print( "size of positions:{}".format( positions.size()))
        x = prenet_out + positions
        #print( "0 x:{}".format( x ))
        
        attention_weights = {}
        
        for i, block in enumerate( self.blocks ):
            T = x.size()[1]
            #T = 1
            look_ahead_mask = torch.empty(T, T).fill_(-np.inf).triu_(1)
            x, attn1, attn2 = block(x, encoder_outs, mask=look_ahead_mask)
            attention_weights['{}'.format(2*i)] = attn1
            attention_weights['{}'.format(2*i + 1)] = attn2     
            
        #print( "size of x:{}".format( x.size()))
        outs = self.feat_out(x)
        #print( "outs:{}".format(outs))
        outs = torch.permute(outs, (0, 2, 1))
        logits = torch.squeeze( self.prob_out(x), axis=2 )            
        
        return outs, logits, attention_weights


Postnet class

class Postnet(nn.Module):
    def __init__(
        self,
        in_dim=80,
        layers=5,
        channels=512,
        kernel_size=5,
        dropout=0.5,
    ):
        super().__init__()
        postnet = nn.ModuleList()
        for layer in range(layers):
            in_channels = in_dim if layer == 0 else channels
            out_channels = in_dim if layer == layers - 1 else channels
            postnet += [
                nn.Conv1d(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride=1,
                    padding=(kernel_size - 1) // 2,
                    bias=False,
                ),
                nn.BatchNorm1d(out_channels),
            ]
            if layer != layers - 1:
                postnet += [nn.Tanh()]
            postnet += [nn.Dropout(dropout)]
        self.postnet = nn.Sequential(*postnet)

    def forward(self, xs):
        return self.postnet(xs)

Transtron class

class Transtron(nn.Module):
    def __init__(self,
            num_vocab=52,
            embed_dim=512,
            conv_layers=3,
            conv_channels=512,
            conv_kernel_size=5,
            enc_hidden_dim = 512,
            num_enc_layers = 8,
            enc_num_heads = 4,
            enc_dropout_rate = 0.1,
            conv_dropout_rate = 0.1,
            enc_input_maxlen = 300,
            enc_ffn_dim = 2048,              
            decoder_hidden_dim=512,
            out_dim=80,
            num_dec_layers=8,
            prenet_layers=2,
            prenet_hidden_dim=512,
            prenet_dropout=0.5,
            dec_ffn_dim=2048,
            dec_dropout_rate = 0.1,
            dec_input_maxlen=3000,
            dec_num_heads = 4,                
            postnet_in_dim=80,
            postnet_layers=5,
            postnet_channels=512,
            postnet_kernel_size=5,
            postnet_dropout=0.5
        ):
        super().__init__()
        self.encoder = Encoder(
            num_vocab,
            embed_dim,
            conv_layers,
            conv_channels,
            conv_kernel_size,
            enc_hidden_dim,
            num_enc_layers,
            enc_num_heads,
            enc_dropout_rate,
            conv_dropout_rate,
            enc_input_maxlen,
            enc_ffn_dim 
        )
        self.decoder = Decoder(
            decoder_hidden_dim,
            out_dim,
            num_dec_layers,
            prenet_layers,
            prenet_hidden_dim,
            prenet_dropout,
            dec_ffn_dim,
            dec_dropout_rate,
            dec_input_maxlen,
            dec_num_heads       
        )
        self.postnet = Postnet(
            postnet_in_dim,
            postnet_layers,
            postnet_channels,
            postnet_kernel_size,
            postnet_dropout
        )

    def forward(self, seq, in_lens, decoder_targets):
        # Encoder: obtain latent representations of the text
        encoder_outs = self.encoder(seq, in_lens)

        # Decoder: predict the mel-spectrogram and stop token
        outs, logits, att_ws = self.decoder(encoder_outs, in_lens, decoder_targets)

        # Post-Net: predict the residual of the mel-spectrogram
        outs_fine = outs + self.postnet(outs)

        # (B, C, T) -> (B, T, C)
        outs = outs.transpose(2, 1)
        outs_fine = outs_fine.transpose(2, 1)

        return outs, outs_fine, logits, att_ws
    '''
    def inference(self, seq):
        seq = seq.unsqueeze(0) if len(seq.shape) == 1 else seq
        in_lens = torch.tensor([seq.shape[-1]], dtype=torch.long, device=seq.device)

        return self.forward(seq, in_lens, None)
    
    def inference(self, in_feats ):
        """Performs inference over one batch of inputs using greedy decoding."""
        in_feats = torch.unsqueeze( in_feats, axis = 0 )
        bs = in_feats.size()[0]
        in_lens = []
        for feats in ( in_feats):
            in_lens.append( len( feats ))
        # エンコーダによるテキストに潜在する表現の獲得
        encoder_outs = model.encoder(in_feats, in_lens)
        decoder_targets_maxlen = in_lens[0] * 10
        #dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        decoder_targets = encoder_outs.new_zeros((encoder_outs.size()[0], 1, 80))
        #decoder_targets = None
        #dec_logits = []
        for i in range(decoder_targets_maxlen ):
            print( "i:{}".format( i ))
            # デコーダによるメルスペクトログラム、stop token の予測
            outs, logits, att_ws = model.decoder(encoder_outs, in_lens, decoder_targets)
            print( "torch.sigmoid(logits[0, -1]):{}".format(torch.sigmoid(logits[0, -1])))
            if i > 40 and torch.sigmoid(logits[0, -1]) >= 0.5:
                break
            #print( "0 size of outs:{}".format( outs.size() ))
            outs = torch.permute(outs, (0, 2, 1))
            outs2 = torch.unsqueeze( outs[:,-1,:], axis = 1 )
            #print( "size of outs2:{}".format( outs2.size()))
            #print( "1 size of outs:{}".format( outs.size() ))
            #print( "1 size of decoder_targets:{}".format( decoder_targets.size()))
            decoder_targets = torch.cat( (decoder_targets, outs2), axis = 1 )
            #print( "2 size of decoder_targets:{}".format( decoder_targets.size()))
            #logits = self.classifier(dec_out)
            #logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            #last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            #decoder_targets = torch.concat([decoder_targets, outs], axis=-1)
        # Post-Net によるメルスペクトログラムの残差の予測
        outs = torch.permute(outs, (0, 2, 1))
        outs_fine = outs + model.postnet(outs)

        # (B, C, T) -> (B, T, C)
        outs = outs.transpose(2, 1)
        outs_fine = outs_fine.transpose(2, 1)
    
        #print( "size of outs_fine:{}".format( outs_fine.size() ))
    
        return outs[0], outs_fine[0], logits[0], att_ws  
    '''

Model definition

model = Transtron(
    num_vocab=52,
    embed_dim=512,
    conv_layers=3,
    conv_channels=512,
    conv_kernel_size=5,
    enc_hidden_dim = 512,
    num_enc_layers = 8,
    enc_num_heads = 4,
    enc_dropout_rate = 0.1,
    conv_dropout_rate = 0.1,
    enc_input_maxlen = 300,
    enc_ffn_dim = 2048,          
    decoder_hidden_dim=512,
    out_dim=80,
    num_dec_layers=8,
    prenet_layers=2,
    prenet_hidden_dim=512,
    prenet_dropout=0.5,
    dec_ffn_dim=2048,
    dec_dropout_rate = 0.1,
    dec_input_maxlen=3000,
    dec_num_heads = 4,                
    postnet_in_dim=80,
    postnet_layers=5,
    postnet_channels=512,
    postnet_kernel_size=5,
    postnet_dropout=0.5
)

Functions needed for training

# Helper function needed for training
def ensure_divisible_by(feats, N):
    if N == 1:
        return feats
    mod = len(feats) % N
    if mod != 0:
        feats = feats[: len(feats) - mod]
    return feats
# Helper function needed for training
from ttslearn.util import pad_1d, pad_2d

def collate_fn_transtron(batch):
    xs = [x[0] for x in batch]
    ys = [ensure_divisible_by(x[1], 1) for x in batch]
    in_lens = [len(x) for x in xs]
    out_lens = [len(y) for y in ys]
    in_max_len = max(in_lens)
    out_max_len = max(out_lens)
    x_batch = torch.stack([torch.from_numpy(pad_1d(x, in_max_len)) for x in xs])
    y_batch = torch.stack([torch.from_numpy(pad_2d(y, out_max_len)) for y in ys])
    in_lens = torch.tensor(in_lens, dtype=torch.long)
    out_lens = torch.tensor(out_lens, dtype=torch.long)
    stop_flags = torch.zeros(y_batch.shape[0], y_batch.shape[1])
    for idx, out_len in enumerate(out_lens):
        stop_flags[idx, out_len - 1 :] = 1.0
    return x_batch, in_lens, y_batch, out_lens, stop_flags
# Mini-batch data needed for training
from pathlib import Path
from ttslearn.train_util import Dataset
from functools import partial

in_paths_dev = sorted(Path("./dump/jsut_sr16000/norm/dev/in_tacotron/").glob("*.npy"))
in_paths = sorted(Path("./dump/jsut_sr16000/norm/train/in_tacotron/").glob("*.npy"))
#in_paths = sorted(Path("./dump/jsut_sr16000/norm/dev/in_tacotron/").glob("*.npy"))
#print( "in_paths:{}".format( in_paths ))
out_paths_dev = sorted(Path("./dump/jsut_sr16000/norm/dev/out_tacotron/").glob("*.npy"))
out_paths = sorted(Path("./dump/jsut_sr16000/norm/train/out_tacotron/").glob("*.npy"))
#out_paths = sorted(Path("./dump/jsut_sr16000/norm/dev/out_tacotron/").glob("*.npy"))


dataset = Dataset(in_paths, out_paths)
dataset_dev = Dataset(in_paths_dev, out_paths_dev)
#print( " len of dataset:{}".format( len( dataset )))
collate_fn = partial(collate_fn_transtron)
#data_loader = torch.utils.data.DataLoader(dataset, batch_size=8, collate_fn=collate_fn, num_workers=0)
#data_loader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=8, collate_fn=collate_fn, num_workers=0)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, collate_fn=collate_fn, num_workers=0)
data_loader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=32, collate_fn=collate_fn, num_workers=0)


in_feats, in_lens, out_feats, out_lens, stop_flags = next(iter(data_loader))
print("入力特徴量のサイズ:", tuple(in_feats.shape))
print("出力特徴量のサイズ:", tuple(out_feats.shape))
print("stop flags のサイズ:", tuple(stop_flags.shape))
#学習前にミニバチデータの可視化(教師データ,out_feats)

import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from ttslearn.notebook import get_cmap, init_plot_style, savefig
cmap = get_cmap()
init_plot_style()

sr = 16000

fig, ax = plt.subplots(len(out_feats), 1, figsize=(8,40), sharex=True, sharey=True)
for n in range(len(in_feats)):
    x = out_feats[n].data.numpy()
    hop_length = int(sr * 0.0125)
    mesh = librosa.display.specshow(x.T, sr=sr, x_axis="time", y_axis="frames", hop_length=hop_length, cmap=cmap, ax=ax[n])
    fig.colorbar(mesh, ax=ax[n])
    mesh.set_clim(-4, 4)
    # Remove the label here; it is re-added later
    ax[n].set_xlabel("")
    
ax[-1].set_xlabel("Time [sec]")
for a in ax:
    a.set_ylabel("Mel channel")

plt.tight_layout()
savefig("fig/e2etts_impl_minibatch")
#device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = model.to( device )
# Preparation for training

from torch import optim

# lr is the learning rate
optimizer = optim.Adam(model.parameters(), lr=0.0001)
#optimizer = optim.Adam(model.parameters(), lr=0.0001, eps=1e-9, amsgrad=True)
#optimizer = optim.Adam(model.parameters(), lr=0.001, eps=1e-9, amsgrad=True)
#optimizer = optim.RMSprop(model.parameters(), lr=0.0001, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)


# gamma is the learning-rate decay factor
#lr_scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=500000)

import gc 

gc.collect()

for param in model.parameters():
    param.requires_grad = True

Training

# Training

from ttslearn.util import make_non_pad_mask
from ttslearn.tacotron import Tacotron2TTS
from tqdm.notebook import tqdm
from IPython.display import Audio
import pandas as pd

history = np.zeros((0, 7))
history_dev = np.zeros((0, 6))

num_epochs = 10000
#num_epochs = 30
it_train = 0
it_dev = 0
for epoch in range( num_epochs ):
    
    model.train()
    total_decoder_out_loss = 0
    total_postnet_out_loss = 0
    total_stop_token_loss = 0
    total_loss = 0
    count = 0
    # Create mini-batches with the DataLoader and process one mini-batch at a time
    phar = tqdm( range( len(data_loader) ), desc='train' )
    Iter_train = iter(data_loader)
    for i in phar:
    #for in_feats, in_lens, out_feats, out_lens, stop_flags in tqdm(data_loader):
        in_feats, in_lens, out_feats, out_lens, stop_flags = next(Iter_train)
        in_feats = in_feats.to(device)
        in_lens = in_lens.to(device)
        out_feats = out_feats.to(device)
        out_lens = out_lens.to(device)
        stop_flags = stop_flags.to(device)
        in_lens, indices = torch.sort(in_lens, dim=0, descending=True)
        in_feats, out_feats, out_lens = in_feats[indices], out_feats[indices], out_lens[indices]
        out_feats0 = torch.zeros_like( out_feats )
        out_feats0[:,1:,:] = out_feats[:,:-1,:]
    
        #count += len( in_feats )
        count += 1
    
        # Forward pass
        #print( "size of in_feats:{}".format( in_feats.size()))
        #print( "size of in_lens:{}".format( in_lens.size()))
        #print( "in_lens:{}".format( in_lens ))
        #print( "size of out_feats:{}".format( out_feats.size ))
        #out_feats2 = out_feats[:,:-1,:]
    
        #outs, outs_fine, logits, _ = model(in_feats, in_lens, out_feats)
        outs, outs_fine, logits, _ = model(in_feats, in_lens, out_feats0)
        #print( "size of out_feats:{}".format( out_feats.size()))
        #out_feats2 = torch.zeros_like( out_feats )
        #out_feats2[:,:-1,:] = out_feats[:,1:,:]
        #print( "size of out_feats2:{}".format( out_feats2.size()))
        #stop_flags2 = torch.ones_like( stop_flags )
        #stop_flags2[:,:-1] = stop_flags[:,1:] 
        
        # Apply a mask to exclude the zero-padded regions from the loss computation
        # Mask (B x T x 1)
        mask = make_non_pad_mask(out_lens).unsqueeze(-1)
        #print( out_feats2.size())
        out_feats = out_feats.masked_select(mask)
        #out_feats2 = out_feats2.masked_select(mask)
        outs = outs.masked_select(mask)
        outs_fine = outs_fine.masked_select(mask)
        #print( "size of stop_flags:{}".format( stop_flags.size()))
        #print( "stop_flags[0][-1]:{}".format( stop_flags[0][-1]))
        stop_flags = stop_flags.masked_select(mask.squeeze(-1))
        #stop_flags2 = stop_flags2.masked_select(mask.squeeze(-1))
        logits = logits.masked_select(mask.squeeze(-1))
        #print( out_feats.size())
        
        # Compute the losses
        #decoder_out_loss = nn.MSELoss(reduction='mean')(outs, out_feats2)
        #decoder_out_loss = nn.MSELoss()(outs, out_feats2)
        #postnet_out_loss = nn.MSELoss(reduction='mean')(outs_fine, out_feats2)
        #postnet_out_loss = nn.MSELoss()(outs_fine, out_feats2)
        decoder_out_loss = nn.MSELoss()(outs, out_feats)
        postnet_out_loss = nn.MSELoss()(outs_fine, out_feats) 
        #print( "logits", logits )
        #print( "stop_flags", stop_flags)
        stop_token_loss = nn.BCEWithLogitsLoss()(logits, stop_flags)
        #stop_token_loss = nn.BCEWithLogitsLoss(reduction='mean')(logits, stop_flags2)
        #stop_token_loss = nn.BCEWithLogitsLoss()(logits, stop_flags2)
        
        # Total loss
        loss = decoder_out_loss + postnet_out_loss + stop_token_loss
        
        total_loss += loss.item()
        #print( "loss:{}".format(total_loss))
        total_decoder_out_loss += decoder_out_loss.item()
        #print( "decoder_out_loss:{}".format(total_decoder_out_loss))
        total_postnet_out_loss += postnet_out_loss.item()
        #print( "postnet_out_loss:{}".format(total_postnet_out_loss))
        total_stop_token_loss += stop_token_loss.item()
        #print( "stop_token_loss:{}".format(total_stop_token_loss))

        
        # Count training iterations
        it_train += 1
        # Reset the gradients accumulated in the optimizer
        optimizer.zero_grad()
        # Backpropagation
        loss.backward()
        # gradient clipping
        #a = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2)
        #a = nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
        #print( "a:{}".format(a))
        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5, norm_type=2)
        #nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
        # Update the parameters
        optimizer.step()
        # Learning-rate scheduler update (the scheduler itself is disabled here)
        current_lr = optimizer.param_groups[0]["lr"]
        #lr_scheduler.step()
        
        avg_loss = total_loss / count
        
        # Show the running loss on the progress bar
        phar.set_postfix( loss = avg_loss )   
        
    avg_loss = total_loss / count
    avg_decoder_out_loss = total_decoder_out_loss / count
    avg_postnet_out_loss = total_postnet_out_loss / count
    avg_stop_token_loss = total_stop_token_loss / count    

    print(f"epoch: {epoch+1:3d}, train it: {it_train:6d}, decoder_out: {avg_decoder_out_loss :.5f}, postnet_out: {avg_postnet_out_loss :.5f}, stop_token: {avg_stop_token_loss :.5f}, loss: {avg_loss :.5f}")
    item = np.array([epoch+1, it_train, avg_decoder_out_loss , avg_postnet_out_loss , avg_stop_token_loss , avg_loss ,  current_lr ])
    history = np.vstack((history, item))
    
    model.eval()
    total_dev_decoder_out_loss = 0
    total_dev_postnet_out_loss = 0
    total_dev_stop_token_loss = 0
    total_dev_loss = 0
    count = 0
    # Create mini-batches with the DataLoader and process one mini-batch at a time
    phar = tqdm( range( len(data_loader_dev) ), desc='dev' )
    Iter_dev = iter(data_loader_dev)
    for i in phar:
    #for in_feats, in_lens, out_feats, out_lens, stop_flags in tqdm(data_loader_dev):
        in_feats, in_lens, out_feats, out_lens, stop_flags = next(Iter_dev)
        in_feats = in_feats.to(device)
        in_lens = in_lens.to(device)
        out_feats = out_feats.to(device)
        out_lens = out_lens.to(device)
        stop_flags = stop_flags.to(device)
        in_lens, indices = torch.sort(in_lens, dim=0, descending=True)
        in_feats, out_feats, out_lens = in_feats[indices], out_feats[indices], out_lens[indices]
        out_feats0 = torch.zeros_like( out_feats )
        out_feats0[:,1:,:] = out_feats[:,:-1,:]
        
        #count += len( in_feats )
        count += 1
   
        #outs, outs_fine, logits, _ = model(in_feats, in_lens, out_feats)
        outs, outs_fine, logits, _ = model(in_feats, in_lens, out_feats0)
        #out_feats2 = torch.zeros_like( out_feats )
        #out_feats2[:,:-1,:] = out_feats[:,1:,:]    
        #stop_flags2 = torch.ones_like( stop_flags )
        #stop_flags2[:,:-1] = stop_flags[:,1:] 
        
        # Apply a mask to exclude the zero-padded regions from the loss computation
        # Mask (B x T x 1)
        mask = make_non_pad_mask(out_lens).unsqueeze(-1)
        #out_feats2 = out_feats2.masked_select(mask)
        out_feats = out_feats.masked_select(mask)
        outs = outs.masked_select(mask)
        outs_fine = outs_fine.masked_select(mask)
        stop_flags = stop_flags.masked_select(mask.squeeze(-1))
        #stop_flags2 = stop_flags2.masked_select(mask.squeeze(-1))
        logits = logits.masked_select(mask.squeeze(-1))
        
        # Compute the losses
        #print( " size of outs:{}".format( outs.size()))
        #print( " size of out_feats2:{}".format( out_feats2.size()))
        #dev_decoder_out_loss = nn.MSELoss(reduction='mean')(outs, out_feats2)
        #dev_decoder_out_loss = nn.MSELoss()(outs, out_feats2)
        #dev_postnet_out_loss = nn.MSELoss(reduction='mean')(outs_fine, out_feats2)
        #dev_postnet_out_loss = nn.MSELoss()(outs_fine, out_feats2)
        dev_decoder_out_loss = nn.MSELoss()(outs, out_feats)
        dev_postnet_out_loss = nn.MSELoss()(outs_fine, out_feats) 
        dev_stop_token_loss = nn.BCEWithLogitsLoss()(logits, stop_flags)
        #dev_stop_token_loss = nn.BCEWithLogitsLoss(reduction='mean')(logits, stop_flags2)
        #dev_stop_token_loss = nn.BCEWithLogitsLoss()(logits, stop_flags2)
        
        # Total loss
        dev_loss = dev_decoder_out_loss + dev_postnet_out_loss + dev_stop_token_loss
        
        total_dev_loss += dev_loss.item()
        total_dev_decoder_out_loss += dev_decoder_out_loss.item()
        total_dev_postnet_out_loss += dev_postnet_out_loss.item()
        total_dev_stop_token_loss += dev_stop_token_loss.item()
        
        avg_dev_loss = total_dev_loss / count
        
        # Show the running dev loss on the progress bar
        phar.set_postfix( dev_loss = avg_dev_loss ) 

        # Count dev iterations
        it_dev += 1
        
    avg_dev_loss = total_dev_loss / count
    avg_dev_decoder_out_loss = total_dev_decoder_out_loss / count
    avg_dev_postnet_out_loss = total_dev_postnet_out_loss / count
    avg_dev_stop_token_loss = total_dev_stop_token_loss / count    

    print(f"epoch: {epoch+1:3d}, dev it: {it_dev:6d}, decoder_out: {avg_dev_decoder_out_loss:.5f}, postnet_out: {avg_dev_postnet_out_loss:.5f}, stop_token: {avg_dev_stop_token_loss:.5f}, loss: {avg_dev_loss:.5f}")
    item = np.array([epoch+1, it_dev, avg_dev_decoder_out_loss , avg_dev_postnet_out_loss , avg_dev_stop_token_loss , avg_dev_loss ])
    history_dev = np.vstack((history_dev, item))

    if ( epoch + 1 ) == num_epochs:
        epoch_str = "last_" + format(epoch+1,"04d")
    
        hist_df = pd.DataFrame(history)
        filename_his = "./ch96/history_ch96_" + epoch_str + ".csv"
        hist_df.to_csv(filename_his, header=False, index=False)
        hist_dev_df = pd.DataFrame(history_dev)
        filename_his_dev = "./ch96/history_dev_ch96_" + epoch_str + ".csv"
        hist_dev_df.to_csv(filename_his_dev, header=False, index=False)    
    
        save_path = "./ch96/transtron_weight_training_ch96_" + epoch_str + ".pt"
        torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
            'dev_loss': avg_dev_loss,},
           save_path)
    elif (epoch + 1) % 100 == 0:
        epoch_str = format(epoch+1,"04d")
    
        hist_df = pd.DataFrame(history)
        filename_his = "./ch96/history_ch96_" + epoch_str + ".csv"
        hist_df.to_csv(filename_his, header=False, index=False)
        hist_dev_df = pd.DataFrame(history_dev)
        filename_his_dev = "./ch96/history_dev_ch96_" + epoch_str + ".csv"
        hist_dev_df.to_csv(filename_his_dev, header=False, index=False)    
    
        save_path = "./ch96/transtron_weight_training_ch96_" + epoch_str + ".pt"
        torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
            'dev_loss': avg_dev_loss,},
           save_path)    
    

Program for evaluating the training

Loading the model

# Load the model

import torch
import pandas as pd

model = Transtron(
    num_vocab=52,
    embed_dim=512,
    conv_layers=3,
    conv_channels=512,
    conv_kernel_size=5,
    enc_hidden_dim = 512,
    num_enc_layers = 8,
    enc_num_heads = 4,
    enc_dropout_rate = 0.1,
    conv_dropout_rate = 0.1,
    enc_input_maxlen = 300,
    enc_ffn_dim = 2048,          
    decoder_hidden_dim=512,
    out_dim=80,
    num_dec_layers=8,
    prenet_layers=2,
    prenet_hidden_dim=512,
    prenet_dropout=0.5,
    dec_ffn_dim=2048,
    dec_dropout_rate = 0.1,
    dec_input_maxlen=3000,
    dec_num_heads = 4,                
    postnet_in_dim=80,
    postnet_layers=5,
    postnet_channels=512,
    postnet_kernel_size=5,
    postnet_dropout=0.5
)

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device =  torch.device("cpu")
optimizer = optim.Adam(model.parameters(), lr=0.0001)

model = model.to(device)

model.eval()

save_path2 = "./ch96/transtron_weight_training_ch96_0100.pt"
checkpoint = torch.load(save_path2, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#for state in optimizer.state.values():
#    for k, v in state.items():
#        if isinstance(v, torch.Tensor):
#            state[k] = v.to('cuda')

Loading the history

# Load the history

import numpy as np
history = np.loadtxt("./ch96/history_ch96_0100.csv", delimiter=",", dtype = "unicode")
history = [[ float(s) for s in s1 ] for s1 in history]
history = np.array( history )
print( history )
history_dev = np.loadtxt("./ch96/history_dev_ch96_0100.csv", delimiter=",", dtype = "unicode")
history_dev = [[ float(s) for s in s1 ] for s1 in history_dev]
history_dev = np.array( history_dev )
print( history_dev )

Visualizing the history

# Visualize the history

import pandas as pd

# Change the default font size
plt.rcParams['font.size'] = 10

# Analyze the training log
def evaluate_history(history, history_val):
    # Check the losses at the first and last epochs
    print(f'decoder_out_loss: {history[0,2]:.5f}, postnet_out_loss: {history[0,3]:.5f}, stop_token_loss: {history[0,4]:.5f}, loss: {history[0,5]:.5f}, lr: {history[0,6]:.5f}') 
    print(f'decoder_out_loss: {history[-1,2]:.5f}, postnet_out_loss: {history[-1,3]:.5f}, stop_token_loss: {history[-1,4]:.5f}, loss: {history[-1,5]:.5f}, lr: {history[-1,6]:.5f}' )

    it_train = history[-1,1]
    print( it_train )
    if it_train < 10:
      unit = 1
    else:
      unit = it_train // 10

    # Plot the learning curves (losses): train
    plt.figure(figsize=(9,8))
    plt.plot(history[:,1], history[:,2], 'y', label='decoder_out_loss')
    plt.plot(history[:,1], history[:,3], 'k', label='postnet_out_loss')
    plt.plot(history[:,1], history[:,4], 'r', label='stop_token_loss')
    plt.plot(history[:,1], history[:,5], 'b', label='loss')
    plt.xticks(np.arange(0,it_train+1, unit))
    plt.xlabel('繰り返し回数')
    plt.ylabel('損失')
    plt.title('学習曲線(損失)')
    plt.legend()
    plt.show()

    it_dev = history_dev[-1,1]
    if it_dev < 10:
      unit = 1
    else:
      unit = it_dev // 10    
    
    # Plot the learning curves (losses): dev
    plt.figure(figsize=(9,8))
    plt.plot(history_dev[:,1], history_val[:,2], 'y', label='decoder_out_loss')
    plt.plot(history_dev[:,1], history_val[:,3], 'k', label='postnet_out_loss')
    plt.plot(history_dev[:,1], history_val[:,4], 'r', label='stop_token_loss')
    plt.plot(history_dev[:,1], history_val[:,5], 'b', label='loss')
    plt.xticks(np.arange(0,it_dev+1, unit))
    plt.xlabel('繰り返し回数')
    plt.ylabel('損失')
    plt.title('学習曲線(損失)')
    plt.legend()
    plt.show()

    # lr
    it_train = history[-1,1]
    print( it_train )
    if it_train < 10:
      unit = 1
    else:
      unit = it_train // 10
    plt.figure(figsize=(9,8))
    plt.plot(history[:,1], history[:,6], 'b', label='lr')
    plt.xticks(np.arange(0,it_train+1,unit))
    plt.xlabel('繰り返し回数')
    plt.ylabel('lr')
    plt.title('lr')
    plt.legend()
    plt.show()

evaluate_history( history, history_dev)

The inference function

# Eventually this should become model.inference, but since I keep revising the inference function while inspecting the mel-spectrograms, this standalone function is used here.

model.eval()

def inference( in_feats ):
    
    """Performs inference over one batch of inputs using greedy decoding."""
    #print( in_feats )
    in_feats = torch.unsqueeze( in_feats, axis = 0 )
    bs = in_feats.size()[0]
    in_lens = []
    for feats in ( in_feats):
        in_lens.append( len( feats ))
    # Encoder: obtain latent representations of the text
    encoder_outs = model.encoder(in_feats, in_lens)
    decoder_targets_maxlen = in_lens[0] * 10
    decoder_targets = encoder_outs.new_zeros((encoder_outs.size()[0], 1, 80))
    for i in range(decoder_targets_maxlen ):
        print( "i:{}".format( i ))
        # Decoder: predict the mel-spectrogram and stop token
        outs, logits, att_ws = model.decoder(encoder_outs, in_lens, decoder_targets)
        if i > 40 and torch.sigmoid(logits[0, -1]) >= 0.5:
            break
        outs = torch.permute(outs, (0, 2, 1))
        outs2 = torch.unsqueeze( outs[:,-1,:], axis = 1 )
        decoder_targets = torch.cat( (decoder_targets, outs2), axis = 1 )

    # Post-Net: predict the residual of the mel-spectrogram
    outs = torch.permute(decoder_targets, (0, 2, 1))
    outs_fine = outs + model.postnet(outs)

    # (B, C, T) -> (B, T, C)
    outs = outs.transpose(2, 1)
    outs_fine = outs_fine.transpose(2, 1)
    

    return outs[0], outs_fine[0], logits[0], att_ws  

Running inference

# Run inference to compare the natural-speech mel-spectrogram with the one synthesized by transtron.

from ttslearn.util import find_lab, find_feats
from ttslearn.tacotron.frontend.openjtalk import text_to_sequence, pp_symbols
# Numerical computation
import numpy as np
import torch
from torch import nn
# Loading audio waveforms
from scipy.io import wavfile
# Loading full-context labels and question files
from nnmnkwii.io import hts
# Speech analysis
import pyworld
# Speech analysis and visualization
import librosa
import librosa.display
import pandas as pd
# Python で学ぶ音声合成 (ttslearn)
import ttslearn
from tqdm.notebook import tqdm

train_utt = "BASIC5000_0001"
test_utt = "BASIC5000_5000"
#test_utt = "BASIC5000_4999"

labels = hts.load(find_lab("downloads/jsut-label/", test_utt))

in_feats = text_to_sequence(pp_symbols(labels.contexts))
in_feats = torch.tensor(in_feats, dtype=torch.long)

with torch.no_grad():
    out_feats, out_feats_fine, stop_flags, alignment0 = inference( in_feats )
    
# For comparison, load the acoustic features extracted from the natural speech
feats = np.load(find_feats("dump/jsut_sr16000/norm/", test_utt, typ="out_tacotron"))


Computing the teacher-forcing mel-spectrogram

# Obtain the mel-spectrogram predicted with teacher forcing.

from pathlib import Path
from ttslearn.train_util import Dataset
from functools import partial
from nnmnkwii.io import hts
# Build the spectrogram predicted with teacher forcing.

labels = hts.load(find_lab("downloads/jsut-label/", test_utt))

in_feats = text_to_sequence(pp_symbols(labels.contexts))
in_feats = torch.tensor(in_feats, dtype=torch.long, requires_grad=False)

in_feats2 = torch.unsqueeze( in_feats, axis = 0 )

print("size of in_feats2:{}".format( in_feats2.size()))

in_lens = []
for lens in in_feats2:
    in_lens.append( len(lens ))
in_lens = torch.tensor( in_lens, requires_grad=False )
in_lens = torch.unsqueeze( in_lens, axis = 0)    
print( "size of in_lens:{}".format(in_lens.size()) )


out_path = Path( "./dump/jsut_sr16000/norm/eval/out_tacotron/" + test_utt + "-feats.npy" )
out_feats = np.load( out_path )
out_feats = torch.tensor( out_feats, requires_grad=False )

out_feats = torch.unsqueeze( out_feats, axis = 0 )

print("size of out_feats:{}".format( out_feats.size() ))

# Encoder: obtain latent representations of the text
encoder_outs = model.encoder(in_feats2, in_lens)

decoder_targets = out_feats

# Decoder: predict the mel-spectrogram and stop token
outs, logits, alignment = model.decoder(encoder_outs, in_lens, decoder_targets)

# Post-Net: predict the residual of the mel-spectrogram
outs_fine = outs + model.postnet(outs)

# (B, C, T) -> (B, T, C)
outs = outs.transpose(2, 1)
outs_fine = outs_fine.transpose(2, 1)

Visualizing the three mel-spectrograms

# Visualize the natural-speech mel-spectrogram and the ones synthesized with transtron (natural speech, teacher forcing, inference)
from pathlib import Path
from ttslearn.notebook import get_cmap, init_plot_style, savefig

cmap = get_cmap()
sr = 16000

fig, ax = plt.subplots(3, 1, figsize=(8,6))
ax[0].set_title("Mel-spectrogram of natural speech")
ax[1].set_title("Mel-spectrogram of Transtron output teacher forcing")
ax[2].set_title("Mel-spectrogram of Transtron output inference")

mindb = min(feats.min(), out_feats_fine.min())
maxdb = max(feats.max(), out_feats_fine.max())

hop_length = int(sr * 0.0125)
# For comparison, load the acoustic features extracted from the natural speech

feats = np.load(find_feats("dump/jsut_sr16000/norm/", test_utt, typ="out_tacotron"))

mesh = librosa.display.specshow(
    feats.T, sr=sr, x_axis="time", y_axis="frames", hop_length=hop_length, cmap=cmap, ax=ax[0])
mesh.set_clim(mindb, maxdb)
fig.colorbar(mesh, ax=ax[0])

outs_fine3 = outs_fine[0,:,:]
mesh = librosa.display.specshow(
    outs_fine3.data.numpy().T, sr=sr, x_axis="time", y_axis="frames", hop_length=hop_length, cmap=cmap, ax=ax[1])
mesh.set_clim(mindb, maxdb)
fig.colorbar(mesh, ax=ax[1])
print( "size of out_fine3:{}".format( outs_fine3.size() ))

print( "size of out_feats_fine:{}".format( out_feats_fine.size()))
mesh = librosa.display.specshow(
    out_feats_fine.data.numpy().T, sr=sr, x_axis="time", y_axis="frames", hop_length=hop_length, cmap=cmap, ax=ax[2])
mesh.set_clim(mindb, maxdb)
fig.colorbar(mesh, ax=ax[2])


for a in ax:
    a.set_xlabel("Time [sec]")
    a.set_ylabel("Mel filter channel")
fig.tight_layout()

# Figure 10-8
savefig("./fig/e2etts_impl_logmel_comp")

Visualizing the stop token

# Visualize the stop token
print( stop_flags.size())
#stop_flags = torch.squeeze( stop_flags )
#print( stop_flags.size())
fig, ax = plt.subplots(figsize=(6,4))
ax.plot(torch.sigmoid(stop_flags).cpu().numpy())
ax.set_xlabel("Time [frame]")
ax.set_ylabel("Stop probability");

Computing the end of the synthesized speech

# Compute the end point of the synthesized speech.
# Visualize out_feats_fine5

print( "size of out_feats_fine:{}".format( out_feats_fine.size()))

out_feats_fine5 = torch.sum( out_feats_fine, axis = 1)

fig, ax = plt.subplots(figsize=(6,4))
ax.plot(out_feats_fine5.cpu().numpy())
ax.set_xlabel("Time [frame]")
ax.set_ylabel("out_feats_fine5");


for i, x in enumerate( out_feats_fine5 ):
    flag = 1
    for j in range(10):
        if i + j >= out_feats_fine5.size(0):
            break
        if out_feats_fine5[ i+j ] < -50:
            flag *= 1
        else:
            flag *= 0
    if flag == 1:
        break
    else:
        flag = 1
            
print( "i:{}".format( i ))

onsei_end = i

Visualizing the attention weights

# Visualize the attention weights (first batch element, summed over heads).
# Even keys ("0", "2", ...) hold the decoder self-attention of each block,
# odd keys ("1", "3", ...) hold the encoder-decoder cross-attention.
for i in range(16):
    att_sum = torch.sum(alignment0[str(i)][0], axis=0)
    fig, ax = plt.subplots(figsize=(14, 10))
    im = ax.imshow(att_sum.cpu().data.numpy().T, aspect="auto", origin="lower", interpolation="nearest")
    fig.colorbar(im, ax=ax)
    if i % 2 == 0:  # self-attention: both axes are decoder frames
        ax.set_xlabel("Decoder time step [frame]")
        ax.set_ylabel("Decoder time step [frame]")
    else:           # cross-attention: x is decoder frames, y is encoder phonemes
        ax.set_xlabel("Decoder time step [frame]")
        ax.set_ylabel("Encoder time step [phoneme]")

Loading the vocoder

# Load the WaveNet vocoder
import hydra
from omegaconf import OmegaConf

device = "cpu"

wavenet_config_name="wavenet_sr16k_mulaw256_30layers"
wavenet_config = OmegaConf.load(f"exp/jsut_sr16000/{wavenet_config_name}/model.yaml")
wavenet_model = hydra.utils.instantiate(wavenet_config.netG)
checkpoint = torch.load(f"exp/jsut_sr16000/{wavenet_config_name}/latest_ema.pth", map_location=device)
wavenet_model.load_state_dict(checkpoint["state_dict"])
# Remove weight normalization, which is not needed at inference time
wavenet_model.remove_weight_norm_()
wavenet_model.eval();

Waveform generation function

# Waveform generation function
from ttslearn.dsp import inv_mulaw_quantize

@torch.no_grad()
def gen_waveform(wavenet_model, out_feats):
    # (B, T, C) -> (B, C, T)
    c = out_feats.view(1, -1, out_feats.size(-1)).transpose(1, 2)

    # Compute the number of audio samples
    upsample_scale = np.prod(wavenet_model.upsample_scales)
    T = (
        c.shape[-1] - wavenet_model.aux_context_window * 2
    ) * upsample_scale

    # Generate the audio waveform with WaveNet
    # NOTE: this takes time, so a tqdm progress bar can be passed in
    gen_wav = wavenet_model.inference(c, T, tqdm)

    # Convert the one-hot vectors to a 1-D signal
    gen_wav = gen_wav.max(1)[1].float().cpu().numpy().reshape(-1)

    # Invert the mu-law quantization
    gen_wav = inv_mulaw_quantize(
        gen_wav, wavenet_model.out_channels - 1
    )
    
    return gen_wav

Synthesizing speech with WaveNet

# Synthesize speech with WaveNet
from ttslearn.util import find_lab, find_feats
from ttslearn.dsp import logmelspectrogram_to_audio

out_feats_fine2 = out_feats_fine[:onsei_end+10]

# Generate the audio waveform with the WaveNet vocoder
gen_wav = gen_waveform(wavenet_model, out_feats_fine2)

Checking the audio

from IPython.display import Audio
import IPython

sr = 16000
IPython.display.display(Audio(gen_wav, rate=sr))

from scipy.io.wavfile import read, write
writefilename = "./test_inference.wav"
write(writefilename, rate=16000, data=gen_wav)