Written 2023/11/12
Objective
For each RNA (ribonucleic acid) sequence, evaluate the reactivity to DMS (dimethyl sulfate) and to 2A3 (2-aminopyridine-3-carboxylic acid imidazolide, a SHAPE reagent), and build a model that minimizes the MAE.
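For reference, my understanding of the metric: MAE = (1/N) * Σ |predicted reactivity - measured reactivity|, averaged over the scored positions (those from profiles that pass SN_filter, see below).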
Data
1 train_data.csv (number of rows: 806573)
Column | Description |
---|---|
sequence_id | ID, e.g. 8cdfeef009ea |
sequence | RNA sequence, a string of A, C, G, U |
experiment_type | DMS_MaP or 2A3_MaP |
dataset_name | Name of the dataset; 46 kinds, e.g. "15k_2A3" |
reads | int; number of reads in the high-throughput sequencing experiment (?) |
signal_to_noise | float; signal-to-noise value of the profile (?) |
SN_filter | filter flag (condition: signal_to_noise > 1.00 and reads > 100); only data passing the filter are scored on submission |
reactivity_0001, reactivity_0002, … | RNA reactivity; the values to predict; already normalized |
reactivity_error_0001, reactivity_error_0002, … | experimental error of the reactivity values |
2 train_data_QUICK_START.csv: a version of train_data.csv filtered by SN_filter
→ So can we just use the filtered file 2 instead of 1 train_data.csv?
3 sample_submission.csv: format of the file to submit; for each id, predict the reactivity to DMS and 2A3
4 test_sequences.csv: the test-set sequences, without the columns related to measured values
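(As a check on the SN_filter description above: a minimal sketch, assuming the column names in the table and that SN_filter is stored as 0/1, of how the quick-start file relates to train_data.csv.)
import pandas as pd

df = pd.read_csv('/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv')
# rows passing the filter, i.e. signal_to_noise > 1.00 and reads > 100
df_filtered = df[df['SN_filter'] == 1]
print(len(df), len(df_filtered))   # the second number should match train_data_QUICK_START.csv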
○ About the training data
For example, given a base sequence ...AGU..., the DMS and 2A3 reactivity has to be computed separately for the A, for the G, and for the U.
→ So presumably we need to build one model for DMS and one for 2A3 (?)
○ About the prediction data
Looking at the first row of test_sequences.csv, it has id: 0-176 and the sequence "GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGGGAGAGAUAGAGGCGGAGGGUCUGGGGGAGGAAUUAAAACACAAGGUCUCCUCCC...".
In other words, we need to predict DMS and 2A3 for every one of the 269,796,670 bases, along the lines of:
id | base | DMS | 2A3 |
---|---|---|---|
0 | G | 0 | 0.3 |
1 | G | 0.3 | 0.6 |
2 | G | 0.7 | -0.1 |
3 | A | -0.1 | 0 |
(the values can be negative)
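(A minimal sketch of how I read the id numbering: each base of each test sequence gets its own submission id. This assumes test_sequences.csv carries an id range per sequence, e.g. id_min/id_max columns matching the 0-176 above; the exact column names are my assumption.)
import pandas as pd

df_test = pd.read_csv('/kaggle/input/stanford-ribonanza-rna-folding/test_sequences.csv')
row = df_test.iloc[0]
for offset, base in enumerate(row['sequence']):
    base_id = row['id_min'] + offset   # assumed column name: one submission id per base
    # predict the DMS and 2A3 reactivity of this base under base_id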
Approach
For a start, convert each base (A, C, G, U) to (0, 1, 2, 3) and use that as the explanatory variable for regression. It might also help to add sequence features (features computed from the base sequence), secondary structure, or physicochemical features; a small sketch of what I mean is below.
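(A minimal sketch of the integer encoding plus a couple of simple sequence-derived features; the extra features are illustrative examples of my own, not something provided by the competition data.)
base_to_int = {'A': 0, 'C': 1, 'G': 2, 'U': 3}

def encode_with_features(sequence):
    # integer-encode each base and attach simple per-base features
    encoded = [base_to_int[b] for b in sequence]                     # A,C,G,U -> 0,1,2,3
    gc_content = (sequence.count('G') + sequence.count('C')) / len(sequence)
    # one row per base: [integer code, relative position, whole-sequence GC content]
    return [[code, i / len(sequence), gc_content] for i, code in enumerate(encoded)]

print(encode_with_features('AGU'))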
① Read "train_data.csv" and split it into the 2A3 and DMS subsets
② Train a transformer (for now, on 2A3 only)
③ Predict on one arbitrary example to see how it looks
①
import pandas as pd
df = pd.read_csv('/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv')
df_2A3 = df[df['experiment_type'] == '2A3_MaP']
df_DMS = df[df['experiment_type'] == 'DMS_MaP']
df_2A3 = df_2A3.sample(n=10000, replace=False)
df_DMS = df_DMS.sample(n=10000, replace=False)
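(Following up on the question in the Data section: instead of filtering train_data.csv ourselves, it may be simpler to start from the pre-filtered file; a sketch assuming the same directory layout.)
df = pd.read_csv('/kaggle/input/stanford-ribonanza-rna-folding/train_data_QUICK_START.csv')
df_2A3 = df[df['experiment_type'] == '2A3_MaP']
df_DMS = df[df['experiment_type'] == 'DMS_MaP']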
② (I have already done some light parameter tuning; the epoch count and the number of k-fold splits are written at their minimum here)
#######
## Training ##
#######
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import time
import math
from sklearn.model_selection import KFold
from torch.nn.utils.rnn import pad_sequence
# Dictionary definition: map RNA nucleotides (and PAD) to integers
base_to_int = {'A': 0, 'C': 1, 'G': 2, 'U': 3, 'PAD': 4}
class RNADataset(Dataset):
    def __init__(self, dataframe, max_length, context_size=3):
        self.dataframe = dataframe
        self.max_length = max_length
        self.context_size = context_size
        self.pad_id = base_to_int['PAD']

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sequence = self.dataframe.iloc[idx, 1]  # the 'sequence' column
        context_padded_sequence = self.get_context_padded_sequence(sequence)
        # Pad / truncate the input to max_length
        # Note: the context expansion makes the input (2 * context_size + 1) times longer than
        # the original sequence, so it is usually truncated here
        padded_sequence = context_padded_sequence + [self.pad_id] * (self.max_length - len(context_padded_sequence))
        padded_sequence = padded_sequence[:self.max_length]
        # Target: the reactivity_XXXX columns, selected by name to avoid off-by-one positional indexing
        row = self.dataframe.iloc[idx]
        reactivity = row.filter(regex=r'^reactivity_\d').fillna(0).values.astype(float)
        reactivity_padded = list(reactivity) + [0] * (self.max_length - len(reactivity))
        reactivity_padded = reactivity_padded[:self.max_length]
        mask = [1 if i < len(context_padded_sequence) else 0 for i in range(self.max_length)]
        return torch.tensor(padded_sequence, dtype=torch.long), torch.tensor(reactivity_padded, dtype=torch.float), torch.tensor(mask, dtype=torch.float)

    def get_context_padded_sequence(self, sequence):
        # Expand each nucleotide into a window of (2 * context_size + 1) tokens around it
        padded_sequence = []
        for i in range(len(sequence)):
            context = sequence[max(0, i - self.context_size):min(len(sequence), i + self.context_size + 1)]
            context = [self.pad_id] * (self.context_size - i) + [base_to_int[base] for base in context] + [self.pad_id] * (i + self.context_size + 1 - len(sequence))
            padded_sequence.extend(context)
        return padded_sequence
def collate_fn(batch):
    sequences, reactivities, masks = zip(*batch)
    # Sequence lengths (all equal to max_length after the dataset-level padding)
    lengths = [len(seq) for seq in sequences]
    # Batch-level padding (effectively a no-op here, but kept for safety)
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=base_to_int['PAD'])
    reactivities_padded = pad_sequence(reactivities, batch_first=True, padding_value=0)
    masks_padded = pad_sequence(masks, batch_first=True, padding_value=0)
    return sequences_padded, reactivities_padded, masks_padded, torch.tensor(lengths)
# Positional encoding class (sinusoidal), applied to batch-first input
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.6, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the encoding along the sequence dimension
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)
# Transformer model: embedding -> positional encoding -> transformer encoder -> per-position linear head
class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.6):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # batch_first=True because the dataloader yields (batch, seq_len) tensors
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, 1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        # src: (batch, seq_len) integer tokens
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)    # (batch, seq_len, 1)
        return output.squeeze(-1)        # (batch, seq_len)
# Loss function: masked Huber loss (only positions with mask == 1 contribute)
class MaskedHuberLoss(nn.Module):
    def __init__(self, delta=1.0):
        super(MaskedHuberLoss, self).__init__()
        self.delta = delta

    def forward(self, input, target, mask):
        loss = torch.where(torch.abs(input - target) < self.delta,
                           0.5 * (input - target) ** 2,
                           self.delta * (torch.abs(input - target) - 0.5 * self.delta))
        loss = loss * mask
        return loss.sum() / mask.sum()
# Hyperparameters
ntokens = len(base_to_int)  # vocabulary size (A, C, G, U, PAD)
emsize = 300    # embedding dimension
nhid = 300      # feed-forward (hidden) dimension in the encoder layers
nlayers = 2     # number of transformer encoder layers
nhead = 100     # number of attention heads (must divide emsize)
dropout = 0.6   # dropout rate
weight_decay = 0.0001
# Data loading and preprocessing
max_length = max(df_2A3['sequence'].apply(len))
dataset = RNADataset(df_2A3, max_length)
# Cross-validation setup
kf = KFold(n_splits=3)
for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
    print(f"Fold {fold + 1}")
    train_dataset = torch.utils.data.Subset(dataset, train_index)
    test_dataset = torch.utils.data.Subset(dataset, test_index)
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Initialize the model, loss function, and optimizer for this fold
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)
    loss_fn = MaskedHuberLoss()
    optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay)

    # Training loop
    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        start_time = time.time()  # start timing this epoch
        for batch, (seq, target, mask, lengths) in enumerate(train_dataloader):
            optimizer.zero_grad()
            output = model(seq)
            loss = loss_fn(output, target, mask)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if batch % 10 == 0 and batch > 0:
                current = batch * len(seq)
                total = len(train_dataloader.dataset)
                elapsed = time.time() - start_time
                print(f'| Epoch {epoch+1:3d}/{num_epochs:3d} | {current:5d}/{total:5d} sequences | Elapsed time {elapsed:.2f}s')
        elapsed_time = time.time() - start_time
        print(f'Epoch {epoch+1:3d} completed in {elapsed_time:.2f}s, Total loss {total_loss:.4f}')

    # Save the weights for this fold (each fold overwrites the previous file)
    torch.save(model.state_dict(), '/kaggle/working/model_state_dict_2A3.pth')

    # Evaluation on the held-out fold
    model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for batch, (data, target, mask, lengths) in enumerate(test_dataloader):  # unpack the four elements
            output = model(data)
            loss = loss_fn(output, target, mask)
            total_test_loss += loss.item()
    print(f"Fold {fold + 1} Test Loss: {total_test_loss / len(test_dataloader)}")
③
#######
## Prediction ##
#######
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import time
import math
from sklearn.model_selection import KFold
from torch.nn.utils.rnn import pad_sequence
# Load the test sequences
df_test = pd.read_csv('/kaggle/input/stanford-ribonanza-rna-folding/test_sequences.csv')
# Split the test data into 20 chunks and save each one
num_splits = 20
split_size = len(df_test) // num_splits
for i in range(num_splits):
    start_index = i * split_size
    end_index = start_index + split_size
    if i == num_splits - 1:
        end_index = len(df_test)  # the last chunk takes whatever remains
    df_split = df_test.iloc[start_index:end_index]
    df_split.to_csv(f'/kaggle/working/split_{i}.csv', index=False)
# (pandas, time, torch and tqdm are already imported above)
base_to_int = {'A': 0, 'C': 1, 'G': 2, 'U': 3, 'PAD': 4}  # same mapping as in the training section
def get_context_padded_sequence(sequence, context_size=3, pad_id=base_to_int['PAD']):
    # Same context expansion as in RNADataset; context_size must match the value used in training (3)
    padded_sequence = []
    for i in range(len(sequence)):
        context = sequence[max(0, i - context_size):min(len(sequence), i + context_size + 1)]
        context = [pad_id] * (context_size - i) + [base_to_int[base] for base in context] + [pad_id] * (i + context_size + 1 - len(sequence))
        padded_sequence.extend(context)
    return padded_sequence
# These values must match the ones used during training
max_length = max(df_2A3['sequence'].apply(len))
ntokens = len(base_to_int)  # vocabulary size
emsize = 300    # embedding dimension
nhid = 300      # feed-forward (hidden) dimension
nlayers = 2     # number of transformer encoder layers
nhead = 100     # number of attention heads
dropout = 0.6   # dropout rate
# Initialize the model and load the trained weights
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)
model.load_state_dict(torch.load('/kaggle/input/model-state-dict-2a3-pth/model_state_dict_2A3.pth'))
model.eval()
start_time = time.time()
num_splits = 20
for i in range(num_splits):
    # Load one split file
    df_split = pd.read_csv(f'/kaggle/working/split_{i}.csv')
    predictions = []
    for sequence in tqdm(df_split['sequence'], desc=f'Processing split {i}'):
        # Preprocess the sequence (same context expansion and padding as in training)
        context_padded_sequence = get_context_padded_sequence(sequence)
        padded_sequence = context_padded_sequence + [base_to_int['PAD']] * (max_length - len(context_padded_sequence))
        sequence_tensor = torch.tensor(padded_sequence, dtype=torch.long).unsqueeze(0)
        # Prediction
        with torch.no_grad():
            prediction = model(sequence_tensor)
        # Keep only the first len(sequence) predicted values
        predicted_reactivity = prediction.squeeze(0).tolist()
        predicted_reactivity_trimmed = predicted_reactivity[:len(sequence)]
        predictions.append(predicted_reactivity_trimmed)
    # Update the DataFrame and save the file after processing each split
    df_split['reactivity_2A3_MaP'] = predictions
    df_split.to_csv(f'/kaggle/working/predicted_sequences_2A3_{i}.csv', index=False)
    # Elapsed and estimated remaining time
    elapsed_time = time.time() - start_time
    estimated_remaining_time = (elapsed_time / (i + 1)) * (num_splits - i - 1)
    print(f"Completed split {i}. Estimated remaining time: {estimated_remaining_time:.2f} seconds")
Do the same for DMS as well.
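Finally, the per-sequence prediction lists have to be flattened into one row per base for submission. A minimal sketch, assuming that the split files keep an id_min column from test_sequences.csv, that the DMS run writes predicted_sequences_DMS_{i}.csv with a reactivity_DMS_MaP column, and that the submission columns are id, reactivity_DMS_MaP, reactivity_2A3_MaP as in sample_submission.csv (slow iterrows loop, only to show the mapping):
import ast
import pandas as pd

rows = []
for i in range(20):
    df_2a3 = pd.read_csv(f'/kaggle/working/predicted_sequences_2A3_{i}.csv')
    df_dms = pd.read_csv(f'/kaggle/working/predicted_sequences_DMS_{i}.csv')   # assumed DMS counterpart
    for (_, r2a3), (_, rdms) in zip(df_2a3.iterrows(), df_dms.iterrows()):
        vals_2a3 = ast.literal_eval(r2a3['reactivity_2A3_MaP'])   # lists come back as strings from CSV
        vals_dms = ast.literal_eval(rdms['reactivity_DMS_MaP'])
        for offset, (v2a3, vdms) in enumerate(zip(vals_2a3, vals_dms)):
            rows.append({'id': r2a3['id_min'] + offset,            # id_min is an assumed column name
                         'reactivity_DMS_MaP': vdms,
                         'reactivity_2A3_MaP': v2a3})

pd.DataFrame(rows).to_csv('/kaggle/working/submission.csv', index=False)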