「VAEとGANで分子生成入門」の続編です。今度は長・短期記憶 (LSTM: Long Short-Term Memory) ネットワークで分子生成モデルを作ったというお話です。ただし、あくまで入門なので、生成された分子が目的の物性を持っているかどうかは全く考えません。
RDKit
生成した文字列がSMILESとして成立するかどうかを判定する必要があるため、化学情報学系ライブラリ RDKit をインストールして用います。
%%time
# Notebook cell: install the rdkit_installer helper package straight from GitHub.
!pip install git+https://github.com/maskot1977/rdkit_installer.git
from rdkit_installer import install
# Install RDKit 2020.09.1 through Miniconda (see the installer log that follows).
install.from_miniconda(rdkit_version="2020.09.1")
Collecting git+https://github.com/maskot1977/rdkit_installer.git
Cloning https://github.com/maskot1977/rdkit_installer.git to /tmp/pip-req-build-keib23qv
Running command git clone -q https://github.com/maskot1977/rdkit_installer.git /tmp/pip-req-build-keib23qv
Building wheels for collected packages: rdkit-installer
Building wheel for rdkit-installer (setup.py) ... done
Created wheel for rdkit-installer: filename=rdkit_installer-0.2.0-py3-none-any.whl size=5768 sha256=f1cc01e152134ce47e60cec71e6da40082d2985837f1d3005321bf27d57b3be7
Stored in directory: /tmp/pip-ephem-wheel-cache-vcb2d1yk/wheels/e6/72/a5/218f5f909a3a87c1ec1ccec03ac61298947fb5f1efa517eefa
Successfully built rdkit-installer
Installing collected packages: rdkit-installer
Successfully installed rdkit-installer-0.2.0
add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
python version: 3.7.13
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.09.1 installation finished!
CPU times: user 1.23 s, sys: 315 ms, total: 1.55 s
Wall time: 1min 14s
データの取得
URL という変数に、pandas DataFrame 形式で読み込めるテーブルデータのURLがあり、その中の SMILES_COL 変数で表される列に SMILES 文字列データがあるものと想定します。
各種パラメータ
# Hyperparameters read as module-level globals by train().
MAX_EPOCHS = 200  # number of epochs train() iterates over the DataLoader
BATCH_SIZE = 128  # batch_size handed to the DataLoader in train()
SEQUENCE_LENGTH = 4  # size passed to model.init_state() in train(); equals Dataset's default sequence_length
LEARNING_RATE = 0.001  # learning rate of the Adam optimizer in train()
データセット
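pd.read_csv で読み込んだテーブルデータから SMILES 文字列を取得し、1文字ずつの語彙 (vocab) のインデックス列に変換し、一定の文字列長 (sequence_length) のテンソルとして取り出せるようにするところまでやってしまいます。なお、pd.read_csv を既定の引数で呼んでいるため、区切り文字はカンマを想定しています(タブ区切りデータの場合は sep="\t" の指定が必要です)。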
import torch
import pandas as pd
from collections import Counter
class Dataset(torch.utils.data.Dataset):
    """Character-level next-character dataset built from a column of SMILES.

    All SMILES strings of the column are concatenated (without any separator,
    and with space characters removed) into one long character stream. Item
    ``i`` is a pair of index tensors of length ``sequence_length``: the window
    starting at ``i`` and the same window shifted right by one character.
    Because the molecules are concatenated, windows can span the boundary
    between two molecules.

    Args:
        url: Anything ``pandas.read_csv`` accepts (path, URL or file-like
            object). It is read with the default, comma-separated settings.
        smiles_col: Name of the column that holds the SMILES strings.
        sequence_length: Number of characters per input/target window.

    Attributes:
        smiles: The SMILES strings of the column (missing values dropped).
        words: The concatenated character stream, one character per entry.
        uniq_words: Distinct characters, most frequent first.
        index_to_word / word_to_index: Vocabulary lookup tables.
        words_indexes: ``words`` encoded as vocabulary indices.
    """

    def __init__(self, url, smiles_col, sequence_length=4):
        self.url = url
        self.smiles_col = smiles_col
        self.sequence_length = sequence_length
        self.smiles = []
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()
        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {
            word: index for index, word in self.index_to_word.items()
        }
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Read the table, store the SMILES list and return the character stream."""
        train_df = pd.read_csv(self.url)
        # Drop missing cells: a float NaN left in self.smiles cannot be sliced
        # like a string by code that samples a starting prefix from it.
        column = train_df[self.smiles_col].dropna().astype(str)
        self.smiles = column.tolist()
        # Concatenate every molecule and strip spaces, as a single stream.
        text = "".join(self.smiles).replace(" ", "")
        return list(text)

    def get_uniq_words(self):
        """Return the distinct characters ordered by descending frequency."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        # Clamp at zero: a negative value would make len() raise when the
        # corpus is shorter than one window.
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(input_window, target_window)``; the target is shifted by one."""
        stop = index + self.sequence_length
        return (
            torch.tensor(self.words_indexes[index:stop]),
            torch.tensor(self.words_indexes[index + 1:stop + 1]),
        )
LSTM Generator モデル
LSTM は様々な使い方が可能な柔軟なモデルであるため逆に分かりにくいのですが、その中で text generation ができるモデルを用います。
なお、 VAE や GAN では SMILES 文字列を one-hot-vector に変換してから用いましたが、ここでは Embedding を用います。(VAE や GAN でも Embedding を用いれば良かったかも知れないけどよく分かりません)
import torch
class LSTM_Generator(torch.nn.Module):
    """Embedding -> multi-layer LSTM -> linear head over the dataset vocabulary.

    Args:
        dataset: Object exposing ``uniq_words``; its length is the vocabulary
            size used for the embedding table and the output layer.
        lstm_size: Hidden size of the LSTM.
        embedding_dim: Size of each character embedding (the LSTM input size).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.

    The defaults reproduce the previously hard-coded configuration
    (128 / 128 / 3 / 0.2).
    """

    def __init__(self, dataset, lstm_size=128, embedding_dim=128,
                 num_layers=3, dropout=0.2):
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        n_vocab = len(dataset.uniq_words)
        self.embedding = torch.nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = torch.nn.LSTM(
            # The LSTM consumes embeddings, so its input size is the embedding
            # dimension (not the hidden size, which merely happened to match).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=dropout,
        )
        self.fc = torch.nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Return ``(logits, state)`` for index tensor ``x`` and LSTM state.

        ``logits`` has the shape of ``x`` plus a trailing vocabulary axis.
        NOTE(review): the LSTM is built without ``batch_first``, so it treats
        axis 0 of ``x`` as time and axis 1 as batch — confirm this is what the
        callers intend.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return zero-filled ``(h_0, c_0)`` of shape (num_layers, sequence_length, lstm_size).

        The middle axis must equal the size of axis 1 of the tensor later given
        to ``forward``. The tensors follow the model's device and dtype.
        """
        ref = self.fc.weight
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (
            torch.zeros(shape, dtype=ref.dtype, device=ref.device),
            torch.zeros(shape, dtype=ref.dtype, device=ref.device),
        )
学習のための関数
次のようにして学習します。なお、途中結果の表示に get_best_smiles 関数を使っていますが、それは省略して構いません。
import torch
import numpy as np
def train(dataset, model):
    """Train ``model`` on ``dataset`` and return the per-epoch loss totals.

    Uses the module-level ``BATCH_SIZE``, ``LEARNING_RATE`` and ``MAX_EPOCHS``.
    After every epoch a progress line is printed together with a sample
    produced by ``get_best_smiles``.

    Args:
        dataset: Dataset yielding ``(input, target)`` index tensors and
            exposing ``sequence_length``.
        model: Model providing ``init_state`` and returning
            ``(logits, (h, c))`` when called.

    Returns:
        list: One value per epoch, the SUM (not the mean) of the batch losses.
    """
    losses = []
    # Batches are taken in dataset order (no shuffling is requested).
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    for epoch in range(MAX_EPOCHS):
        # Sampling at the end of the previous epoch goes through predict(),
        # which calls model.eval(); re-enable training mode (dropout) here.
        model.train()
        # Size the state from the dataset's own window width so it always
        # matches the width of the batches the dataset produces.
        state_h, state_c = model.init_state(dataset.sequence_length)
        total_loss = 0
        for x, y in dataloader:
            optimizer.zero_grad()
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss expects the class axis second: move the
            # vocabulary axis of the logits from last to position 1.
            loss = criterion(y_pred.transpose(1, 2), y)
            total_loss += loss.item()
            # Carry the state values into the next batch but cut the graph,
            # so backward() never reaches into earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()
            loss.backward()
            optimizer.step()
        print("Epoch: {}, Loss: {:.3f}, Generated SMILES: {}".format(
            epoch + 1,
            total_loss,
            get_best_smiles(dataset, model),
        ))
        losses.append(total_loss)
    return losses
get_best_smiles 関数
学習済み LSTM による予測を数回行った後に、SMILES文字列として成立する最長の文字列を取得します。
import random
def get_best_smiles(dataset, model, next_words=100, max_trial=10, start_length=3):
    """Generate several candidates and return the longest one.

    Each attempt seeds ``predict`` with the first ``start_length`` characters
    of a randomly chosen entry of ``dataset.smiles``.

    Args:
        dataset: Object exposing ``smiles`` plus whatever ``predict`` needs.
        model: Model handed through to ``predict``.
        next_words: Number of characters ``predict`` is asked to generate.
        max_trial: Number of generation attempts.
        start_length: Length of the seed prefix.

    Returns:
        str: The longest string returned by ``predict`` (the earliest one on
        ties), or ``""`` when no attempt produced a non-empty string.
    """
    candidates = (
        predict(
            dataset,
            model,
            text=random.choice(dataset.smiles)[:start_length],
            next_words=next_words,
        )
        for _ in range(max_trial)
    )
    # max() keeps the first of equally long candidates; default covers zero attempts.
    return max(candidates, key=len, default="")
予測のための関数
学習済み LSTM を用いて文字列を生成し、SMILES文字列として成立する部分だけを切り出します。
def predict(dataset, model, text, next_words=50):
    """Sample ``next_words`` characters after ``text`` and trim the result.

    At every step the next character is drawn at random from the softmax
    distribution of the model's last output position, so repeated calls give
    different strings.

    Args:
        dataset: Object exposing ``word_to_index`` and ``index_to_word``.
        model: Model providing ``init_state`` and returning
            ``(logits, (h, c))`` when called.
        text: Non-empty starting string; every character must be in the
            dataset vocabulary.
        next_words: Number of characters to generate.

    Returns:
        str: ``trim_smiles`` applied to the start text plus generated characters.

    Raises:
        ValueError: If ``text`` is empty.
        KeyError: If ``text`` contains a character missing from the vocabulary.
    """
    if not text:
        raise ValueError("text must contain at least one character")
    words = list(text)
    # Remember the current mode so sampling in the middle of training does not
    # leave the model stuck in eval mode (dropout off) afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for sampling; without no_grad the autograd
        # graph would keep growing because the state is carried across steps.
        with torch.no_grad():
            state_h, state_c = model.init_state(len(words))
            for i in range(next_words):
                # The window always holds the last len(text) characters.
                x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
                y_pred, (state_h, state_c) = model(x, (state_h, state_c))
                last_word_logits = y_pred[0][-1]
                p = torch.nn.functional.softmax(last_word_logits, dim=0).numpy()
                word_index = np.random.choice(len(last_word_logits), p=p)
                words.append(dataset.index_to_word[word_index])
    finally:
        model.train(was_training)
    return trim_smiles("".join(words))
trim_smiles 関数
与えた文字列のうち、SMILES として成立する部分だけを切り出す関数です。
from rdkit import Chem
def trim_smiles(smile):
    """Return the longest prefix of ``smile`` that RDKit accepts.

    Prefixes are tried from the full string downwards, dropping one trailing
    character at a time; the first one for which ``Chem.MolFromSmiles`` gives
    a truthy result is returned. If no non-empty prefix is accepted, the empty
    string is returned.
    """
    for end in range(len(smile), 0, -1):
        prefix = smile[:end]
        if Chem.MolFromSmiles(prefix):
            return prefix
    return ""
学習開始
# Build the character-level dataset from the table at URL, column SMILES_COL.
# NOTE: URL and SMILES_COL are not assigned anywhere in the code shown here;
# define them before running this cell.
dataset = Dataset(url=URL, smiles_col=SMILES_COL)
# The model sizes its embedding and output layers from the dataset vocabulary.
model = LSTM_Generator(dataset)
# Train for MAX_EPOCHS epochs; `losses` receives one summed loss value per epoch.
losses = train(dataset, model)
Epoch: 1, Loss: 278.372, Generated SMILES: NCCC
Epoch: 2, Loss: 195.260, Generated SMILES: O=NO
Epoch: 3, Loss: 157.259, Generated SMILES: C=C1CCc1
Epoch: 4, Loss: 140.553, Generated SMILES: Cc1Nc1
Epoch: 5, Loss: 129.166, Generated SMILES: CCNC
Epoch: 6, Loss: 120.669, Generated SMILES: C/N(C)C(=O)C
Epoch: 7, Loss: 114.117, Generated SMILES: N=CO
Epoch: 8, Loss: 109.073, Generated SMILES: C/NO
Epoch: 9, Loss: 104.497, Generated SMILES: CCc2c1nncc2COC(=O)[C@@H](CC(=O)C)/NNC(=N1)(C)
Epoch: 10, Loss: 100.387, Generated SMILES: CCC/N
Epoch: 11, Loss: 96.993, Generated SMILES: OC(S)(O)
Epoch: 12, Loss: 94.068, Generated SMILES: CN(nO1)cccc1CCN[C@H](C1=O)CCc1
Epoch: 13, Loss: 90.858, Generated SMILES: N=C\CCC/N=C(\Oc1cc(C)ccc1C)OCc1occ(c(=O)c1OC)C(=O)OCNC1=C(O)C(=O)OC1=C(C)C(=O)O
Epoch: 14, Loss: 88.373, Generated SMILES: O=C(C)C(=O)O
Epoch: 15, Loss: 86.386, Generated SMILES: CCO
Epoch: 16, Loss: 84.308, Generated SMILES: C=CC
Epoch: 17, Loss: 81.482, Generated SMILES: N#CCNc1ccc(c(c1)C)/OC
Epoch: 18, Loss: 78.523, Generated SMILES: O=N(=C/C(C)C)/N
Epoch: 19, Loss: 75.753, Generated SMILES: Cc1=NC(=C(C(=O)N1)[N][N][C@H](CC)C(=O)O)CCOc1ccc(c(c1)C)C(=O)CCC/N=C
Epoch: 20, Loss: 73.682, Generated SMILES: CC[C@]S
Epoch: 21, Loss: 71.387, Generated SMILES: CCc1c(O)cc2cc1CCC(=O)c1cc(O)c(c(c1)O)OCOc1cc(C)c(c(c1C)O)OCOc1cc(cc(c1)OCC2)C(C)CC/N=C
Epoch: 22, Loss: 69.764, Generated SMILES: O=C=C
Epoch: 23, Loss: 67.860, Generated SMILES: CCO
Epoch: 24, Loss: 66.114, Generated SMILES: COCCCC
Epoch: 25, Loss: 63.806, Generated SMILES: COc1(C)CCC1(C)C(=O)N(C1=O)CCc1
Epoch: 26, Loss: 63.020, Generated SMILES: CN(CCCOc1c(C)cccc1C)CC[C@@H](C1=O)OOCc1
Epoch: 27, Loss: 62.105, Generated SMILES: CC(CCCCN1C(=O)C)CCc1
Epoch: 28, Loss: 59.576, Generated SMILES: O=NNC
Epoch: 29, Loss: 58.116, Generated SMILES: CNC\CCc1n(C)cnc1c2cc1oc(n2)ccc2Cc2c1
Epoch: 30, Loss: 55.332, Generated SMILES: CCC[CH]/C
Epoch: 31, Loss: 52.434, Generated SMILES: CC(CC=C/C(=N\CC=C)/Nc1ccc(nc1C)N)
Epoch: 32, Loss: 50.242, Generated SMILES: CCCCN(CCc1c(CO)n(c(n1)O)C)OC
Epoch: 33, Loss: 47.927, Generated SMILES: CNCCN
Epoch: 34, Loss: 46.725, Generated SMILES: CCOCCC(n1cc2c(c1)c(C)ccn2)OC
Epoch: 35, Loss: 45.368, Generated SMILES: COc(c1)c(C)c(=O)n2CCc1cc(C)c(c(c1)C)OOCc1cc(C)2
Epoch: 36, Loss: 43.739, Generated SMILES: CC(CCCN1C(=O)C)CCCCC[CH]1C1=C[N]C(=C/CN(C)C)[NH]c1
Epoch: 37, Loss: 42.778, Generated SMILES: C/CN
Epoch: 38, Loss: 41.916, Generated SMILES: C=CN(C=C/C/C(=N/C)/O)
Epoch: 39, Loss: 39.551, Generated SMILES: C=CCN1C(=CC(=C[C@@H](C1=O)C)C)C
Epoch: 40, Loss: 38.435, Generated SMILES: O=NCOc1cc(ccc1C(C)(C)C)OCc1c(nn2oc1cccc2)CC[C@H](Cc1cc(c(c(c1)O)N(C)C)C)/OOCCc1c(CO)ccc2c1cccc2CO
Epoch: 41, Loss: 36.081, Generated SMILES: CCO
Epoch: 42, Loss: 34.733, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)C
Epoch: 43, Loss: 32.478, Generated SMILES: O=c1c(C)ccc2C[C@H](CC)n1cncncc1NCCc1cc(C)ccc1CCc1cc(OC)c(c(c1)OC)OCCc1ccc2
Epoch: 44, Loss: 31.013, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCCN(c1cc(C)c(cc1C)C)CCc1cc(C)ccc1CCc1cc(C)cc(c1)CC[C@H]
Epoch: 45, Loss: 31.244, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1cc(CC)cc(c1)CCOc1cc(ccc1C)[C@H](CNC(C)(C)C)OCCc1c(C)cc(cc1C)CC
Epoch: 46, Loss: 30.788, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCC
Epoch: 47, Loss: 29.778, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)c(C)c(c2c1cccc2)CCc1cc(C)cc(c1)NC1=CCCC1CC
Epoch: 48, Loss: 28.512, Generated SMILES: N#CCCN1C(=CC(=C[C@H](C1=CC(=O)C1=O)C)O)CCCCCOC(=O)c1
Epoch: 49, Loss: 26.879, Generated SMILES: CC(=C/CC(=O)C)
Epoch: 50, Loss: 25.381, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cccc1CCC[C@H](c1ccc(c(c1)N(C)C)C)/OCc1cc(c(c(c1)C)O)CN/C
Epoch: 51, Loss: 24.776, Generated SMILES: N#CCCN1C(=CC(=O)C1=C(CC[C@H]c1c(C)ccc(c1O)C)CC)CC
Epoch: 52, Loss: 21.878, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(c2c1cccc2)CCCc1cc(C)cc(c1)CC[C@@H](O[N])C
Epoch: 53, Loss: 20.812, Generated SMILES: C/CCC\CCOn1cc(N)c(c1)CC[C@H](Cn1cnc2c1c(=O)n(C)c(=O)n2C)ON
Epoch: 54, Loss: 20.241, Generated SMILES: CNCCN1C(=CC(=C[C@@H](C1=O)C)C)C
Epoch: 55, Loss: 20.249, Generated SMILES: CCCCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCCCCc1cc(CC)cc(c1)CNCC[C@@H]1CCc2c(C1=O)c(O)cc(c2)OCC
Epoch: 56, Loss: 20.007, Generated SMILES: OCCCCOC(=O)c1c(C)cc(cc1C)CN(C)CC[C@H]1Cc2c(O1)c(C)ccc2C/C1=N[C](C)N/C(=N/C(C)C)/N1
Epoch: 57, Loss: 18.044, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCCOCCO
Epoch: 58, Loss: 16.840, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)COCc1cocc2cc1OC)cccc2OCC[C@H](C1)[CH](C)CCn1
Epoch: 59, Loss: 14.486, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCCCCOc1c(C)cc(cc1C)C(=O)OCCc1cncc(c1)CCCc1cc(C)c(c(c1)C)OO/C
Epoch: 60, Loss: 14.156, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(c(c1C)N)OOCc1ccc(c(c1)C)CCc1ccc2c(c1)C(=O)N(C)C2=CCC1CCc1
Epoch: 61, Loss: 13.321, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(CC)occ1CCCOC(=O)c1cc(c(c(c1)C)C(C)(C)C)OOc1ccc2c(c1)c(C)oc2CC
Epoch: 62, Loss: 12.808, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(c2c1cccc2)CCCO[C@H]1OC(=C(C1=O)O)CCCOC(=O)
Epoch: 63, Loss: 12.618, Generated SMILES: COCOc1cc(C)c2c(c1)CC[C@H]2CCc1ccc(c(c1)C)NC1=NCCO1
Epoch: 64, Loss: 13.043, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)OCc1cc(C)c(c(c1C)O)OCc1ccc(c(c1)C)CC/N=C
Epoch: 65, Loss: 12.034, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)ccc1CO
Epoch: 66, Loss: 11.841, Generated SMILES: N#CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(CC)cc(cc1C)CCc1cc(C)cc(c1)CCOc1cc(ccc1O)C(=O)OCCN(C)CCO
Epoch: 67, Loss: 11.939, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCCc1c(C)cc(cc1C)NCCOC1=CC(=O)C(=CC1=O)SCO
Epoch: 68, Loss: 11.629, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)coc1COC[C@@H]1CCCc1
Epoch: 69, Loss: 11.832, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCCOc1ccc2c(c1)C(=CC(N1)(C)C)CCN1C(=O)C=C(C1=O)CCNCC1Oc1nc2c([nH]1)
Epoch: 70, Loss: 11.349, Generated SMILES: O=CN(CCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cccc(c1O)CCN(Cc1ccc2c(c1)c(=O)c(c(o2)C)C)CC
Epoch: 71, Loss: 10.082, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cccc1CO
Epoch: 72, Loss: 11.135, Generated SMILES: CCN(CCCN1C(=CCCS1)/C=CC)COc1c(C)cccc1COCc1cc(cc(c1O)OC)C(=O)OCO
Epoch: 73, Loss: 9.799, Generated SMILES: COc1c(C)ccc2CCN([C@H](c2cc1O)C)CNCCc1ccc(cc1C)CC/N=C/Nc1ccc(cc1C)O
Epoch: 74, Loss: 9.292, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(CC)cc(c1)CCc1ccc(c(c1)C)CC/C=C/c1c(C)cccc1
Epoch: 75, Loss: 8.955, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cccc2c1[nH]c1c2CCNCC1Cc1cc(c(c(c1)C(C)(C)C)O)
Epoch: 76, Loss: 8.708, Generated SMILES: COc1c(C)ccc2CCC[C@H](c1ccc(c(c1)[C@H](CC)C)O)CNc1cc(C)c(c(c1)N(=O)=O)OCn1c2
Epoch: 77, Loss: 9.119, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1)N
Epoch: 78, Loss: 8.775, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COC[C@H]
Epoch: 79, Loss: 7.541, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)ccc1CO
Epoch: 80, Loss: 6.776, Generated SMILES: NCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CN(C)Cc1cccc(c1C)/N=C
Epoch: 81, Loss: 6.652, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)CCCCCOc1cc(CC)cc(c1)OCO
Epoch: 82, Loss: 6.518, Generated SMILES: N#CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)(C)CCC[C@H]
Epoch: 83, Loss: 5.457, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1)OCc1c(C)cccc1NCCc1nc(C)cnc1CCC
Epoch: 84, Loss: 5.171, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 85, Loss: 5.428, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)CC/C=C
Epoch: 86, Loss: 6.426, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COc1c(C)cc(cc1C)COC[C@@H]1C(=O)OC[C@@H]1Cc1c[nH]c2c1cccc2CCC[N]C
Epoch: 87, Loss: 7.806, Generated SMILES: COc1c(C)ccc2CC=CCCC(c1cccc(c1O)C(C)(C)C)(C)CNC(=O)Nc1ccc(c(c1)C)CCCc1c(C)cccc1CCCCOC(=O)Cc1cc2
Epoch: 88, Loss: 7.322, Generated SMILES: O=CSC
Epoch: 89, Loss: 7.362, Generated SMILES: O=CCCN
Epoch: 90, Loss: 6.118, Generated SMILES: CCOCCOc1ccc(c2c1OC(O2)(C)C)/OC
Epoch: 91, Loss: 5.924, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCCNC/C(=N\c1c(C)cccc1C)/OCC
Epoch: 92, Loss: 4.868, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(CC)cc(cc1C)CCc1cc(C)cc(c1)CCC(c1ccc(c(c1)C)C)CC/N=C
Epoch: 93, Loss: 4.298, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(c2c1cccc2)CCCOC1=CC(=O)C(=CC1=O)[C@H](C(=O)N(C)O)CCC(=O)
Epoch: 94, Loss: 3.108, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COSc1cc(O)c(c(c1)O)OCC1=C(C)CCC1=O
Epoch: 95, Loss: 2.951, Generated SMILES: NNCN(C=Cc1c(C)cccc1C)CCC(=O)c1c(O)cc(c(c1O)C)OOc1cc(O)c(c(c1)O)OC=Cc1ccc(cc1C)CCC(=O)c1c(C)[nH]c(c1C)CC
Epoch: 96, Loss: 2.799, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C/1=C\COc1
Epoch: 97, Loss: 3.307, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COC
Epoch: 98, Loss: 3.384, Generated SMILES: CCCN(CCC=C1)/CC=CC(=O)C(=C1)CCN(c1ccc(c2c1non2)N(=O)=O)CCOc1ccc(c(c1OC)OC)C(=O)OCOC1=C(C)C(=O)CC1=O
Epoch: 99, Loss: 4.780, Generated SMILES: COCCCN
Epoch: 100, Loss: 5.356, Generated SMILES: C/CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)CC/N=C
Epoch: 101, Loss: 6.936, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc2CCOc2cc1OCCCc1cc(C)cc(c1)CCCOc1cc(ccc1OC)/C
Epoch: 102, Loss: 7.216, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1)CCC[N]C1=N[C](C)N/C(=N/CC)/N1C=CCC
Epoch: 103, Loss: 5.853, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(CC)cc(cc1CC)C(=O)OCc1cc(C)c(c(c1)C)NC[C]1NC(=NC(=C1)C)[N]C(C)C
Epoch: 104, Loss: 5.410, Generated SMILES: O=CCCN1C(=CCCCC(=CC(=C[C@@H](C1=O)C)C)O)COCc1c(C)cc(cc1C)CO
Epoch: 105, Loss: 4.589, Generated SMILES: N#CCCN1C(=CC(=C[C@@H](C1=O)C)C)C
Epoch: 106, Loss: 4.229, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(CC)cc(c1)OCOc1ccc(c2c1cccc2C)CCCOC(=O)
Epoch: 107, Loss: 2.410, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COSc1ccc2c(c1)c1CCNCCc1[nH]2
Epoch: 108, Loss: 1.846, Generated SMILES: CCc1c(C)ccc2CC=C[C]1N/C(=N/C(C)C)/NC(=N1)[N]C(C)CCOc1cc(C[C@@H](N)C)cc2
Epoch: 109, Loss: 1.609, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1OC)OC)C(=N)OC/C
Epoch: 110, Loss: 1.255, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 111, Loss: 0.996, Generated SMILES: N=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 112, Loss: 0.709, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 113, Loss: 0.573, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 114, Loss: 0.420, Generated SMILES: OCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 115, Loss: 0.395, Generated SMILES: CNCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 116, Loss: 0.450, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 117, Loss: 1.436, Generated SMILES: CCc1c(C)ccc2CC[C@H]1CC[C@H](O1)n1cnc2
Epoch: 118, Loss: 11.084, Generated SMILES: CCCN(Cc1c(C)cccc1C)CO/C=C(\Oc1ccc(c(c1)C)C(C)C)CCc1cc2c(cc1C)C[C@H](C)CCC[C@H]2CCc1cc(C)c2c(c1C)[nH]cc2
Epoch: 119, Loss: 42.172, Generated SMILES: O=CCCN1C(=CC(=C[C@@H](C1=O)c1c(=O)n(C)cc2c1cccc2)CC)C(C)CCCC(=O)c1ccc(c(c1)OC)OCC[C]1NC(=NC(=N1)C)[NH]C
Epoch: 120, Loss: 18.410, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COC(=O)c1cccc(c1C)C(=O)OC[C]1NC(=NC(=N1)C)[NH]C
Epoch: 121, Loss: 6.812, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1C)C)C)C)CCN
Epoch: 122, Loss: 2.959, Generated SMILES: O/CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1OC)OC)C(=O)OCC(=O)
Epoch: 123, Loss: 1.696, Generated SMILES: OCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)CCCc1c(C)cc(cc1C)COCc1cc(C)cc(c1)CCC[N]C1=N[C](N/C(=N/CC=C)/N1)SCCO
Epoch: 124, Loss: 0.955, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 125, Loss: 0.794, Generated SMILES: COCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 126, Loss: 0.538, Generated SMILES: CCCN1CCc2c(C1)c(C)ccc2CCc1ccc(cc1CC#N)CCC
Epoch: 127, Loss: 0.425, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 128, Loss: 0.368, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 129, Loss: 0.333, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 130, Loss: 0.315, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 131, Loss: 0.307, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 132, Loss: 0.240, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 133, Loss: 0.210, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 134, Loss: 0.199, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 135, Loss: 0.185, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1N(=O)=O)CCCNC(=O)c1ccc(c(c1)C)CCC
Epoch: 136, Loss: 0.169, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 137, Loss: 0.151, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 138, Loss: 0.139, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 139, Loss: 0.128, Generated SMILES: NNCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 140, Loss: 0.118, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 141, Loss: 0.110, Generated SMILES: COCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 142, Loss: 0.103, Generated SMILES: O/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 143, Loss: 0.097, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 144, Loss: 0.091, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 145, Loss: 0.085, Generated SMILES: COc1OC(C1)(C)CCc1c(C)cc(cc1C)CCc1cc(C)cc(c1)CCC[N]C1=N[C](N/C(=N/CC=C)/N1)SCCOc1c(OC)ccc2c1CC[C@H](C2)N
Epoch: 146, Loss: 0.080, Generated SMILES: NNCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 147, Loss: 0.076, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 148, Loss: 0.071, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 149, Loss: 0.067, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 150, Loss: 0.063, Generated SMILES: NCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 151, Loss: 0.060, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 152, Loss: 0.057, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 153, Loss: 0.053, Generated SMILES: NCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 154, Loss: 0.050, Generated SMILES: CCOCCOc1cc(cc(c1OC)OC)C(=O)OC[C@@H](CC(=O)O)Cc1cc(C)ccc1COC(=O)c1c(C)cccc1CO
Epoch: 155, Loss: 0.048, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 156, Loss: 0.045, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 157, Loss: 0.043, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 158, Loss: 0.040, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 159, Loss: 0.038, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 160, Loss: 0.036, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 161, Loss: 0.034, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 162, Loss: 0.032, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 163, Loss: 0.031, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 164, Loss: 0.030, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 165, Loss: 0.029, Generated SMILES: COCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 166, Loss: 0.028, Generated SMILES: COCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 167, Loss: 0.026, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 168, Loss: 0.025, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 169, Loss: 0.023, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 170, Loss: 0.022, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 171, Loss: 0.021, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 172, Loss: 0.020, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 173, Loss: 0.019, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 174, Loss: 0.018, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 175, Loss: 0.018, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 176, Loss: 0.017, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 177, Loss: 0.016, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 178, Loss: 0.015, Generated SMILES: C/CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 179, Loss: 0.015, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 180, Loss: 0.014, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 181, Loss: 0.013, Generated SMILES: OC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 182, Loss: 0.013, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 183, Loss: 0.012, Generated SMILES: O=N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)
Epoch: 184, Loss: 0.012, Generated SMILES: C/CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 185, Loss: 0.011, Generated SMILES: NNCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 186, Loss: 0.011, Generated SMILES: OCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 187, Loss: 0.010, Generated SMILES: CC(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 188, Loss: 0.010, Generated SMILES: C=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 189, Loss: 0.010, Generated SMILES: COCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 190, Loss: 0.009, Generated SMILES: NNCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 191, Loss: 0.009, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 192, Loss: 0.009, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 193, Loss: 0.008, Generated SMILES: CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 194, Loss: 0.008, Generated SMILES: O=CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 195, Loss: 0.008, Generated SMILES: N#CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 196, Loss: 0.008, Generated SMILES: CCCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 197, Loss: 0.007, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 198, Loss: 0.007, Generated SMILES: N#CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 199, Loss: 0.007, Generated SMILES: CCN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
Epoch: 200, Loss: 0.007, Generated SMILES: C/N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)COCc1cc(C)cc(c1O)COOc1ccc(c(c1)C)C(C)CC/C
最初のうちは短く単純な SMILES しか生成しなかったのが、だんだん長く複雑な SMILES を生成するようになり、やがて同じような SMILES しか生成しなくなるような感じがします(主観)。
学習曲線
Loss 自体はまだ下がりそうな気がしますが、生成する分子の多様性をしっかり評価して、多様性が下がらないうちに打ち切るのが良さそうな気がしますね。
import matplotlib.pyplot as plt
# Learning curve drawn twice side by side: linear y-axis (left), log y-axis (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for panel, use_log_scale in zip(axes, (False, True)):
    panel.plot(losses, label="Loss")
    panel.grid()
    if use_log_scale:
        panel.set_yscale('log')
    panel.set_xlabel("epoch")
    panel.legend()
plt.show()
分子生成例
次のようにして、最初の数文字を指定して生成できます。何度も呼び出せばその度に出力は(ある程度)変わります。
predict(dataset, model, text='C')
'CN(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)'
predict(dataset, model, text='c')
'c1c(C)ccc2CC[C@H]1CC=Cc2'
predict(dataset, model, text='N')
'N(CCCN1C(=CC(=C[C@@H](C1=O)C)C)C)COCc1c(C)cc(cc1C)C'
predict(dataset, model, text='O')
'OCCOc1cc(cc(c1OC)OC)C(=O)OC[C@@H](CC(=O)O)C'