Prepare a CSV of ligand SMILES for the target receptor (or suitable analog SMILES) in Ligands,SMILES format. The script then takes the CSV path and a trained-model path via input().
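For example, the input CSV might look like the following (the two ligands shown are illustrative placeholders; use ligands known to bind your receptor):

Ligands,SMILES
dopamine,NCCc1ccc(O)c(O)c1
serotonin,NCCc1c[nH]c2ccc(O)cc12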
import pandas as pd
from rdkit import Chem
import subprocess
import os
from datetime import datetime
import concurrent.futures
import numpy as np
from rdkit.Chem import AllChem, rdMolDescriptors
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
def smiles_to_vector(smiles):
    # Encode a SMILES string as a 2048-bit Morgan fingerprint (radius 2)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")
    return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
def generate_smiles(model, length, start_token=None, existing_smiles=None, ligand_data=None):
    # Avoid a mutable default argument for the shared set
    if existing_smiles is None:
        existing_smiles = set()
    generated_smiles = start_token if start_token else ""
    for _ in range(length):
        try:
            x_pred = smiles_to_vector(generated_smiles)
        except ValueError as e:
            print(e)
            continue  # Skip to the next iteration if an invalid SMILES was produced
        x_pred = np.reshape(x_pred, (1, 2048, 1))
        if model:
            next_char_probs = model.predict(x_pred, verbose=0)[0]
            next_char_idx = np.random.choice(len(next_char_probs), p=next_char_probs)
        else:
            next_char_idx = np.random.choice(len(ligand_data['SMILES']))  # Pick an index at random
        # Look up the SMILES fragment corresponding to the sampled index
        try:
            next_char_smiles = ligand_data['SMILES'][next_char_idx]
        except IndexError:
            continue  # Skip if the sampled index is out of range
        # Append the sampled fragment to the growing SMILES
        generated_smiles += next_char_smiles
        # Check whether the resulting SMILES is valid and unseen
        mol = Chem.MolFromSmiles(generated_smiles)
        if mol is None or generated_smiles in existing_smiles:
            # Reset to the STEM scaffold, or to an empty string if there is no STEM
            generated_smiles = start_token if start_token else ""
            continue
    return generated_smiles
def generate_smiles_multithreaded(model, attempts, length, start_token=None, max_threads=4, ligand_data=None):
    generated_smiles_list = []
    existing_smiles = set()

    def worker():
        for _ in range(attempts // max_threads):
            generated_smiles = generate_smiles(model, length, start_token=start_token,
                                               existing_smiles=existing_smiles, ligand_data=ligand_data)
            if not generated_smiles:
                continue  # Skip empty results
            mol = Chem.MolFromSmiles(generated_smiles)
            if mol and rdMolDescriptors.CalcExactMolWt(mol) < 600:  # Cap molecular weight at 600
                if generated_smiles != start_token and generated_smiles not in existing_smiles:
                    generated_smiles_list.append(generated_smiles)
                    existing_smiles.add(generated_smiles)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(worker) for _ in range(max_threads)]
        concurrent.futures.wait(futures)
    return generated_smiles_list
def train_model(ligand_data):
    # Convert each SMILES to a fingerprint vector
    X = np.array([smiles_to_vector(smiles) for smiles in ligand_data['SMILES']])
    # Build the LSTM model
    model = Sequential([
        LSTM(128, input_shape=(2048, 1), return_sequences=True),
        LSTM(128),
        Dense(len(ligand_data['SMILES']), activation='softmax')  # softmax so the output is a probability distribution
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # Reshape the data to (samples, 2048, 1)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    # Prepare the target data
    y = np.eye(len(ligand_data['SMILES']))  # one-hot encoding
    y = np.tile(y, (X.shape[0] // len(y) + 1, 1))[:X.shape[0]]  # Repeat to match the number of rows in X
    # Train the model
    model.fit(X, y, epochs=10, batch_size=1)
    # Save the trained model
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    model.save(f'trained_model_{timestamp}.h5')
    return model
def main():
    # Ask the user for the input CSV path and an optional trained model
    input_csv = input("Enter the path to the input CSV file containing SMILES: ")
    model_path = input("Enter the path to the trained model (or 'n' if no model): ")
    # Load the CSV file
    ligand_data_df = pd.read_csv(input_csv)
    ligand_data = {
        'Ligands': ligand_data_df['Ligands'].tolist(),
        'SMILES': ligand_data_df['SMILES'].tolist()
    }
    # Load the trained model, or train a new one
    if model_path.lower() == 'n':
        model = train_model(ligand_data)
    else:
        model = load_model(model_path)
    # Ask the user for the STEM scaffold (initial token)
    initial_smiles = input("Enter the initial SMILES (STEM structure, leave empty if no STEM): ")
    # Generate valid SMILES over up to 1000 attempts (multithreaded)
    generated_smiles_list = generate_smiles_multithreaded(model, 1000, 5, start_token=initial_smiles or None,
                                                          max_threads=20, ligand_data=ligand_data)
    if generated_smiles_list:
        # Save the generated SMILES to a CSV
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f'generated_SMILES_{timestamp}.csv'
        df = pd.DataFrame(generated_smiles_list, columns=["SMILES"])
        df.to_csv(output_filename, index=False)
        print(f"Generated SMILES saved to {output_filename}")
        # Convert SMILES to PDB, naming the output directory after the CSV
        base_name = os.path.splitext(output_filename)[0]
        output_dir = base_name
        os.makedirs(output_dir, exist_ok=True)
        for i, smiles in enumerate(generated_smiles_list):
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                sdf_filename = os.path.join(output_dir, f"{base_name}_molecule_{i+1}.sdf")
                pdb_filename = os.path.join(output_dir, f"{base_name}_molecule_{i+1}.pdb")
                # Write the molecule to SDF
                sdf_wtr = Chem.SDWriter(sdf_filename)
                sdf_wtr.write(mol)
                sdf_wtr.close()
                # Convert SDF to PDB with Open Babel
                subprocess.run(["obabel", "-i", "sdf", sdf_filename, "-o", "pdb", "-O", pdb_filename])
            else:
                print(f"Invalid SMILES at row {i}: {smiles}")
        # Convert the PDB files to PDBQT
        pdbqt_output_dir = os.path.join(output_dir, f"{timestamp}_pdbqt")
        os.makedirs(pdbqt_output_dir, exist_ok=True)
        for pdb_file in os.listdir(output_dir):
            if pdb_file.endswith(".pdb"):
                pdb_base = os.path.splitext(pdb_file)[0]  # Renamed to avoid shadowing base_name above
                input_path = os.path.join(output_dir, pdb_file)
                output_path = os.path.join(pdbqt_output_dir, pdb_base + ".pdbqt")
                # Convert with Open Babel
                subprocess.run(["obabel", input_path, "-O", output_path])
        print("PDB to PDBQT conversion completed!")
    else:
        print("Failed to generate valid SMILES within the attempt limit")

if __name__ == "__main__":
    main()
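Note that Chem.MolFromSmiles yields molecules without 3D coordinates, so the SDF and PDB files written above have no real geometry. For docking you will likely want 3D structures; one option is Open Babel's --gen3d flag, another is embedding coordinates in RDKit before writing the SDF. A minimal sketch of the latter (embed_3d is a hypothetical helper, not part of the script above):

from rdkit import Chem
from rdkit.Chem import AllChem

def embed_3d(mol):
    # Add explicit hydrogens, then embed a 3D conformer with ETKDGv3
    mol_h = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol_h, AllChem.ETKDGv3()) == -1:
        return None  # Embedding failed for this molecule
    AllChem.MMFFOptimizeMolecule(mol_h)  # Quick force-field cleanup
    return mol_h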
Run the resulting PDBQT files through a docking test with AutoDock.
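As an illustration, a single run with AutoDock Vina might be invoked as below; the receptor file and grid-box values are placeholders that depend on your target, and the call style matches the subprocess usage in the script above:

import subprocess

# All file names and grid-box values below are placeholders for your target.
subprocess.run([
    "vina",
    "--receptor", "receptor.pdbqt",   # prepared receptor (placeholder)
    "--ligand", "ligand.pdbqt",       # one of the generated PDBQT files
    "--center_x", "0", "--center_y", "0", "--center_z", "0",
    "--size_x", "20", "--size_y", "20", "--size_z", "20",
    "--out", "docked.pdbqt",
])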