※この記事のコードは "Structure Generator based on R-Group (SGRG)" (URL: https://github.com/hkaneko1985/structure_generator_based_on_r_group )のコード(MITライセンス)を流用しています。
#はじめに
「Structure Generator based on R-Group (SGRG) を使って何ができますか?」(https://datachemeng.com/how_to_use_sgrg/ )では、「主骨格はそのままで、側鎖を変えた化学構造を生成する」プログラムが公開されている。大変素晴らしい。
ただ、全Rグループに、同じ側鎖群しか指定することができないようである。例えば、Rグループとして、R1とR2を定義し、それぞれに異なる側鎖群を指定することができないようだ。
そこで今回は、上で公開されているプログラムのうち, "structure_generator_based_on_r_group_all_combinations.py" を改変し、R1とR2にそれぞれ別の側鎖群を定義したsmiファイル(側鎖群がSMILESで列挙されたもの)を指定し、R1, R2それぞれ自身の側鎖群の側鎖しか生成されないようにしてみた。
#ソース
RDKitを駆使しまくっており、相当難易度の高いプログラムであったが、何とか意図の動作になるよう改変してみた。
ただ、Rグループは"R1", "R2"に限定されており、汎用性はまだ低い。
import numpy as np
from numpy import matlib
from rdkit import Chem
bond_list = [Chem.rdchem.BondType.UNSPECIFIED, Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.QUADRUPLE, Chem.rdchem.BondType.QUINTUPLE,
Chem.rdchem.BondType.HEXTUPLE, Chem.rdchem.BondType.ONEANDAHALF, Chem.rdchem.BondType.TWOANDAHALF,
Chem.rdchem.BondType.THREEANDAHALF, Chem.rdchem.BondType.FOURANDAHALF, Chem.rdchem.BondType.FIVEANDAHALF,
Chem.rdchem.BondType.AROMATIC, Chem.rdchem.BondType.IONIC, Chem.rdchem.BondType.HYDROGEN,
Chem.rdchem.BondType.THREECENTER, Chem.rdchem.BondType.DATIVEONE, Chem.rdchem.BondType.DATIVE,
Chem.rdchem.BondType.DATIVEL, Chem.rdchem.BondType.DATIVER, Chem.rdchem.BondType.OTHER,
Chem.rdchem.BondType.ZERO]
# 主骨格
main_molecule = Chem.MolFromMolFile('main.sdf')
#R1 用側鎖
fragment_molecules = [molecule for molecule in Chem.SDMolSupplier('fragments1.sdf') if molecule is not None]
#R2 用側鎖
fragment_molecules2 = [molecule for molecule in Chem.SDMolSupplier('fragments2.sdf') if molecule is not None]
number_of_fragment_molecules = len(fragment_molecules)
number_of_fragment_molecules2 = len(fragment_molecules2)
main_adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(main_molecule)
for bond in main_molecule.GetBonds():
main_adjacency_matrix[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] = bond_list.index(bond.GetBondType())
main_adjacency_matrix[bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()] = bond_list.index(bond.GetBondType())
main_atoms = []
for atom in main_molecule.GetAtoms():
print(atom.GetSymbol())
main_atoms.append(atom.GetSymbol())
#ここをR1, R2に変更
#r_index_in_main_molecule_old = [index for index, atom in enumerate(main_atoms) if atom == '*']
r_index_in_main_molecule_old = [index for index, atom in enumerate(main_atoms) if atom == 'R1' or atom == 'R2']
for index, r_index in enumerate(r_index_in_main_molecule_old):
modified_index = r_index - index
atom = main_atoms.pop(modified_index)
main_atoms.append(atom)
tmp = main_adjacency_matrix[:, modified_index:modified_index + 1].copy()
main_adjacency_matrix = np.delete(main_adjacency_matrix, modified_index, 1)
main_adjacency_matrix = np.c_[main_adjacency_matrix, tmp]
tmp = main_adjacency_matrix[modified_index:modified_index + 1, :].copy()
main_adjacency_matrix = np.delete(main_adjacency_matrix, modified_index, 0)
main_adjacency_matrix = np.r_[main_adjacency_matrix, tmp]
#ここをR1, R2に変更
#r_index_in_main_molecule_new = [index for index, atom in enumerate(main_atoms) if atom == '*']
r_index_in_main_molecule_new = [index for index, atom in enumerate(main_atoms) if atom == 'R1' or atom == 'R2']
r_bonded_atom_index_in_main_molecule = []
for number in r_index_in_main_molecule_new:
r_bonded_atom_index_in_main_molecule.append(np.where(main_adjacency_matrix[number, :] != 0)[0][0])
r_bond_number_in_main_molecule = main_adjacency_matrix[
r_index_in_main_molecule_new, r_bonded_atom_index_in_main_molecule]
main_adjacency_matrix = np.delete(main_adjacency_matrix, r_index_in_main_molecule_new, 0)
main_adjacency_matrix = np.delete(main_adjacency_matrix, r_index_in_main_molecule_new, 1)
for i in range(len(r_index_in_main_molecule_new)):
#ここをR1, R2に変更
#main_atoms.remove('*')
try:
main_atoms.remove('R1')
main_atoms.remove('R2')
except:
pass
main_size = main_adjacency_matrix.shape[0]
all_fragment_combinations = np.arange(number_of_fragment_molecules)
all_fragment_combinations = np.reshape(all_fragment_combinations, (all_fragment_combinations.shape[0], 1))
#ここを修正
#if len(r_index_in_main_molecule_new) > 1:
# for variable_number in range(2, len(r_index_in_main_molecule_new) + 1):
# grid_seed_tmp = matlib.repmat(np.arange(number_of_fragment_molecules), #all_fragment_combinations.shape[0], 1)
# all_fragment_combinations = np.c_[
# matlib.repmat(all_fragment_combinations, #len(np.arange(number_of_fragment_molecules)), 1),
# np.reshape(grid_seed_tmp.T, (np.prod(grid_seed_tmp.shape), 1))]
grid_seed_tmp = matlib.repmat(np.arange(number_of_fragment_molecules2), all_fragment_combinations.shape[0], 1)
all_fragment_combinations = np.c_[
matlib.repmat(all_fragment_combinations, len(np.arange(number_of_fragment_molecules2)), 1),
np.reshape(grid_seed_tmp.T, (np.prod(grid_seed_tmp.shape), 1))]
generated_structures = []
for generated_structure_number in range(all_fragment_combinations.shape[0]):
generated_molecule_atoms = main_atoms[:]
generated_adjacency_matrix = main_adjacency_matrix.copy()
for r_number_in_molecule in range(len(r_index_in_main_molecule_new)):
#ここを1R1, R2に対応
#fragment_molecule = fragment_molecules[
# all_fragment_combinations[generated_structure_number, r_number_in_molecule]]
if r_number_in_molecule == 0:
fragment_molecule = fragment_molecules[
all_fragment_combinations[generated_structure_number, r_number_in_molecule]]
else:
fragment_molecule = fragment_molecules2[
all_fragment_combinations[generated_structure_number, r_number_in_molecule]]
fragment_adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(fragment_molecule)
for bond in fragment_molecule.GetBonds():
fragment_adjacency_matrix[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] = bond_list.index(
bond.GetBondType())
fragment_adjacency_matrix[bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()] = bond_list.index(
bond.GetBondType())
fragment_atoms = []
for atom in fragment_molecule.GetAtoms():
fragment_atoms.append(atom.GetSymbol())
# integrate adjacency matrix
r_index_in_fragment_molecule = fragment_atoms.index('*')
r_bonded_atom_index_in_fragment_molecule = \
np.where(fragment_adjacency_matrix[r_index_in_fragment_molecule, :] != 0)[0][0]
r_bond_number_in_fragment_molecule = fragment_adjacency_matrix[
r_index_in_fragment_molecule, r_bonded_atom_index_in_fragment_molecule]
if r_bonded_atom_index_in_fragment_molecule > r_index_in_fragment_molecule:
r_bonded_atom_index_in_fragment_molecule -= 1
fragment_atoms.remove('*')
fragment_adjacency_matrix = np.delete(fragment_adjacency_matrix, r_index_in_fragment_molecule, 0)
fragment_adjacency_matrix = np.delete(fragment_adjacency_matrix, r_index_in_fragment_molecule, 1)
fragment_size = fragment_adjacency_matrix.shape[0]
main_size = generated_adjacency_matrix.shape[0]
generated_adjacency_matrix = np.c_[generated_adjacency_matrix, np.zeros(
[generated_adjacency_matrix.shape[0], fragment_adjacency_matrix.shape[0]], dtype='int32')]
generated_adjacency_matrix = np.r_[generated_adjacency_matrix, np.zeros(
[fragment_adjacency_matrix.shape[0], generated_adjacency_matrix.shape[1]], dtype='int32')]
generated_adjacency_matrix[
r_bonded_atom_index_in_main_molecule[r_number_in_molecule],
r_bonded_atom_index_in_fragment_molecule + main_size] = \
r_bond_number_in_main_molecule[r_number_in_molecule]
generated_adjacency_matrix[
r_bonded_atom_index_in_fragment_molecule + main_size,
r_bonded_atom_index_in_main_molecule[ r_number_in_molecule]] = r_bond_number_in_main_molecule[r_number_in_molecule]
generated_adjacency_matrix[main_size:, main_size:] = fragment_adjacency_matrix
# integrate atoms
generated_molecule_atoms += fragment_atoms
# generate structures
generated_molecule = Chem.RWMol()
atom_index = []
for atom_number in range(len(generated_molecule_atoms)):
atom = Chem.Atom(generated_molecule_atoms[atom_number])
molecular_index = generated_molecule.AddAtom(atom)
atom_index.append(molecular_index)
for index_x, row_vector in enumerate(generated_adjacency_matrix):
for index_y, bond in enumerate(row_vector):
if index_y <= index_x:
continue
if bond == 0:
continue
else:
generated_molecule.AddBond(atom_index[index_x], atom_index[index_y], bond_list[bond])
generated_molecule = generated_molecule.GetMol()
generated_structures.append(Chem.MolToSmiles(generated_molecule))
if (generated_structure_number + 1) % 1000 == 0 or (generated_structure_number + 1) == \
all_fragment_combinations.shape[0]:
print(generated_structure_number + 1, '/', all_fragment_combinations.shape[0])
str_ = '\n'.join(generated_structures)
with open('generated_structures.smi', 'wt') as writer:
writer.write(str_)
writer.close()
#最後に
Rグループ毎に、異なるsmiファイルを指定できるよう汎用化してみたい。
今回は、元のソースから化合物を編集する際のRDkitの新しい使い方も勉強させていただいた。この使い方で何ができるか考えて見たい。