added AI model

Navan Chauhan 2020-07-31 21:23:03 +05:30
parent 376f04d1df
commit 61ce4e7b08
62 changed files with 681 additions and 0 deletions

1
app/lstm_chem/__init__.py Executable file
@@ -0,0 +1 @@

122
app/lstm_chem/data_loader.py Executable file
@@ -0,0 +1,122 @@
import json
import os
import numpy as np
from tqdm import tqdm
from tensorflow.keras.utils import Sequence
from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
class DataLoader(Sequence):
def __init__(self, config, data_type='train'):
self.config = config
self.data_type = data_type
assert self.data_type in ['train', 'valid', 'finetune']
self.max_len = 0
if self.data_type == 'train':
self.smiles = self._load(self.config.data_filename)
elif self.data_type == 'finetune':
self.smiles = self._load(self.config.finetune_data_filename)
else:
pass
self.st = SmilesTokenizer()
self.one_hot_dict = self.st.one_hot_dict
self.tokenized_smiles = self._tokenize(self.smiles)
if self.data_type in ['train', 'valid']:
self.idx = np.arange(len(self.tokenized_smiles))
self.valid_size = int(
np.ceil(
len(self.tokenized_smiles) * self.config.validation_split))
np.random.seed(self.config.seed)
np.random.shuffle(self.idx)
def _set_data(self):
if self.data_type == 'train':
ret = [
self.tokenized_smiles[self.idx[i]]
for i in self.idx[self.valid_size:]
]
elif self.data_type == 'valid':
ret = [
self.tokenized_smiles[self.idx[i]]
for i in self.idx[:self.valid_size]
]
else:
ret = self.tokenized_smiles
return ret
def _load(self, data_filename):
length = self.config.data_length
print('loading SMILES...')
with open(data_filename) as f:
smiles = [s.rstrip() for s in f]
if length != 0:
smiles = smiles[:length]
print('done.')
return smiles
def _tokenize(self, smiles):
assert isinstance(smiles, list)
print('tokenizing SMILES...')
tokenized_smiles = [self.st.tokenize(smi) for smi in tqdm(smiles)]
if self.data_type == 'train':
for tokenized_smi in tokenized_smiles:
length = len(tokenized_smi)
if self.max_len < length:
self.max_len = length
self.config.train_smi_max_len = self.max_len
print('done.')
return tokenized_smiles
def __len__(self):
target_tokenized_smiles = self._set_data()
if self.data_type in ['train', 'valid']:
ret = int(
np.ceil(
len(target_tokenized_smiles) /
float(self.config.batch_size)))
else:
ret = int(
np.ceil(
len(target_tokenized_smiles) /
float(self.config.finetune_batch_size)))
return ret
def __getitem__(self, idx):
target_tokenized_smiles = self._set_data()
if self.data_type in ['train', 'valid']:
data = target_tokenized_smiles[idx *
self.config.batch_size:(idx + 1) *
self.config.batch_size]
else:
data = target_tokenized_smiles[idx *
self.config.finetune_batch_size:
(idx + 1) *
self.config.finetune_batch_size]
data = self._padding(data)
self.X, self.y = [], []
for tp_smi in data:
X = [self.one_hot_dict[symbol] for symbol in tp_smi[:-1]]
self.X.append(X)
y = [self.one_hot_dict[symbol] for symbol in tp_smi[1:]]
self.y.append(y)
self.X = np.array(self.X, dtype=np.float32)
self.y = np.array(self.y, dtype=np.float32)
return self.X, self.y
def _pad(self, tokenized_smi):
return ['G'] + tokenized_smi + ['E'] + [
'A' for _ in range(self.max_len - len(tokenized_smi))
]
def _padding(self, data):
padded_smiles = [self._pad(t_smi) for t_smi in data]
return padded_smiles
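
A minimal usage sketch for the loader (assumptions: run from app/ so lstm_chem is importable, and a config file with the keys shown in app/prod/config.json; since the 'valid' branch above never loads SMILES itself, a validation loader is obtained by copying a train loader):

from copy import copy
from lstm_chem.utils.config import process_config
from lstm_chem.data_loader import DataLoader

config = process_config('experiments/base_experiment/LSTM_Chem/config.json')  # assumed path
train_dl = DataLoader(config, data_type='train')  # loads, tokenizes and shuffles data_filename
valid_dl = copy(train_dl)                         # reuse the tokenized SMILES and shuffled indices
valid_dl.data_type = 'valid'
X, y = train_dl[0]  # one padded batch of one-hot inputs and next-token targets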

24
app/lstm_chem/finetuner.py Executable file
@@ -0,0 +1,24 @@
from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
from lstm_chem.generator import LSTMChemGenerator
class LSTMChemFinetuner(LSTMChemGenerator):
def __init__(self, modeler, finetune_data_loader):
self.session = modeler.session
self.model = modeler.model
self.config = modeler.config
self.finetune_data_loader = finetune_data_loader
self.st = SmilesTokenizer()
def finetune(self):
self.model.compile(optimizer=self.config.optimizer,
loss='categorical_crossentropy')
history = self.model.fit_generator(
self.finetune_data_loader,
steps_per_epoch=self.finetune_data_loader.__len__(),
epochs=self.config.finetune_epochs,
verbose=self.config.verbose_training,
use_multiprocessing=True,
shuffle=True)
return history
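
A sketch of wiring the finetuner to the rest of the package (assumptions: a config whose model_arch_filename and model_weight_filename point at an already-trained model, as in app/prod/config.json; fit_generator is deprecated in newer TensorFlow 2.x in favour of fit, but matches the API used here):

from lstm_chem.utils.config import process_config
from lstm_chem.model import LSTMChem
from lstm_chem.data_loader import DataLoader
from lstm_chem.finetuner import LSTMChemFinetuner

config = process_config('prod/config.json')             # assumed path to a trained experiment's config
modeler = LSTMChem(config, session='finetune')          # reloads architecture and weights
finetune_dl = DataLoader(config, data_type='finetune')  # reads finetune_data_filename
history = LSTMChemFinetuner(modeler, finetune_dl).finetune()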

44
app/lstm_chem/generator.py Executable file
@@ -0,0 +1,44 @@
from tqdm import tqdm
import numpy as np
from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
class LSTMChemGenerator(object):
def __init__(self, modeler):
self.session = modeler.session
self.model = modeler.model
self.config = modeler.config
self.st = SmilesTokenizer()
def _generate(self, sequence):
while (sequence[-1] != 'E') and (len(self.st.tokenize(sequence)) <=
self.config.smiles_max_length):
x = self.st.one_hot_encode(self.st.tokenize(sequence))
preds = self.model.predict_on_batch(x)[0][-1]
next_idx = self.sample_with_temp(preds)
sequence += self.st.table[next_idx]
sequence = sequence[1:].rstrip('E')
return sequence
def sample_with_temp(self, preds):
        stretched = np.log(preds) / self.config.sampling_temp
        stretched_probs = np.exp(stretched) / np.sum(np.exp(stretched))
        return np.random.choice(range(len(stretched)), p=stretched_probs)
def sample(self, num=1, start='G'):
sampled = []
if self.session == 'generate':
for _ in tqdm(range(num)):
sampled.append(self._generate(start))
return sampled
else:
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')
while len(sampled) < num:
sequence = self._generate(start)
mol = Chem.MolFromSmiles(sequence)
if mol is not None:
canon_smiles = Chem.MolToSmiles(mol)
sampled.append(canon_smiles)
return sampled
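
A sampling sketch (assumption: a config pointing at trained weights). With session='generate' the raw sequences are returned as-is; any other session runs the RDKit branch above, keeping only parsable molecules as canonical SMILES:

from lstm_chem.utils.config import process_config
from lstm_chem.model import LSTMChem
from lstm_chem.generator import LSTMChemGenerator

config = process_config('prod/config.json')    # assumed path
modeler = LSTMChem(config, session='generate')
generator = LSTMChemGenerator(modeler)
smiles = generator.sample(num=100)             # each sequence grows from the 'G' start token until 'E' or smiles_max_length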

73
app/lstm_chem/model.py Executable file
@@ -0,0 +1,73 @@
import os
import time
from tensorflow.keras import Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.initializers import RandomNormal
from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer
class LSTMChem(object):
def __init__(self, config, session='train'):
assert session in ['train', 'generate', 'finetune'], \
'one of {train, generate, finetune}'
self.config = config
self.session = session
self.model = None
if self.session == 'train':
self.build_model()
else:
self.model = self.load(self.config.model_arch_filename,
self.config.model_weight_filename)
def build_model(self):
st = SmilesTokenizer()
n_table = len(st.table)
weight_init = RandomNormal(mean=0.0,
stddev=0.05,
seed=self.config.seed)
self.model = Sequential()
self.model.add(
LSTM(units=self.config.units,
input_shape=(None, n_table),
return_sequences=True,
kernel_initializer=weight_init,
dropout=0.3))
self.model.add(
LSTM(units=self.config.units,
input_shape=(None, n_table),
return_sequences=True,
kernel_initializer=weight_init,
dropout=0.5))
self.model.add(
Dense(units=n_table,
activation='softmax',
kernel_initializer=weight_init))
arch = self.model.to_json(indent=2)
self.config.model_arch_filename = os.path.join(self.config.exp_dir,
'model_arch.json')
with open(self.config.model_arch_filename, 'w') as f:
f.write(arch)
self.model.compile(optimizer=self.config.optimizer,
loss='categorical_crossentropy')
def save(self, checkpoint_path):
assert self.model, 'You have to build the model first.'
print('Saving model ...')
self.model.save_weights(checkpoint_path)
print('model saved.')
def load(self, model_arch_file, checkpoint_file):
print(f'Loading model architecture from {model_arch_file} ...')
with open(model_arch_file) as f:
model = model_from_json(f.read())
print(f'Loading model checkpoint from {checkpoint_file} ...')
model.load_weights(checkpoint_file)
print('Loaded the Model.')
return model
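
The session argument decides the behaviour: 'train' builds and compiles the two-layer LSTM and writes model_arch.json into exp_dir, while 'generate' and 'finetune' reload a saved architecture plus checkpoint. A small sketch (the config path is an assumption):

from lstm_chem.utils.config import process_config
from lstm_chem.model import LSTMChem

config = process_config('base_config.json')   # hypothetical training config with the keys used above
modeler = LSTMChem(config, session='train')   # builds and compiles the network
modeler.model.summary()                       # with the prod settings: two LSTM(256) layers, then Dense(52, softmax)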

56
app/lstm_chem/trainer.py Executable file
@@ -0,0 +1,56 @@
from glob import glob
import os
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
class LSTMChemTrainer(object):
def __init__(self, modeler, train_data_loader, valid_data_loader):
self.model = modeler.model
self.config = modeler.config
self.train_data_loader = train_data_loader
self.valid_data_loader = valid_data_loader
self.callbacks = []
self.init_callbacks()
def init_callbacks(self):
self.callbacks.append(
ModelCheckpoint(
filepath=os.path.join(
self.config.checkpoint_dir,
'%s-{epoch:02d}-{val_loss:.2f}.hdf5' %
self.config.exp_name),
monitor=self.config.checkpoint_monitor,
mode=self.config.checkpoint_mode,
save_best_only=self.config.checkpoint_save_best_only,
save_weights_only=self.config.checkpoint_save_weights_only,
verbose=self.config.checkpoint_verbose,
))
self.callbacks.append(
TensorBoard(
log_dir=self.config.tensorboard_log_dir,
write_graph=self.config.tensorboard_write_graph,
))
def train(self):
history = self.model.fit_generator(
self.train_data_loader,
steps_per_epoch=self.train_data_loader.__len__(),
epochs=self.config.num_epochs,
verbose=self.config.verbose_training,
validation_data=self.valid_data_loader,
validation_steps=self.valid_data_loader.__len__(),
use_multiprocessing=True,
shuffle=True,
callbacks=self.callbacks)
last_weight_file = glob(
os.path.join(
f'{self.config.checkpoint_dir}',
f'{self.config.exp_name}-{self.config.num_epochs:02}*.hdf5')
)[0]
assert os.path.exists(last_weight_file)
self.config.model_weight_filename = last_weight_file
with open(os.path.join(self.config.exp_dir, 'config.json'), 'w') as f:
f.write(self.config.toJSON(indent=2))
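
Putting the pieces together for a training run, a sketch under the same assumptions (run from app/; the validation loader is a copy of the train loader, as noted for the data loader above):

from copy import copy
from lstm_chem.utils.config import process_config
from lstm_chem.utils.dirs import create_dirs
from lstm_chem.data_loader import DataLoader
from lstm_chem.model import LSTMChem
from lstm_chem.trainer import LSTMChemTrainer

config = process_config('base_config.json')   # hypothetical config
create_dirs([config.exp_dir, config.tensorboard_log_dir, config.checkpoint_dir])

modeler = LSTMChem(config, session='train')
train_dl = DataLoader(config, data_type='train')
valid_dl = copy(train_dl)
valid_dl.data_type = 'valid'

LSTMChemTrainer(modeler, train_dl, valid_dl).train()  # checkpoints each epoch, then records the last weight file in config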

26
app/lstm_chem/utils/config.py Executable file
@@ -0,0 +1,26 @@
import os
import time
import json
from bunch import Bunch
def get_config_from_json(json_file):
with open(json_file, 'r') as config_file:
config_dict = json.load(config_file)
config = Bunch(config_dict)
return config
def process_config(json_file):
config = get_config_from_json(json_file)
config.config_file = json_file
config.exp_dir = os.path.join(
'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
config.exp_name)
config.tensorboard_log_dir = os.path.join(
'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
config.exp_name, 'logs/')
config.checkpoint_dir = os.path.join(
'experiments', time.strftime('%Y-%m-%d/', time.localtime()),
config.exp_name, 'checkpoints/')
return config
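
process_config wraps the raw JSON in a Bunch for attribute access and adds date-stamped exp_dir, tensorboard_log_dir and checkpoint_dir paths under experiments/. For example (the path is an assumption):

from lstm_chem.utils.config import process_config

config = process_config('prod/config.json')   # assumed path
print(config.exp_name)                        # LSTM_Chem
print(config.checkpoint_dir)                  # experiments/<today>/LSTM_Chem/checkpoints/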

12
app/lstm_chem/utils/dirs.py Executable file
@@ -0,0 +1,12 @@
import os
import sys
def create_dirs(dirs):
try:
for dir_ in dirs:
if not os.path.exists(dir_):
os.makedirs(dir_)
except Exception as err:
print(f'Creating directories error: {err}')
sys.exit()

@@ -0,0 +1,72 @@
import copy
import numpy as np
import time
class SmilesTokenizer(object):
def __init__(self):
atoms = [
'Li',
'Na',
'Al',
'Si',
'Cl',
'Sc',
'Zn',
'As',
'Se',
'Br',
'Sn',
'Te',
'Cn',
'H',
'B',
'C',
'N',
'O',
'F',
'P',
'S',
'K',
'V',
'I',
]
special = [
'(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
'6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
]
padding = ['G', 'A', 'E']
self.table = sorted(atoms, key=len, reverse=True) + special + padding
self.table_len = len(self.table)
self.one_hot_dict = {}
for i, symbol in enumerate(self.table):
vec = np.zeros(self.table_len, dtype=np.float32)
vec[i] = 1
self.one_hot_dict[symbol] = vec
def tokenize(self, smiles):
N = len(smiles)
i = 0
token = []
timeout = time.time() + 5 # 5 seconds from now
while (i < N):
for j in range(self.table_len):
symbol = self.table[j]
if symbol == smiles[i:i + len(symbol)]:
token.append(symbol)
i += len(symbol)
break
if time.time() > timeout:
break
return token
def one_hot_encode(self, tokenized_smiles):
result = np.array(
[self.one_hot_dict[symbol] for symbol in tokenized_smiles],
dtype=np.float32)
result = result.reshape(1, result.shape[0], result.shape[1])
return result

@@ -0,0 +1,72 @@
import copy
import numpy as np
import time
class SmilesTokenizer(object):
def __init__(self):
atoms = [
'Li',
'Na',
'Al',
'Si',
'Cl',
'Sc',
'Zn',
'As',
'Se',
'Br',
'Sn',
'Te',
'Cn',
'H',
'B',
'C',
'N',
'O',
'F',
'P',
'S',
'K',
'V',
'I',
]
special = [
'(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
'6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
]
padding = ['G', 'A', 'E']
self.table = sorted(atoms, key=len, reverse=True) + special + padding
self.table_len = len(self.table)
self.one_hot_dict = {}
for i, symbol in enumerate(self.table):
vec = np.zeros(self.table_len, dtype=np.float32)
vec[i] = 1
self.one_hot_dict[symbol] = vec
def tokenize(self, smiles):
N = len(smiles)
i = 0
token = []
timeout = time.time() + 5 # 5 seconds from now
while (i < N):
for j in range(self.table_len):
symbol = self.table[j]
if symbol == smiles[i:i + len(symbol)]:
token.append(symbol)
i += len(symbol)
break
if time.time() > timeout:
break
return token
def one_hot_encode(self, tokenized_smiles):
result = np.array(
[self.one_hot_dict[symbol] for symbol in tokenized_smiles],
dtype=np.float32)
result = result.reshape(1, result.shape[0], result.shape[1])
return result
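
A quick check of the tokenizer (assumption: run from app/ so the package is importable). Multi-character atoms such as 'Cl' and 'Br' match before single letters because the atom table is sorted by length, and the full vocabulary has 52 symbols, matching the model's input and output dimension:

from lstm_chem.utils.smiles_tokenizer import SmilesTokenizer

st = SmilesTokenizer()
tokens = st.tokenize('CC(=O)Oc1ccccc1C(=O)O')  # aspirin, chosen arbitrarily
x = st.one_hot_encode(tokens)                  # numpy array of shape (1, len(tokens), 52)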

42 binary files changed (contents not shown).

30
app/prod/config.json Normal file
@@ -0,0 +1,30 @@
{
"exp_name": "LSTM_Chem",
"data_filename": "./datasets/all_smiles_clean.txt",
"data_length": 0,
"units": 256,
"num_epochs": 42,
"optimizer": "adam",
"seed": 71,
"batch_size": 512,
"validation_split": 0.1,
"verbose_training": true,
"checkpoint_monitor": "val_loss",
"checkpoint_mode": "min",
"checkpoint_save_best_only": false,
"checkpoint_save_weights_only": true,
"checkpoint_verbose": 1,
"tensorboard_write_graph": true,
"sampling_temp": 0.75,
"smiles_max_length": 128,
"finetune_epochs": 12,
"finetune_batch_size": 1,
"finetune_data_filename": "./datasets/protease_inhibitors_for_fine-tune.txt",
"config_file": "experiments/base_experiment/LSTM_Chem/config.json",
"exp_dir": "experiments/2020-07-13/LSTM_Chem",
"tensorboard_log_dir": "experiments/2020-07-13/LSTM_Chem/logs/",
"checkpoint_dir": "experiments/2020-07-13/LSTM_Chem/checkpoints/",
"train_smi_max_len": 128,
"model_arch_filename": "experiments/2020-07-13/LSTM_Chem/model_arch.json",
"model_weight_filename": "experiments/2020-07-13/LSTM_Chem/checkpoints/LSTM_Chem-42-0.23.hdf5"
}

149
app/prod/model_arch.json Normal file
@@ -0,0 +1,149 @@
{
"class_name": "Sequential",
"config": {
"name": "sequential",
"layers": [
{
"class_name": "LSTM",
"config": {
"name": "lstm",
"trainable": true,
"batch_input_shape": [
null,
null,
52
],
"dtype": "float32",
"return_sequences": true,
"return_state": false,
"go_backwards": false,
"stateful": false,
"unroll": false,
"time_major": false,
"units": 256,
"activation": "tanh",
"recurrent_activation": "sigmoid",
"use_bias": true,
"kernel_initializer": {
"class_name": "RandomNormal",
"config": {
"mean": 0.0,
"stddev": 0.05,
"seed": 71
}
},
"recurrent_initializer": {
"class_name": "Orthogonal",
"config": {
"gain": 1.0,
"seed": null
}
},
"bias_initializer": {
"class_name": "Zeros",
"config": {}
},
"unit_forget_bias": true,
"kernel_regularizer": null,
"recurrent_regularizer": null,
"bias_regularizer": null,
"activity_regularizer": null,
"kernel_constraint": null,
"recurrent_constraint": null,
"bias_constraint": null,
"dropout": 0.3,
"recurrent_dropout": 0.0,
"implementation": 2
}
},
{
"class_name": "LSTM",
"config": {
"name": "lstm_1",
"trainable": true,
"batch_input_shape": [
null,
null,
52
],
"dtype": "float32",
"return_sequences": true,
"return_state": false,
"go_backwards": false,
"stateful": false,
"unroll": false,
"time_major": false,
"units": 256,
"activation": "tanh",
"recurrent_activation": "sigmoid",
"use_bias": true,
"kernel_initializer": {
"class_name": "RandomNormal",
"config": {
"mean": 0.0,
"stddev": 0.05,
"seed": 71
}
},
"recurrent_initializer": {
"class_name": "Orthogonal",
"config": {
"gain": 1.0,
"seed": null
}
},
"bias_initializer": {
"class_name": "Zeros",
"config": {}
},
"unit_forget_bias": true,
"kernel_regularizer": null,
"recurrent_regularizer": null,
"bias_regularizer": null,
"activity_regularizer": null,
"kernel_constraint": null,
"recurrent_constraint": null,
"bias_constraint": null,
"dropout": 0.5,
"recurrent_dropout": 0.0,
"implementation": 2
}
},
{
"class_name": "Dense",
"config": {
"name": "dense",
"trainable": true,
"dtype": "float32",
"units": 52,
"activation": "softmax",
"use_bias": true,
"kernel_initializer": {
"class_name": "RandomNormal",
"config": {
"mean": 0.0,
"stddev": 0.05,
"seed": 71
}
},
"bias_initializer": {
"class_name": "Zeros",
"config": {}
},
"kernel_regularizer": null,
"bias_regularizer": null,
"activity_regularizer": null,
"kernel_constraint": null,
"bias_constraint": null
}
}
],
"build_input_shape": [
null,
null,
52
]
},
"keras_version": "2.3.0-tf",
"backend": "tensorflow"
}
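
The architecture JSON above can be reloaded directly with Keras and paired with the checkpoint named in prod/config.json; a sketch (the weight path is copied from that config and assumed to ship alongside it):

from tensorflow.keras.models import model_from_json

with open('prod/model_arch.json') as f:
    model = model_from_json(f.read())
model.load_weights('experiments/2020-07-13/LSTM_Chem/checkpoints/LSTM_Chem-42-0.23.hdf5')  # path from config.json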