class DataLoader(Sequence):
    """Keras ``Sequence`` that serves one-hot encoded SMILES batches.

    ``data_type`` selects the data source and the batching parameters:

    * ``'train'`` and ``'valid'`` both read ``config.data_filename`` and serve
      complementary parts of one seeded shuffle (``config.seed``,
      ``config.validation_split``) in batches of ``config.batch_size``.
    * ``'finetune'`` reads ``config.finetune_data_filename`` and serves every
      sample in file order in batches of ``config.finetune_batch_size``.

    Every batch is a pair ``(X, y)`` of float32 arrays; ``y`` is ``X`` moved
    one token ahead, i.e. the next-token target for each position.
    """

    def __init__(self, config, data_type='train'):
        """Load, tokenize and (for train/valid) shuffle the SMILES file.

        Args:
            config: attribute-style config; the fields read here are
                ``data_filename`` / ``finetune_data_filename``,
                ``data_length``, ``validation_split`` and ``seed``.
            data_type: one of ``'train'``, ``'valid'`` or ``'finetune'``.
        """
        self.config = config
        self.data_type = data_type
        assert self.data_type in ['train', 'valid', 'finetune']

        self.max_len = 0

        # The validation split is carved out of the training file, so a
        # loader built directly with data_type='valid' has to read that same
        # file (previously it read nothing and failed on self.smiles).
        if self.data_type == 'finetune':
            self.smiles = self._load(self.config.finetune_data_filename)
        else:
            self.smiles = self._load(self.config.data_filename)

        self.st = SmilesTokenizer()
        self.one_hot_dict = self.st.one_hot_dict

        self.tokenized_smiles = self._tokenize(self.smiles)

        if self.data_type in ['train', 'valid']:
            self.idx = np.arange(len(self.tokenized_smiles))
            self.valid_size = int(
                np.ceil(
                    len(self.tokenized_smiles) * self.config.validation_split))
            # Seeding makes the permutation, and therefore the split,
            # reproducible for a given config.seed.
            np.random.seed(self.config.seed)
            np.random.shuffle(self.idx)

    def _split_positions(self):
        """Return the slice of ``self.idx`` owned by this split.

        Returns ``None`` for ``'finetune'``, which uses every sample as-is.
        ``data_type`` is read on each call so that a loader whose
        ``data_type`` is reassigned after construction switches split.
        """
        if self.data_type == 'train':
            return self.idx[self.valid_size:]
        if self.data_type == 'valid':
            return self.idx[:self.valid_size]
        return None

    def _batch_size(self):
        """Return the batch size configured for the current ``data_type``."""
        if self.data_type in ['train', 'valid']:
            return self.config.batch_size
        return self.config.finetune_batch_size

    def _set_data(self):
        """Return the full list of tokenized SMILES for the current split.

        Kept for callers that want the whole split at once; ``__len__`` and
        ``__getitem__`` no longer call it because building this list costs
        O(N) per invocation.
        """
        positions = self._split_positions()
        if positions is None:
            return self.tokenized_smiles
        # Positions are looked up through self.idx a second time. The double
        # permutation is kept on purpose: it still yields disjoint splits and
        # keeps split membership identical to earlier runs.
        return [self.tokenized_smiles[self.idx[i]] for i in positions]

    def _load(self, data_filename):
        """Read one SMILES string per line from ``data_filename``.

        Trailing whitespace is stripped from every line. When
        ``config.data_length`` is non-zero only that many leading lines are
        kept.
        """
        length = self.config.data_length
        print('loading SMILES...')
        with open(data_filename) as f:
            smiles = [s.rstrip() for s in f]
        if length != 0:
            smiles = smiles[:length]
        print('done.')
        return smiles

    def _tokenize(self, smiles):
        """Tokenize every SMILES string and record the longest token count.

        ``self.max_len`` is updated for ``'train'`` and ``'valid'`` so both
        pad to the same width; only ``'train'`` publishes the value as
        ``config.train_smi_max_len``.
        """
        assert isinstance(smiles, list)
        print('tokenizing SMILES...')
        tokenized_smiles = [self.st.tokenize(smi) for smi in tqdm(smiles)]

        if self.data_type in ['train', 'valid']:
            longest = max(map(len, tokenized_smiles), default=0)
            self.max_len = max(self.max_len, longest)
        if self.data_type == 'train':
            self.config.train_smi_max_len = self.max_len
        print('done.')
        return tokenized_smiles

    def __len__(self):
        """Return the number of batches in the current split (rounded up)."""
        positions = self._split_positions()
        if positions is None:
            n_samples = len(self.tokenized_smiles)
        else:
            n_samples = len(positions)
        return int(np.ceil(n_samples / float(self._batch_size())))

    def __getitem__(self, idx):
        """Build batch number ``idx`` as ``(X, y)`` float32 arrays.

        Only the samples of the requested batch are gathered; slicing the
        index array first selects exactly the elements that slicing the full
        split list would.
        """
        batch_size = self._batch_size()
        start, stop = idx * batch_size, (idx + 1) * batch_size

        positions = self._split_positions()
        if positions is None:
            data = self.tokenized_smiles[start:stop]
        else:
            data = [
                self.tokenized_smiles[self.idx[i]]
                for i in positions[start:stop]
            ]
        data = self._padding(data)

        self.X, self.y = [], []
        for tp_smi in data:
            # Input drops the last token, target drops the first one.
            X = [self.one_hot_dict[symbol] for symbol in tp_smi[:-1]]
            self.X.append(X)
            y = [self.one_hot_dict[symbol] for symbol in tp_smi[1:]]
            self.y.append(y)

        self.X = np.array(self.X, dtype=np.float32)
        self.y = np.array(self.y, dtype=np.float32)

        return self.X, self.y

    def _pad(self, tokenized_smi, max_len=None):
        """Wrap tokens in 'G' ... 'E' and right-pad with 'A' up to ``max_len``.

        ``max_len`` defaults to ``self.max_len``. No padding is added when
        the sequence is already at least that long.
        """
        if max_len is None:
            max_len = self.max_len
        n_pad = max_len - len(tokenized_smi)  # list * negative -> []
        return ['G'] + tokenized_smi + ['E'] + ['A'] * n_pad

    def _padding(self, data):
        """Pad every sequence of one batch to a common length.

        The width is ``self.max_len`` or the longest sequence in the batch,
        whichever is larger. With ``self.max_len`` left at 0 ('finetune')
        this pads to the batch maximum so batches larger than one stay
        rectangular.
        """
        width = max([self.max_len] + [len(t_smi) for t_smi in data])
        return [self._pad(t_smi, width) for t_smi in data]
class LSTMChemGenerator(object):
    """Sample SMILES strings token by token from a trained model."""

    def __init__(self, modeler):
        """Take the session name, Keras model and config from ``modeler``."""
        self.session = modeler.session
        self.model = modeler.model
        self.config = modeler.config
        self.st = SmilesTokenizer()

    def _generate(self, sequence):
        """Extend ``sequence`` until it ends in 'E' or grows too long.

        Args:
            sequence: non-empty start string, normally ``'G'``.

        Returns:
            The generated string without its first character and without
            trailing 'E' characters.
        """
        while sequence[-1] != 'E':
            # Tokenize once per step and reuse the result for both the
            # length check and the model input.
            tokens = self.st.tokenize(sequence)
            if len(tokens) > self.config.smiles_max_length:
                break
            x = self.st.one_hot_encode(tokens)
            # Prediction for the last position of the only batch element.
            preds = self.model.predict_on_batch(x)[0][-1]
            next_idx = self.sample_with_temp(preds)
            sequence += self.st.table[next_idx]

        return sequence[1:].rstrip('E')

    def sample_with_temp(self, preds):
        """Draw one index from ``preds`` rescaled by the sampling temperature.

        Computes ``softmax(log(preds) / config.sampling_temp)`` in float64
        and samples from it. The largest logit is subtracted before
        exponentiating so that low temperatures cannot underflow every term
        to zero (which would produce NaN probabilities).

        Args:
            preds: 1-D array-like of non-negative probabilities.

        Returns:
            The sampled index into ``preds``.
        """
        with np.errstate(divide='ignore'):  # log(0) -> -inf is intended
            logits = np.log(np.asarray(preds, dtype=np.float64))
        logits = logits / self.config.sampling_temp
        logits = logits - np.max(logits)
        probs = np.exp(logits)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs)

    def sample(self, num=1, start='G'):
        """Generate ``num`` SMILES strings starting from ``start``.

        In the ``'generate'`` session the raw generated strings are returned.
        In any other session each string is parsed with RDKit, unparsable
        ones are discarded, and generation continues until ``num`` parsable
        molecules were collected; their RDKit-canonical SMILES are returned.
        """
        if self.session == 'generate':
            return [self._generate(start) for _ in tqdm(range(num))]

        # Imported lazily so the 'generate' path does not require RDKit.
        from rdkit import Chem, RDLogger
        RDLogger.DisableLog('rdApp.*')

        sampled = []
        # NOTE: there is no attempt limit; this loops for as long as the
        # model fails to produce parsable molecules.
        while len(sampled) < num:
            mol = Chem.MolFromSmiles(self._generate(start))
            if mol is not None:
                sampled.append(Chem.MolToSmiles(mol))
        return sampled


class LSTMChemFinetuner(LSTMChemGenerator):
    """Generator that can additionally fine-tune its model on new data."""

    def __init__(self, modeler, finetune_data_loader):
        """Set up like the generator and remember the fine-tuning loader."""
        super().__init__(modeler)
        self.finetune_data_loader = finetune_data_loader

    def finetune(self):
        """Recompile the model and fit it on the fine-tuning loader.

        Uses ``config.optimizer``, ``config.finetune_epochs`` and
        ``config.verbose_training``.

        Returns:
            Whatever ``model.fit_generator`` returns (the training history).
        """
        self.model.compile(optimizer=self.config.optimizer,
                           loss='categorical_crossentropy')

        # NOTE(review): fit_generator is deprecated in newer tf.keras
        # releases in favour of fit(); kept to match the pinned version.
        history = self.model.fit_generator(
            self.finetune_data_loader,
            steps_per_epoch=len(self.finetune_data_loader),
            epochs=self.config.finetune_epochs,
            verbose=self.config.verbose_training,
            use_multiprocessing=True,
            shuffle=True)
        return history
class LSTMChem(object):
    """Owns the Keras model: builds it for training or loads it from disk."""

    def __init__(self, config, session='train'):
        """Build a fresh model (``'train'``) or load a saved one.

        For ``'generate'`` and ``'finetune'`` the architecture and weights
        are read from ``config.model_arch_filename`` and
        ``config.model_weight_filename``.
        """
        assert session in ['train', 'generate', 'finetune'], \
            'one of {train, generate, finetune}'

        self.config = config
        self.session = session
        self.model = None

        if self.session == 'train':
            self.build_model()
        else:
            self.model = self.load(self.config.model_arch_filename,
                                   self.config.model_weight_filename)

    def build_model(self):
        """Create, serialize and compile the two-layer LSTM network.

        The architecture JSON is written to ``<config.exp_dir>/model_arch.json``
        and that path is stored in ``config.model_arch_filename``.
        """
        st = SmilesTokenizer()
        n_table = len(st.table)  # one input/output unit per token
        weight_init = RandomNormal(mean=0.0,
                                   stddev=0.05,
                                   seed=self.config.seed)

        self.model = Sequential()
        self.model.add(
            LSTM(units=self.config.units,
                 input_shape=(None, n_table),
                 return_sequences=True,
                 kernel_initializer=weight_init,
                 dropout=0.3))
        self.model.add(
            LSTM(units=self.config.units,
                 input_shape=(None, n_table),
                 return_sequences=True,
                 kernel_initializer=weight_init,
                 dropout=0.5))
        self.model.add(
            Dense(units=n_table,
                  activation='softmax',
                  kernel_initializer=weight_init))

        arch = self.model.to_json(indent=2)
        self.config.model_arch_filename = os.path.join(self.config.exp_dir,
                                                       'model_arch.json')
        with open(self.config.model_arch_filename, 'w') as f:
            f.write(arch)

        self.model.compile(optimizer=self.config.optimizer,
                           loss='categorical_crossentropy')

    def save(self, checkpoint_path):
        """Write the model weights to ``checkpoint_path``."""
        assert self.model, 'You have to build the model first.'

        print('Saving model ...')
        self.model.save_weights(checkpoint_path)
        print('model saved.')

    def load(self, model_arch_file, checkpoint_file):
        """Rebuild a model from its JSON architecture and load its weights.

        Returns:
            The loaded model; it is returned uncompiled.
        """
        print(f'Loading model architecture from {model_arch_file} ...')
        with open(model_arch_file) as f:
            model = model_from_json(f.read())
        print(f'Loading model checkpoint from {checkpoint_file} ...')
        model.load_weights(checkpoint_file)
        print('Loaded the Model.')
        return model


class LSTMChemTrainer(object):
    """Fit the model with checkpoint and TensorBoard callbacks."""

    def __init__(self, modeler, train_data_loader, valid_data_loader):
        """Take model and config from ``modeler`` and register callbacks."""
        self.model = modeler.model
        self.config = modeler.config
        self.train_data_loader = train_data_loader
        self.valid_data_loader = valid_data_loader
        self.callbacks = []
        self.init_callbacks()

    def init_callbacks(self):
        """Append the ModelCheckpoint and TensorBoard callbacks.

        Checkpoints are written to ``config.checkpoint_dir`` and named
        ``<exp_name>-<epoch>-<val_loss>.hdf5``.
        """
        self.callbacks.append(
            ModelCheckpoint(
                filepath=os.path.join(
                    self.config.checkpoint_dir,
                    '%s-{epoch:02d}-{val_loss:.2f}.hdf5' %
                    self.config.exp_name),
                monitor=self.config.checkpoint_monitor,
                mode=self.config.checkpoint_mode,
                save_best_only=self.config.checkpoint_save_best_only,
                save_weights_only=self.config.checkpoint_save_weights_only,
                verbose=self.config.checkpoint_verbose,
            ))
        self.callbacks.append(
            TensorBoard(
                log_dir=self.config.tensorboard_log_dir,
                write_graph=self.config.tensorboard_write_graph,
            ))

    def train(self):
        """Train, record the resulting checkpoint and dump the config.

        After fitting, ``config.model_weight_filename`` is set to the
        checkpoint of the final epoch. If that epoch wrote none (for example
        with ``checkpoint_save_best_only`` enabled), the most recently
        modified checkpoint of this experiment is used instead. The config is
        then written to ``<config.exp_dir>/config.json``.

        Returns:
            Whatever ``model.fit_generator`` returns (the training history).

        Raises:
            FileNotFoundError: if the checkpoint directory holds no
                checkpoint for this experiment at all.
        """
        # NOTE(review): fit_generator is deprecated in newer tf.keras
        # releases in favour of fit(); kept to match the pinned version.
        history = self.model.fit_generator(
            self.train_data_loader,
            steps_per_epoch=len(self.train_data_loader),
            epochs=self.config.num_epochs,
            verbose=self.config.verbose_training,
            validation_data=self.valid_data_loader,
            validation_steps=len(self.valid_data_loader),
            use_multiprocessing=True,
            shuffle=True,
            callbacks=self.callbacks)

        final_epoch = sorted(glob(
            os.path.join(
                f'{self.config.checkpoint_dir}',
                f'{self.config.exp_name}-{self.config.num_epochs:02}*.hdf5')))
        if final_epoch:
            last_weight_file = final_epoch[0]
        else:
            any_epoch = glob(
                os.path.join(f'{self.config.checkpoint_dir}',
                             f'{self.config.exp_name}-*.hdf5'))
            if not any_epoch:
                raise FileNotFoundError(
                    f'no checkpoint for {self.config.exp_name} in '
                    f'{self.config.checkpoint_dir}')
            last_weight_file = max(any_epoch, key=os.path.getmtime)

        self.config.model_weight_filename = last_weight_file

        with open(os.path.join(self.config.exp_dir, 'config.json'), 'w') as f:
            f.write(self.config.toJSON(indent=2))
        return history


def get_config_from_json(json_file):
    """Parse ``json_file`` and return its top-level object as a ``Bunch``."""
    with open(json_file, 'r') as config_file:
        config_dict = json.load(config_file)
    return Bunch(config_dict)


def process_config(json_file):
    """Load the config and add the per-run output directories.

    Sets ``config_file``, ``exp_dir`` (``experiments/<YYYY-MM-DD>/<exp_name>``),
    ``tensorboard_log_dir`` (``<exp_dir>/logs/``) and ``checkpoint_dir``
    (``<exp_dir>/checkpoints/``).
    """
    config = get_config_from_json(json_file)
    config.config_file = json_file
    # Read the clock once so all three paths share the same date even when
    # the call straddles midnight.
    date_dir = time.strftime('%Y-%m-%d/', time.localtime())
    config.exp_dir = os.path.join('experiments', date_dir, config.exp_name)
    config.tensorboard_log_dir = os.path.join(config.exp_dir, 'logs/')
    config.checkpoint_dir = os.path.join(config.exp_dir, 'checkpoints/')
    return config


def create_dirs(dirs):
    """Create every directory in ``dirs``, including missing parents.

    Directories that already exist are left alone. On any OS-level failure
    (including a path that exists but is not a directory) the error is
    printed and the process exits with status 1.
    """
    try:
        for dir_ in dirs:
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(dir_, exist_ok=True)
    except OSError as err:
        print(f'Creating directories error: {err}')
        sys.exit(1)
class SmilesTokenizer(object):
    """Split SMILES strings into tokens and one-hot encode them.

    Attributes:
        table: ordered token vocabulary; two-letter atoms first, then
            one-letter atoms, then bond/ring/aromatic symbols, then the
            control tokens 'G', 'A' and 'E'.
        table_len: number of entries in ``table``.
        one_hot_dict: maps each token to its float32 one-hot vector.
    """

    def __init__(self):
        atoms = [
            'Li',
            'Na',
            'Al',
            'Si',
            'Cl',
            'Sc',
            'Zn',
            'As',
            'Se',
            'Br',
            'Sn',
            'Te',
            'Cn',
            'H',
            'B',
            'C',
            'N',
            'O',
            'F',
            'P',
            'S',
            'K',
            'V',
            'I',
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        padding = ['G', 'A', 'E']

        # Longer atom symbols must come first so that e.g. 'Cl' is tried
        # before 'C'; tokenize() takes the first table entry that matches.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split ``smiles`` into table tokens, scanning left to right.

        At each position the first entry of ``self.table`` that matches is
        consumed. Scanning stops at the first position where no entry
        matches, and the tokens found so far are returned; this happens
        immediately instead of after a timed busy-wait.

        Args:
            smiles: the string to split.

        Returns:
            List of tokens covering the longest recognizable prefix.
        """
        tokens = []
        pos = 0
        end = len(smiles)
        while pos < end:
            symbol = next(
                (s for s in self.table if smiles.startswith(s, pos)), None)
            if symbol is None:
                break  # unknown character: nothing can advance the scan
            tokens.append(symbol)
            pos += len(symbol)
        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """Encode a token list as a float32 array of shape (1, n, table_len).

        An empty list yields an array of shape ``(1, 0, table_len)``.

        Raises:
            KeyError: if a token is not part of the table.
        """
        result = np.zeros((1, len(tokenized_smiles), self.table_len),
                          dtype=np.float32)
        for pos, symbol in enumerate(tokenized_smiles):
            result[0, pos] = self.one_hot_dict[symbol]
        return result
--git a/app/prod/checkpoints/LSTM_Chem-06-0.32.hdf5 b/app/prod/checkpoints/LSTM_Chem-06-0.32.hdf5 new file mode 100644 index 0000000..02d3154 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-06-0.32.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-07-0.31.hdf5 b/app/prod/checkpoints/LSTM_Chem-07-0.31.hdf5 new file mode 100644 index 0000000..1c2718a Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-07-0.31.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-08-0.30.hdf5 b/app/prod/checkpoints/LSTM_Chem-08-0.30.hdf5 new file mode 100644 index 0000000..f2985a1 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-08-0.30.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-09-0.29.hdf5 b/app/prod/checkpoints/LSTM_Chem-09-0.29.hdf5 new file mode 100644 index 0000000..3a8efa2 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-09-0.29.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-10-0.29.hdf5 b/app/prod/checkpoints/LSTM_Chem-10-0.29.hdf5 new file mode 100644 index 0000000..38f6136 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-10-0.29.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-11-0.28.hdf5 b/app/prod/checkpoints/LSTM_Chem-11-0.28.hdf5 new file mode 100644 index 0000000..c2d5a03 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-11-0.28.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-12-0.28.hdf5 b/app/prod/checkpoints/LSTM_Chem-12-0.28.hdf5 new file mode 100644 index 0000000..2224b23 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-12-0.28.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-13-0.28.hdf5 b/app/prod/checkpoints/LSTM_Chem-13-0.28.hdf5 new file mode 100644 index 0000000..8e3063e Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-13-0.28.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-14-0.27.hdf5 b/app/prod/checkpoints/LSTM_Chem-14-0.27.hdf5 new file mode 100644 index 0000000..39b1722 Binary files /dev/null and 
b/app/prod/checkpoints/LSTM_Chem-14-0.27.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-15-0.27.hdf5 b/app/prod/checkpoints/LSTM_Chem-15-0.27.hdf5 new file mode 100644 index 0000000..57f2a61 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-15-0.27.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-16-0.26.hdf5 b/app/prod/checkpoints/LSTM_Chem-16-0.26.hdf5 new file mode 100644 index 0000000..d60637a Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-16-0.26.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-17-0.26.hdf5 b/app/prod/checkpoints/LSTM_Chem-17-0.26.hdf5 new file mode 100644 index 0000000..7bdaa43 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-17-0.26.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-18-0.26.hdf5 b/app/prod/checkpoints/LSTM_Chem-18-0.26.hdf5 new file mode 100644 index 0000000..e8d04cd Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-18-0.26.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-19-0.26.hdf5 b/app/prod/checkpoints/LSTM_Chem-19-0.26.hdf5 new file mode 100644 index 0000000..a61d1d4 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-19-0.26.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-20-0.26.hdf5 b/app/prod/checkpoints/LSTM_Chem-20-0.26.hdf5 new file mode 100644 index 0000000..3c50da0 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-20-0.26.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-21-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-21-0.25.hdf5 new file mode 100644 index 0000000..f5eed7f Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-21-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-22-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-22-0.25.hdf5 new file mode 100644 index 0000000..21a1c98 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-22-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-23-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-23-0.25.hdf5 new file mode 100644 index 
0000000..21c6a8f Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-23-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-24-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-24-0.25.hdf5 new file mode 100644 index 0000000..4a83e46 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-24-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-25-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-25-0.25.hdf5 new file mode 100644 index 0000000..e3c2126 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-25-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-26-0.25.hdf5 b/app/prod/checkpoints/LSTM_Chem-26-0.25.hdf5 new file mode 100644 index 0000000..f36f1b6 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-26-0.25.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-27-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-27-0.24.hdf5 new file mode 100644 index 0000000..e9b5134 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-27-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-28-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-28-0.24.hdf5 new file mode 100644 index 0000000..2226f68 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-28-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-29-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-29-0.24.hdf5 new file mode 100644 index 0000000..aaee10f Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-29-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-30-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-30-0.24.hdf5 new file mode 100644 index 0000000..1a9bf6a Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-30-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-31-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-31-0.24.hdf5 new file mode 100644 index 0000000..2d3d8ff Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-31-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-32-0.24.hdf5 
b/app/prod/checkpoints/LSTM_Chem-32-0.24.hdf5 new file mode 100644 index 0000000..ec54aa3 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-32-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-33-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-33-0.24.hdf5 new file mode 100644 index 0000000..8adf067 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-33-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-34-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-34-0.24.hdf5 new file mode 100644 index 0000000..76519de Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-34-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-35-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-35-0.24.hdf5 new file mode 100644 index 0000000..d85fde1 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-35-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-36-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-36-0.24.hdf5 new file mode 100644 index 0000000..d30fbe3 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-36-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-37-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-37-0.24.hdf5 new file mode 100644 index 0000000..56c1f4c Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-37-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-38-0.24.hdf5 b/app/prod/checkpoints/LSTM_Chem-38-0.24.hdf5 new file mode 100644 index 0000000..77f1542 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-38-0.24.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-39-0.23.hdf5 b/app/prod/checkpoints/LSTM_Chem-39-0.23.hdf5 new file mode 100644 index 0000000..f2309e0 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-39-0.23.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-40-0.23.hdf5 b/app/prod/checkpoints/LSTM_Chem-40-0.23.hdf5 new file mode 100644 index 0000000..1938a09 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-40-0.23.hdf5 differ diff --git 
a/app/prod/checkpoints/LSTM_Chem-41-0.23.hdf5 b/app/prod/checkpoints/LSTM_Chem-41-0.23.hdf5 new file mode 100644 index 0000000..b43d8f5 Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-41-0.23.hdf5 differ diff --git a/app/prod/checkpoints/LSTM_Chem-42-0.23.hdf5 b/app/prod/checkpoints/LSTM_Chem-42-0.23.hdf5 new file mode 100644 index 0000000..717ab2d Binary files /dev/null and b/app/prod/checkpoints/LSTM_Chem-42-0.23.hdf5 differ diff --git a/app/prod/config.json b/app/prod/config.json new file mode 100644 index 0000000..4b72262 --- /dev/null +++ b/app/prod/config.json @@ -0,0 +1,30 @@ +{ + "exp_name": "LSTM_Chem", + "data_filename": "./datasets/all_smiles_clean.txt", + "data_length": 0, + "units": 256, + "num_epochs": 42, + "optimizer": "adam", + "seed": 71, + "batch_size": 512, + "validation_split": 0.1, + "verbose_training": true, + "checkpoint_monitor": "val_loss", + "checkpoint_mode": "min", + "checkpoint_save_best_only": false, + "checkpoint_save_weights_only": true, + "checkpoint_verbose": 1, + "tensorboard_write_graph": true, + "sampling_temp": 0.75, + "smiles_max_length": 128, + "finetune_epochs": 12, + "finetune_batch_size": 1, + "finetune_data_filename": "./datasets/protease_inhibitors_for_fine-tune.txt", + "config_file": "experiments/base_experiment/LSTM_Chem/config.json", + "exp_dir": "experiments/2020-07-13/LSTM_Chem", + "tensorboard_log_dir": "experiments/2020-07-13/LSTM_Chem/logs/", + "checkpoint_dir": "experiments/2020-07-13/LSTM_Chem/checkpoints/", + "train_smi_max_len": 128, + "model_arch_filename": "experiments/2020-07-13/LSTM_Chem/model_arch.json", + "model_weight_filename": "experiments/2020-07-13/LSTM_Chem/checkpoints/LSTM_Chem-42-0.23.hdf5" +} \ No newline at end of file diff --git a/app/prod/logs/train/events.out.tfevents.1594644989.Jiamin.local.31642.1238.v2 b/app/prod/logs/train/events.out.tfevents.1594644989.Jiamin.local.31642.1238.v2 new file mode 100644 index 0000000..52526ff Binary files /dev/null and 
b/app/prod/logs/train/events.out.tfevents.1594644989.Jiamin.local.31642.1238.v2 differ diff --git a/app/prod/logs/train/events.out.tfevents.1594645021.Jiamin.local.profile-empty b/app/prod/logs/train/events.out.tfevents.1594645021.Jiamin.local.profile-empty new file mode 100644 index 0000000..94d6821 Binary files /dev/null and b/app/prod/logs/train/events.out.tfevents.1594645021.Jiamin.local.profile-empty differ diff --git a/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.input_pipeline.pb b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.input_pipeline.pb new file mode 100644 index 0000000..2555aa7 Binary files /dev/null and b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.input_pipeline.pb differ diff --git a/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.kernel_stats.pb b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.kernel_stats.pb new file mode 100644 index 0000000..e69de29 diff --git a/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.overview_page.pb b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.overview_page.pb new file mode 100644 index 0000000..18cffd7 Binary files /dev/null and b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.overview_page.pb differ diff --git a/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.tensorflow_stats.pb b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.tensorflow_stats.pb new file mode 100644 index 0000000..8c5eb9e Binary files /dev/null and b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.tensorflow_stats.pb differ diff --git a/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.trace.json.gz b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.trace.json.gz new file mode 100644 index 0000000..9e4ffce Binary files /dev/null and 
b/app/prod/logs/train/plugins/profile/2020_07_13_20_57_01/Jiamin.local.trace.json.gz differ diff --git a/app/prod/logs/validation/events.out.tfevents.1594646871.Jiamin.local.31642.8717.v2 b/app/prod/logs/validation/events.out.tfevents.1594646871.Jiamin.local.31642.8717.v2 new file mode 100644 index 0000000..47519ee Binary files /dev/null and b/app/prod/logs/validation/events.out.tfevents.1594646871.Jiamin.local.31642.8717.v2 differ diff --git a/app/prod/model_arch.json b/app/prod/model_arch.json new file mode 100644 index 0000000..e41f8b2 --- /dev/null +++ b/app/prod/model_arch.json @@ -0,0 +1,149 @@ +{ + "class_name": "Sequential", + "config": { + "name": "sequential", + "layers": [ + { + "class_name": "LSTM", + "config": { + "name": "lstm", + "trainable": true, + "batch_input_shape": [ + null, + null, + 52 + ], + "dtype": "float32", + "return_sequences": true, + "return_state": false, + "go_backwards": false, + "stateful": false, + "unroll": false, + "time_major": false, + "units": 256, + "activation": "tanh", + "recurrent_activation": "sigmoid", + "use_bias": true, + "kernel_initializer": { + "class_name": "RandomNormal", + "config": { + "mean": 0.0, + "stddev": 0.05, + "seed": 71 + } + }, + "recurrent_initializer": { + "class_name": "Orthogonal", + "config": { + "gain": 1.0, + "seed": null + } + }, + "bias_initializer": { + "class_name": "Zeros", + "config": {} + }, + "unit_forget_bias": true, + "kernel_regularizer": null, + "recurrent_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "recurrent_constraint": null, + "bias_constraint": null, + "dropout": 0.3, + "recurrent_dropout": 0.0, + "implementation": 2 + } + }, + { + "class_name": "LSTM", + "config": { + "name": "lstm_1", + "trainable": true, + "batch_input_shape": [ + null, + null, + 52 + ], + "dtype": "float32", + "return_sequences": true, + "return_state": false, + "go_backwards": false, + "stateful": false, + "unroll": false, + "time_major": 
false, + "units": 256, + "activation": "tanh", + "recurrent_activation": "sigmoid", + "use_bias": true, + "kernel_initializer": { + "class_name": "RandomNormal", + "config": { + "mean": 0.0, + "stddev": 0.05, + "seed": 71 + } + }, + "recurrent_initializer": { + "class_name": "Orthogonal", + "config": { + "gain": 1.0, + "seed": null + } + }, + "bias_initializer": { + "class_name": "Zeros", + "config": {} + }, + "unit_forget_bias": true, + "kernel_regularizer": null, + "recurrent_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "recurrent_constraint": null, + "bias_constraint": null, + "dropout": 0.5, + "recurrent_dropout": 0.0, + "implementation": 2 + } + }, + { + "class_name": "Dense", + "config": { + "name": "dense", + "trainable": true, + "dtype": "float32", + "units": 52, + "activation": "softmax", + "use_bias": true, + "kernel_initializer": { + "class_name": "RandomNormal", + "config": { + "mean": 0.0, + "stddev": 0.05, + "seed": 71 + } + }, + "bias_initializer": { + "class_name": "Zeros", + "config": {} + }, + "kernel_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "bias_constraint": null + } + } + ], + "build_input_shape": [ + null, + null, + 52 + ] + }, + "keras_version": "2.3.0-tf", + "backend": "tensorflow" +} \ No newline at end of file