import numpy as np
import time


class SmilesTokenizer(object):
    def __init__(self):
        # Multi-character element symbols must take precedence over their
        # single-character prefixes (e.g. 'Cl' over 'C'), hence the
        # length-descending sort when the lookup table is built below.
        atoms = [
            'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br',
            'Sn', 'Te', 'Cn', 'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S',
            'K', 'V', 'I',
        ]
        # Bonds, branches, ring-closure digits, charges, and the lowercase
        # aromatic atom forms used in SMILES.
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # Extra padding/control tokens appended to the vocabulary.
        padding = ['G', 'A', 'E']

        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        # Precompute a one-hot vector for every symbol in the vocabulary.
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        N = len(smiles)
        i = 0
        token = []

        # Scan left to right, taking the first table entry that matches at
        # the current position; multi-character symbols sit ahead of their
        # single-character prefixes, so matching is effectively longest-first.
        # If a character is not in the table at all, i never advances, so the
        # timeout below keeps malformed input from looping forever.
        timeout = time.time() + 5  # give up after 5 seconds
        while i < N:
            for symbol in self.table:
                if symbol == smiles[i:i + len(symbol)]:
                    token.append(symbol)
                    i += len(symbol)
                    break
            if time.time() > timeout:
                break
        return token

    def one_hot_encode(self, tokenized_smiles):
        # Stack the precomputed one-hot rows, then add a leading batch axis:
        # the result has shape (1, sequence_length, vocabulary_size).
        result = np.array(
            [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
            dtype=np.float32)
        result = result.reshape(1, result.shape[0], result.shape[1])
        return result
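

# Minimal usage sketch (illustrative only, not part of the original module;
# the aspirin SMILES string below is an arbitrary example input):
if __name__ == '__main__':
    tokenizer = SmilesTokenizer()
    tokens = tokenizer.tokenize('CC(=O)Oc1ccccc1C(=O)O')
    print(tokens)          # ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', ...]
    encoded = tokenizer.one_hot_encode(tokens)
    print(encoded.shape)   # (1, len(tokens), tokenizer.table_len)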