From cd353259b6032255cfc4e31a8a8bfd595a1751b2 Mon Sep 17 00:00:00 2001
From: Lyubov Yamshchikova <43475193+YamLyubov@users.noreply.github.com>
Date: Tue, 5 Sep 2023 18:49:14 +0300
Subject: [PATCH] Add molecule encodings for contextual bandit (#179)

* Add molecule encodings for contextual bandit
* Add pretrained models loading from github
* Pep8
* Fix test
* Add descriptors
* Correct requirements
* Add fingerprints
* Add code reference
* Review fixes
* Fix test
* Fix typing
* PEP8
* minor
* PEP8
---
 examples/molecule_search/mol_encoders.py    | 212 ++++++++++++++++
 examples/molecule_search/mol_metrics.py     |  13 +-
 .../mol_transformer/LICENSE.txt             |  21 ++
 .../mol_transformer/__init__.py             |   0
 .../mol_transformer/transformer.py          | 240 ++++++++++++++++++
 examples/molecule_search/utils.py           |  20 +-
 golem/core/optimisers/genetic/gp_params.py  |   4 +-
 other_requirements/molecules.txt            |   5 +-
 8 files changed, 500 insertions(+), 15 deletions(-)
 create mode 100644 examples/molecule_search/mol_encoders.py
 create mode 100644 examples/molecule_search/mol_transformer/LICENSE.txt
 create mode 100644 examples/molecule_search/mol_transformer/__init__.py
 create mode 100644 examples/molecule_search/mol_transformer/transformer.py

diff --git a/examples/molecule_search/mol_encoders.py b/examples/molecule_search/mol_encoders.py
new file mode 100644
index 00000000..b0e62a5d
--- /dev/null
+++ b/examples/molecule_search/mol_encoders.py
@@ -0,0 +1,212 @@
+import os
+from typing import Any, List, Optional
+
+import numpy as np
+import torch
+from gensim.models import word2vec, Word2Vec
+from mol2vec.features import mol2alt_sentence, MolSentence
+from rdkit.Chem import AllChem, RDKFingerprint, rdFingerprintGenerator
+from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
+
+from examples.molecule_search.mol_adapter import MolAdapter
+from examples.molecule_search.mol_transformer.transformer import create_masks, Transformer, EXTRA_CHARS, ALPHABET_SIZE
+from examples.molecule_search.utils import download_from_github
+from golem.core.log import default_log
+from golem.core.paths import project_root
+
+
+def adapter_func_to_molgraph(func):
+    """ Decorator that adapts an observation to a MolGraph before calling the wrapped function. """
+    def wrapper(obs):
+        mol_graph = MolAdapter().restore(obs)
+        embedding = func(mol_graph)
+        return embedding
+    return wrapper
+
+
+def adapter_method_to_molgraph(func):
+    """ Decorator that adapts an observation to a MolGraph before calling the wrapped method. """
""" + def wrapper(obj, obs): + mol_graph = MolAdapter().restore(obs) + embedding = func(obj, mol_graph) + return embedding + return wrapper + + +@adapter_func_to_molgraph +def ECFP(obs: Any): + """ Extended-Connectivity Fingerprint """ + molecule = obs.get_rw_molecule() + feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule, + radius=2, + nBits=2**10, + useFeatures=False, + useChirality=False) + return np.array(feature_list) + + +@adapter_func_to_molgraph +def RDKF(obs: Any): + """ RDK Fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint_rdk = RDKFingerprint(molecule) + return np.array(fingerprint_rdk) + + +@adapter_func_to_molgraph +def atom_pair(obs: Any): + """ Atom pair fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=1024).GetFingerprint(molecule) + return np.array(fingerprint) + + +@adapter_func_to_molgraph +def topological_torsion(obs: Any): + """ Topological Torsion fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=1024).GetFingerprint(molecule) + return np.array(fingerprint) + + +@adapter_func_to_molgraph +def mol_descriptors(obs: Any): + molecule = obs.get_rw_molecule() + chosen_descriptors = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', + 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', + 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', + 'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', + 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', + 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', + 'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', + 'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', + 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', + 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', + 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', + 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', + 'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', + 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', + 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10', + 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', + 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', + 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', + 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', + 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', + 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', + 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', + 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', + 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', + 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', + 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 
+                          'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido',
+                          'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
+                          'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
+                          'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho',
+                          'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
+                          'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine',
+                          'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
+                          'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan',
+                          'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed']
+    mol_descriptor_calculator = MolecularDescriptorCalculator(chosen_descriptors)
+    list_of_descriptor_vals = list(mol_descriptor_calculator.CalcDescriptors(molecule))
+    return list_of_descriptor_vals
+
+
+class Mol2Vec:
+
+    PRETRAINED_WORD2VEC = 'examples/molecule_search/data/pretrained_models/model_300dim.pkl'
+    GITHUB_URL = 'https://github.com/samoturk/mol2vec/raw/master/examples/models/model_300dim.pkl'
+
+    def __init__(self):
+        self.file_path = os.path.join(project_root(), Mol2Vec.PRETRAINED_WORD2VEC)
+        download_from_github(self.file_path,
+                             Mol2Vec.GITHUB_URL,
+                             message="Downloading pretrained model for molecule encoding...")
+
+        self.model = word2vec.Word2Vec.load(self.file_path)
+
+    @adapter_method_to_molgraph
+    def __call__(self, obs: Any):
+        molecule = obs.get_rw_molecule()
+        sentence = MolSentence(mol2alt_sentence(molecule, radius=1))
+        embedding = self.sentences2vec([sentence], self.model, unseen='UNK')[0]
+        return np.array(embedding).astype(float)
+
+    @staticmethod
+    def sentences2vec(sentences: List[MolSentence], model: Word2Vec, unseen: Optional[str] = None) -> np.array:
+        """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
+        sum of vectors for individual words.
+
+        Parameters
+        ----------
+        sentences : list, array
+            List with sentences
+        model : word2vec.Word2Vec
+            Gensim word2vec model
+        unseen : None, str
+            Keyword for unseen words. If None, those words are skipped.
+            https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032
+
+        Returns
+        -------
+        np.array
+        """
+
+        keys = set(model.wv.key_to_index)
+        vec = []
+
+        if unseen:
+            unseen_vec = model.wv.get_vector(unseen)
+
+        for sentence in sentences:
+            if unseen:
+                vec.append(sum([model.wv.get_vector(y) if y in set(sentence) & keys
+                                else unseen_vec for y in sentence]))
+            else:
+                vec.append(sum([model.wv.get_vector(y) for y in sentence
+                                if y in set(sentence) & keys]))
+        return np.array(vec)
+
+
+class MoleculeTransformer:
+    """ Based on https://github.com/mpcrlab/MolecularTransformerEmbeddings """
+
+    PRETRAINED_TRANSFORMER = 'examples/molecule_search/data/pretrained_models/pretrained.ckpt'
+    GITHUB_URL = 'https://github.com/mpcrlab/MolecularTransformerEmbeddings/releases/download/' \
+                 'checkpoints/pretrained.ckpt'
+
+    def __init__(self, embedding_size: int = 512, num_layers: int = 6, max_length: int = 256):
+        self.log = default_log(self)
+
+        self.file_path = os.path.join(project_root(), MoleculeTransformer.PRETRAINED_TRANSFORMER)
+        download_from_github(self.file_path,
+                             MoleculeTransformer.GITHUB_URL,
+                             message="Downloading pretrained model for molecule encoding...")
+        self.model = self._model_setup(embedding_size, num_layers)
+        self.encoder = self.model.encoder.cpu()
+        self.max_length = max_length
+
+    def _model_setup(self, embedding_size: int, num_layers: int):
+        model = Transformer(ALPHABET_SIZE, embedding_size, num_layers).eval()
+        model = torch.nn.DataParallel(model)
+        checkpoint = torch.load(self.file_path, map_location=torch.device("cpu"))
+        model.load_state_dict(checkpoint['state_dict'])
+        return model.module.cpu()
+
+    @adapter_method_to_molgraph
+    def __call__(self, obs: Any):
+        smiles = obs.get_smiles()
+        with torch.no_grad():
+            encoded = self.encode_smiles(smiles)
+            mask = create_masks(encoded)
+            embedding = self.encoder(encoded, mask)[0].numpy()
+            embedding = embedding.mean(axis=0)
+        return embedding
+
+    @staticmethod
+    def encode_char(c):
+        return ord(c) - 32
+
+    def encode_smiles(self, string: str, start_char=EXTRA_CHARS['seq_start']):
+        return torch.tensor([ord(start_char)] +
+                            [self.encode_char(c) for c in string], dtype=torch.long)[:self.max_length].unsqueeze(0)
diff --git a/examples/molecule_search/mol_metrics.py b/examples/molecule_search/mol_metrics.py
index 67804dbb..20fe1e47 100644
--- a/examples/molecule_search/mol_metrics.py
+++ b/examples/molecule_search/mol_metrics.py
@@ -1,18 +1,17 @@
 import os
 import pickle
 import sys
+from typing import Dict
 
-import requests
 from rdkit import RDConfig, Chem
 from rdkit.Chem import Descriptors, AllChem
 from rdkit.Chem.QED import qed
 from rdkit.Chem.rdchem import RWMol
-from typing import Dict
 
 from examples.molecule_search.constants import ZINC_LOGP_MEAN, ZINC_LOGP_STD, ZINC_SA_MEAN, ZINC_SA_STD, \
     ZINC_CYCLE_MEAN, ZINC_CYCLE_STD, MIN_LONG_CYCLE_SIZE
 from examples.molecule_search.mol_graph import MolGraph
-from examples.molecule_search.utils import largest_ring_size
+from examples.molecule_search.utils import largest_ring_size, download_from_github
 from golem.core.paths import project_root
 
 sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
@@ -160,13 +159,7 @@ def __call__(self, mol_graph: MolGraph) -> float:
         return -avg_score
 
     def load_shingles(self) -> Dict:
-        save_dir = os.path.dirname(self.file_path)
-        os.makedirs(save_dir, exist_ok=True)
-
-        if not os.path.exists(self.file_path):
-            response = requests.get(self.github_url)
-            with open(self.file_path, "wb") as new_file:
"wb") as new_file: - new_file.write(response.content) + download_from_github(self.file_path, self.github_url) with open(self.file_path, "rb") as pyc: db_shingles = pickle.load(pyc) diff --git a/examples/molecule_search/mol_transformer/LICENSE.txt b/examples/molecule_search/mol_transformer/LICENSE.txt new file mode 100644 index 00000000..fcbfc09f --- /dev/null +++ b/examples/molecule_search/mol_transformer/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Machine Perception & Cognitive Robotics Laboratory + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/examples/molecule_search/mol_transformer/__init__.py b/examples/molecule_search/mol_transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/molecule_search/mol_transformer/transformer.py b/examples/molecule_search/mol_transformer/transformer.py new file mode 100644 index 00000000..1e13d0ca --- /dev/null +++ b/examples/molecule_search/mol_transformer/transformer.py @@ -0,0 +1,240 @@ +import copy +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + +"""Code from https://github.com/mpcrlab/MolecularTransformerEmbeddings""" + +PRINTABLE_ASCII_CHARS = 95 + +_extra_chars = ["seq_start", "seq_end", "pad"] +EXTRA_CHARS = {key: chr(PRINTABLE_ASCII_CHARS + i) for i, key in enumerate(_extra_chars)} +ALPHABET_SIZE = PRINTABLE_ASCII_CHARS + len(EXTRA_CHARS) + + +class Embedding(nn.Module): + def __init__(self, alphabet_size, d_model): + super().__init__() + self.alphabet_size = alphabet_size + self.d_model = d_model + self.embed = nn.Embedding(alphabet_size, d_model) + + def forward(self, x): + return self.embed(x) + + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len=6000, dropout=0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(p=dropout) + # create constant 'pe' matrix with values dependant on pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model))) + pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + # add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:, :seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + x = self.dropout(x) + return x + + +class 
+class Norm(nn.Module):
+    def __init__(self, d_model, eps=1e-6):
+        super().__init__()
+        self.size = d_model
+        # create two learnable parameters to calibrate normalisation
+        self.alpha = nn.Parameter(torch.ones(self.size))
+        self.bias = nn.Parameter(torch.zeros(self.size))
+        self.eps = eps
+
+    def forward(self, x):
+        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
+        return norm
+
+
+def attention(q, k, v, d_k, mask=None, dropout=None):
+    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
+    if mask is not None:
+        mask = mask.unsqueeze(1)
+        scores = scores.masked_fill(mask == 0, -1e9)
+    scores = F.softmax(scores, dim=-1)
+    if dropout is not None:
+        scores = dropout(scores)
+    output = torch.matmul(scores, v)
+    return output
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, heads, d_model, dropout=0.1):
+        super().__init__()
+        self.d_model = d_model
+        self.d_k = d_model // heads
+        self.h = heads
+        self.q_linear = nn.Linear(d_model, d_model)
+        self.v_linear = nn.Linear(d_model, d_model)
+        self.k_linear = nn.Linear(d_model, d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.out = nn.Linear(d_model, d_model)
+
+    def forward(self, q, k, v, mask=None):
+        bs = q.size(0)
+        # perform linear operation and split into N heads
+        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
+        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
+        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
+        # transpose to get dimensions bs * N * sl * d_model
+        k = k.transpose(1, 2)
+        q = q.transpose(1, 2)
+        v = v.transpose(1, 2)
+        # calculate attention using the attention function defined above
+        scores = attention(q, k, v, self.d_k, mask, self.dropout)
+        # concatenate heads and put through final linear layer
+        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
+        output = self.out(concat)
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, d_model, d_ff=2048, dropout=0.1):
+        super().__init__()
+        # We set d_ff as a default to 2048
+        self.linear_1 = nn.Linear(d_model, d_ff)
+        self.dropout = nn.Dropout(dropout)
+        self.linear_2 = nn.Linear(d_ff, d_model)
+
+    def forward(self, x):
+        x = self.dropout(F.relu(self.linear_1(x)))
+        x = self.linear_2(x)
+        return x
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+
+    def forward(self, x, mask):
+        x2 = self.norm_1(x)
+        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
+        x2 = self.norm_2(x)
+        x = x + self.dropout_2(self.ff(x2))
+        return x
+
+
+# build a decoder layer with two multi-head attention layers and
+# one feed-forward layer
+class DecoderLayer(nn.Module):
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.norm_3 = Norm(d_model)
+
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+        self.dropout_3 = nn.Dropout(dropout)
+
+        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+
+    def forward(self, x, e_outputs, src_mask, trg_mask):
+        x2 = self.norm_1(x)
+        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
+        x2 = self.norm_2(x)
+        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
+        x2 = self.norm_3(x)
+        x = x + self.dropout_3(self.ff(x2))
+        return x
+
+
+def get_clones(module, n):
+    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
+
+
+class Encoder(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads, dropout):
+        super().__init__()
+        self.n = n
+        self.embed = Embedding(alphabet_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), n)
+        self.norm = Norm(d_model)
+
+    def forward(self, src, mask):
+        x = self.embed(src)
+        x = self.pe(x)
+        for i in range(self.n):
+            x = self.layers[i](x, mask)
+        return self.norm(x)
+
+
+class Decoder(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads, dropout):
+        super().__init__()
+        self.n = n
+        self.embed = Embedding(alphabet_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), n)
+        self.norm = Norm(d_model)
+
+    def forward(self, trg, e_outputs, src_mask, trg_mask):
+        x = self.embed(trg)
+        x = self.pe(x)
+        for i in range(self.n):
+            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
+        return self.norm(x)
+
+
+class Transformer(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads=8, dropout=0.1):
+        super().__init__()
+        self.encoder = Encoder(alphabet_size, d_model, n, heads, dropout)
+        self.decoder = Decoder(alphabet_size, d_model, n, heads, dropout)
+        self.out = nn.Linear(d_model, alphabet_size)
+
+    def forward(self, src, trg, src_mask, trg_mask):
+        e_outputs = self.encoder(src, src_mask)
+        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
+        output = self.out(d_output)
+        return output
+
+
+def nopeak_mask(size, device):
+    np_mask = torch.triu(torch.ones((size, size), dtype=torch.uint8), diagonal=1).unsqueeze(0)
+    np_mask = np_mask == 0
+    np_mask = np_mask.to(device)
+    return np_mask
+
+
+def create_masks(src, trg=None, pad_idx=ord(EXTRA_CHARS['pad']), device=None):
+    src_mask = (src != pad_idx).unsqueeze(-2)
+
+    if trg is not None:
+        trg_mask = (trg != pad_idx).unsqueeze(-2)
+        size = trg.size(1)  # get seq_len for matrix
+        np_mask = nopeak_mask(size, device)
+        np_mask.to(device)
+        trg_mask = trg_mask & np_mask
+        return src_mask, trg_mask
+    return src_mask
diff --git a/examples/molecule_search/utils.py b/examples/molecule_search/utils.py
index 31b58d38..74d3c10a 100644
--- a/examples/molecule_search/utils.py
+++ b/examples/molecule_search/utils.py
@@ -1,12 +1,14 @@
-from copy import deepcopy
+import os
+from typing import Tuple, Set, Optional
 
 import networkx as nx
 from rdkit.Chem import GetPeriodicTable
 from rdkit.Chem.rdchem import Atom, RWMol
-from typing import Tuple, Set
+import requests
 
 from examples.molecule_search.constants import SULFUR_DEFAULT_VALENCE
 from examples.molecule_search.mol_graph import MolGraph
+from golem.core.log import default_log
 
 
 def get_default_valence(atom_type: str) -> int:
@@ -39,3 +41,17 @@ def largest_ring_size(rw_molecule: RWMol) -> int:
     if cycle_list:
         largest_cycle_len = max(map(len, cycle_list))
     return largest_cycle_len
+
+
+def download_from_github(save_path: str, github_url: str, message: Optional[str] = None):
+    """ Checks if the file exists; if not, downloads it from the specified URL. """
+    save_dir = os.path.dirname(save_path)
+    os.makedirs(save_dir, exist_ok=True)
+
+    message = message or f"Downloading a file from {github_url} to {save_dir}..."
+
+    if not os.path.exists(save_path):
+        default_log().message(message)
+        response = requests.get(github_url)
+        with open(save_path, "wb") as new_file:
+            new_file.write(response.content)
diff --git a/golem/core/optimisers/genetic/gp_params.py b/golem/core/optimisers/genetic/gp_params.py
index 42749027..d893cb6e 100644
--- a/golem/core/optimisers/genetic/gp_params.py
+++ b/golem/core/optimisers/genetic/gp_params.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Sequence, Union, Any
+from typing import Sequence, Union, Any, Callable
 
 from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
 from golem.core.optimisers.adaptive.mab_agents.neural_contextual_mab_agent import ContextAgentTypeEnum
@@ -76,7 +76,7 @@ class GPAlgorithmParameters(AlgorithmParameters):
     required_valid_ratio: float = 0.9
 
     adaptive_mutation_type: MutationAgentTypeEnum = MutationAgentTypeEnum.default
-    context_agent_type: ContextAgentTypeEnum = ContextAgentTypeEnum.nodes_num
+    context_agent_type: Union[ContextAgentTypeEnum, Callable] = ContextAgentTypeEnum.nodes_num
 
     selection_types: Sequence[SelectionTypesEnum] = \
         (SelectionTypesEnum.tournament,)
diff --git a/other_requirements/molecules.txt b/other_requirements/molecules.txt
index 73f2a0bc..e8021e9b 100644
--- a/other_requirements/molecules.txt
+++ b/other_requirements/molecules.txt
@@ -1,4 +1,7 @@
 rdkit>=2018.09.1.0
 guacamol>=0.5.4
 joblib>=0.12.5
-requests>=2.30.0
\ No newline at end of file
+requests>=2.30.0
+mol2vec @ git+https://github.com/samoturk/mol2vec
+gensim>=4.3.2
+torch>=2.0.1
\ No newline at end of file
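
Usage note (reviewer sketch, not part of the patch): the gp_params.py change above lets
context_agent_type accept any Callable, so the new encoders from mol_encoders.py can be passed
directly as the context function of the adaptive mutation agent. The snippet below is a minimal
sketch under that assumption; the MutationAgentTypeEnum member for the contextual multi-armed
bandit agent is assumed here to be named contextual_bandit and may differ in the actual GOLEM
version.

    from examples.molecule_search.mol_encoders import ECFP, Mol2Vec
    from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
    from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters

    # Plain-function encoder: ECFP maps an observation (molecular graph) to a fingerprint vector.
    params = GPAlgorithmParameters(
        adaptive_mutation_type=MutationAgentTypeEnum.contextual_bandit,  # assumed member name
        context_agent_type=ECFP,  # any Callable is accepted after this patch
    )

    # Stateful encoders work the same way, since their instances are callable;
    # Mol2Vec() fetches its pretrained word2vec model on first construction.
    params_mol2vec = GPAlgorithmParameters(
        adaptive_mutation_type=MutationAgentTypeEnum.contextual_bandit,  # assumed member name
        context_agent_type=Mol2Vec(),
    )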