From cd353259b6032255cfc4e31a8a8bfd595a1751b2 Mon Sep 17 00:00:00 2001
From: Lyubov Yamshchikova <43475193+YamLyubov@users.noreply.github.com>
Date: Tue, 5 Sep 2023 18:49:14 +0300
Subject: [PATCH] Add molecule encodings for contextual bandit (#179)

* Add molecule encodings for contextual bandit
* Add pretrained models loading from github
* Pep8
* Fix test
* Add descriptors
* Correct requirements
* Add fingerprints
* Add code reference
* Review fixes
* Fix test
* Fix typing
* PEP8
* minor
* PEP8
---
 examples/molecule_search/mol_encoders.py    | 212 ++++++++++++++++
 examples/molecule_search/mol_metrics.py     |  13 +-
 .../mol_transformer/LICENSE.txt             |  21 ++
 .../mol_transformer/__init__.py             |   0
 .../mol_transformer/transformer.py          | 240 ++++++++++++++++++
 examples/molecule_search/utils.py           |  20 +-
 golem/core/optimisers/genetic/gp_params.py  |   4 +-
 other_requirements/molecules.txt            |   5 +-
 8 files changed, 500 insertions(+), 15 deletions(-)
 create mode 100644 examples/molecule_search/mol_encoders.py
 create mode 100644 examples/molecule_search/mol_transformer/LICENSE.txt
 create mode 100644 examples/molecule_search/mol_transformer/__init__.py
 create mode 100644 examples/molecule_search/mol_transformer/transformer.py

diff --git a/examples/molecule_search/mol_encoders.py b/examples/molecule_search/mol_encoders.py
new file mode 100644
index 00000000..b0e62a5d
--- /dev/null
+++ b/examples/molecule_search/mol_encoders.py
@@ -0,0 +1,212 @@
+import os
+from typing import Any, List, Optional
+
+import numpy as np
+import torch
+from gensim.models import word2vec, Word2Vec
+from mol2vec.features import mol2alt_sentence, MolSentence
+from rdkit.Chem import AllChem, RDKFingerprint, rdFingerprintGenerator
+from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
+
+from examples.molecule_search.mol_adapter import MolAdapter
+from examples.molecule_search.mol_transformer.transformer import create_masks, Transformer, EXTRA_CHARS, ALPHABET_SIZE
+from examples.molecule_search.utils import download_from_github
+from golem.core.log import default_log
+from golem.core.paths import project_root
+
+
+def adapter_func_to_molgraph(func):
+    """ Decorator that adapts an observation to a MolGraph before calling the wrapped function. """
+    def wrapper(obs):
+        mol_graph = MolAdapter().restore(obs)
+        embedding = func(mol_graph)
+        return embedding
+    return wrapper
+
+
+def adapter_method_to_molgraph(func):
+    """ Decorator that adapts an observation to a MolGraph before calling the wrapped method. """
""" + def wrapper(obj, obs): + mol_graph = MolAdapter().restore(obs) + embedding = func(obj, mol_graph) + return embedding + return wrapper + + +@adapter_func_to_molgraph +def ECFP(obs: Any): + """ Extended-Connectivity Fingerprint """ + molecule = obs.get_rw_molecule() + feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule, + radius=2, + nBits=2**10, + useFeatures=False, + useChirality=False) + return np.array(feature_list) + + +@adapter_func_to_molgraph +def RDKF(obs: Any): + """ RDK Fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint_rdk = RDKFingerprint(molecule) + return np.array(fingerprint_rdk) + + +@adapter_func_to_molgraph +def atom_pair(obs: Any): + """ Atom pair fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=1024).GetFingerprint(molecule) + return np.array(fingerprint) + + +@adapter_func_to_molgraph +def topological_torsion(obs: Any): + """ Topological Torsion fingerprint """ + molecule = obs.get_rw_molecule() + fingerprint = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=1024).GetFingerprint(molecule) + return np.array(fingerprint) + + +@adapter_func_to_molgraph +def mol_descriptors(obs: Any): + molecule = obs.get_rw_molecule() + chosen_descriptors = ['BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', + 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', + 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', + 'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', + 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 'HeavyAtomMolWt', + 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', + 'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', + 'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', + 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', + 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', + 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', + 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', + 'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', + 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', + 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10', + 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', + 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', + 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', + 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', + 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', + 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', + 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', + 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', + 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', + 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', + 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 
+                          'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido',
+                          'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
+                          'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
+                          'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho',
+                          'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
+                          'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine',
+                          'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
+                          'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan',
+                          'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed']
+    mol_descriptor_calculator = MolecularDescriptorCalculator(chosen_descriptors)
+    list_of_descriptor_vals = list(mol_descriptor_calculator.CalcDescriptors(molecule))
+    return list_of_descriptor_vals
+
+
+class Mol2Vec:
+
+    PRETRAINED_WORD2VEC = 'examples/molecule_search/data/pretrained_models/model_300dim.pkl'
+    GITHUB_URL = 'https://github.com/samoturk/mol2vec/raw/master/examples/models/model_300dim.pkl'
+
+    def __init__(self):
+        self.file_path = os.path.join(project_root(), Mol2Vec.PRETRAINED_WORD2VEC)
+        download_from_github(self.file_path,
+                             Mol2Vec.GITHUB_URL,
+                             message="Downloading pretrained model for molecule encoding...")
+
+        self.model = word2vec.Word2Vec.load(self.file_path)
+
+    @adapter_method_to_molgraph
+    def __call__(self, obs: Any):
+        molecule = obs.get_rw_molecule()
+        sentence = MolSentence(mol2alt_sentence(molecule, radius=1))
+        embedding = self.sentences2vec([sentence], self.model, unseen='UNK')[0]
+        return np.array(embedding).astype(float)
+
+    @staticmethod
+    def sentences2vec(sentences: List[MolSentence], model: Word2Vec, unseen: Optional[str] = None) -> np.array:
+        """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
+        sum of vectors for individual words.
+
+        Parameters
+        ----------
+        sentences : list, array
+            List with sentences
+        model : word2vec.Word2Vec
+            Gensim word2vec model
+        unseen : None, str
+            Keyword for unseen words. If None, those words are skipped.
+            https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032
+
+        Returns
+        -------
+        np.array
+        """
+
+        keys = set(model.wv.key_to_index)
+        vec = []
+
+        if unseen:
+            unseen_vec = model.wv.get_vector(unseen)
+
+        for sentence in sentences:
+            if unseen:
+                vec.append(sum([model.wv.get_vector(y) if y in set(sentence) & keys
+                                else unseen_vec for y in sentence]))
+            else:
+                vec.append(sum([model.wv.get_vector(y) for y in sentence
+                                if y in set(sentence) & keys]))
+        return np.array(vec)
+
+
+class MoleculeTransformer:
+    """ Based on https://github.com/mpcrlab/MolecularTransformerEmbeddings """
+
+    PRETRAINED_TRANSFORMER = 'examples/molecule_search/data/pretrained_models/pretrained.ckpt'
+    GITHUB_URL = 'https://github.com/mpcrlab/MolecularTransformerEmbeddings/releases/download/' \
+                 'checkpoints/pretrained.ckpt'
+
+    def __init__(self, embedding_size: int = 512, num_layers: int = 6, max_length: int = 256):
+        self.log = default_log(self)
+
+        self.file_path = os.path.join(project_root(), MoleculeTransformer.PRETRAINED_TRANSFORMER)
+        download_from_github(self.file_path,
+                             MoleculeTransformer.GITHUB_URL,
+                             message="Downloading pretrained model for molecule encoding...")
+        self.model = self._model_setup(embedding_size, num_layers)
+        self.encoder = self.model.encoder.cpu()
+        self.max_length = max_length
+
+    def _model_setup(self, embedding_size: int, num_layers: int):
+        model = Transformer(ALPHABET_SIZE, embedding_size, num_layers).eval()
+        model = torch.nn.DataParallel(model)
+        checkpoint = torch.load(self.file_path, map_location=torch.device("cpu"))
+        model.load_state_dict(checkpoint['state_dict'])
+        return model.module.cpu()
+
+    @adapter_method_to_molgraph
+    def __call__(self, obs: Any):
+        smiles = obs.get_smiles()
+        with torch.no_grad():
+            encoded = self.encode_smiles(smiles)
+            mask = create_masks(encoded)
+            embedding = self.encoder(encoded, mask)[0].numpy()
+            embedding = embedding.mean(axis=0)
+        return embedding
+
+    @staticmethod
+    def encode_char(c):
+        return ord(c) - 32
+
+    def encode_smiles(self, string: str, start_char=EXTRA_CHARS['seq_start']):
+        return torch.tensor([ord(start_char)] +
+                            [self.encode_char(c) for c in string], dtype=torch.long)[:self.max_length].unsqueeze(0)
diff --git a/examples/molecule_search/mol_metrics.py b/examples/molecule_search/mol_metrics.py
index 67804dbb..20fe1e47 100644
--- a/examples/molecule_search/mol_metrics.py
+++ b/examples/molecule_search/mol_metrics.py
@@ -1,18 +1,17 @@
 import os
 import pickle
 import sys
+from typing import Dict
 
-import requests
 from rdkit import RDConfig, Chem
 from rdkit.Chem import Descriptors, AllChem
 from rdkit.Chem.QED import qed
 from rdkit.Chem.rdchem import RWMol
-from typing import Dict
 
 from examples.molecule_search.constants import ZINC_LOGP_MEAN, ZINC_LOGP_STD, ZINC_SA_MEAN, ZINC_SA_STD, \
     ZINC_CYCLE_MEAN, ZINC_CYCLE_STD, MIN_LONG_CYCLE_SIZE
 from examples.molecule_search.mol_graph import MolGraph
-from examples.molecule_search.utils import largest_ring_size
+from examples.molecule_search.utils import largest_ring_size, download_from_github
 from golem.core.paths import project_root
 
 sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
@@ -160,13 +159,7 @@ def __call__(self, mol_graph: MolGraph) -> float:
         return -avg_score
 
     def load_shingles(self) -> Dict:
-        save_dir = os.path.dirname(self.file_path)
-        os.makedirs(save_dir, exist_ok=True)
-
-        if not os.path.exists(self.file_path):
-            response = requests.get(self.github_url)
-            with open(self.file_path, "wb") as new_file:
"wb") as new_file: - new_file.write(response.content) + download_from_github(self.file_path, self.github_url) with open(self.file_path, "rb") as pyc: db_shingles = pickle.load(pyc) diff --git a/examples/molecule_search/mol_transformer/LICENSE.txt b/examples/molecule_search/mol_transformer/LICENSE.txt new file mode 100644 index 00000000..fcbfc09f --- /dev/null +++ b/examples/molecule_search/mol_transformer/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Machine Perception & Cognitive Robotics Laboratory + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/examples/molecule_search/mol_transformer/__init__.py b/examples/molecule_search/mol_transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/molecule_search/mol_transformer/transformer.py b/examples/molecule_search/mol_transformer/transformer.py new file mode 100644 index 00000000..1e13d0ca --- /dev/null +++ b/examples/molecule_search/mol_transformer/transformer.py @@ -0,0 +1,240 @@ +import copy +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + +"""Code from https://github.com/mpcrlab/MolecularTransformerEmbeddings""" + +PRINTABLE_ASCII_CHARS = 95 + +_extra_chars = ["seq_start", "seq_end", "pad"] +EXTRA_CHARS = {key: chr(PRINTABLE_ASCII_CHARS + i) for i, key in enumerate(_extra_chars)} +ALPHABET_SIZE = PRINTABLE_ASCII_CHARS + len(EXTRA_CHARS) + + +class Embedding(nn.Module): + def __init__(self, alphabet_size, d_model): + super().__init__() + self.alphabet_size = alphabet_size + self.d_model = d_model + self.embed = nn.Embedding(alphabet_size, d_model) + + def forward(self, x): + return self.embed(x) + + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len=6000, dropout=0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(p=dropout) + # create constant 'pe' matrix with values dependant on pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model))) + pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + # add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:, :seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + x = self.dropout(x) + return x + + +class 
+class Norm(nn.Module):
+    def __init__(self, d_model, eps=1e-6):
+        super().__init__()
+        self.size = d_model
+        # create two learnable parameters to calibrate normalisation
+        self.alpha = nn.Parameter(torch.ones(self.size))
+        self.bias = nn.Parameter(torch.zeros(self.size))
+        self.eps = eps
+
+    def forward(self, x):
+        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
+        return norm
+
+
+def attention(q, k, v, d_k, mask=None, dropout=None):
+    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
+    if mask is not None:
+        mask = mask.unsqueeze(1)
+        scores = scores.masked_fill(mask == 0, -1e9)
+    scores = F.softmax(scores, dim=-1)
+    if dropout is not None:
+        scores = dropout(scores)
+    output = torch.matmul(scores, v)
+    return output
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, heads, d_model, dropout=0.1):
+        super().__init__()
+        self.d_model = d_model
+        self.d_k = d_model // heads
+        self.h = heads
+        self.q_linear = nn.Linear(d_model, d_model)
+        self.v_linear = nn.Linear(d_model, d_model)
+        self.k_linear = nn.Linear(d_model, d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.out = nn.Linear(d_model, d_model)
+
+    def forward(self, q, k, v, mask=None):
+        bs = q.size(0)
+        # perform linear operation and split into N heads
+        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
+        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
+        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
+        # transpose to get dimensions bs * N * sl * d_model
+        k = k.transpose(1, 2)
+        q = q.transpose(1, 2)
+        v = v.transpose(1, 2)
+        # calculate attention using the attention function defined above
+        scores = attention(q, k, v, self.d_k, mask, self.dropout)
+        # concatenate heads and put through final linear layer
+        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
+        output = self.out(concat)
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, d_model, d_ff=2048, dropout=0.1):
+        super().__init__()
+        # We set d_ff as a default to 2048
+        self.linear_1 = nn.Linear(d_model, d_ff)
+        self.dropout = nn.Dropout(dropout)
+        self.linear_2 = nn.Linear(d_ff, d_model)
+
+    def forward(self, x):
+        x = self.dropout(F.relu(self.linear_1(x)))
+        x = self.linear_2(x)
+        return x
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+
+    def forward(self, x, mask):
+        x2 = self.norm_1(x)
+        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
+        x2 = self.norm_2(x)
+        x = x + self.dropout_2(self.ff(x2))
+        return x
+
+
+# build a decoder layer with two multi-head attention layers and
+# one feed-forward layer
+class DecoderLayer(nn.Module):
+    def __init__(self, d_model, heads, dropout=0.1):
+        super().__init__()
+        self.norm_1 = Norm(d_model)
+        self.norm_2 = Norm(d_model)
+        self.norm_3 = Norm(d_model)
+
+        self.dropout_1 = nn.Dropout(dropout)
+        self.dropout_2 = nn.Dropout(dropout)
+        self.dropout_3 = nn.Dropout(dropout)
+
+        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
+        self.ff = FeedForward(d_model, dropout=dropout)
+
+    def forward(self, x, e_outputs, src_mask, trg_mask):
+        x2 = self.norm_1(x)
+        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
+        x2 = self.norm_2(x)
+        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
+        x2 = self.norm_3(x)
+        x = x + self.dropout_3(self.ff(x2))
+        return x
+
+
+def get_clones(module, n):
+    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
+
+
+class Encoder(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads, dropout):
+        super().__init__()
+        self.n = n
+        self.embed = Embedding(alphabet_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), n)
+        self.norm = Norm(d_model)
+
+    def forward(self, src, mask):
+        x = self.embed(src)
+        x = self.pe(x)
+        for i in range(self.n):
+            x = self.layers[i](x, mask)
+        return self.norm(x)
+
+
+class Decoder(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads, dropout):
+        super().__init__()
+        self.n = n
+        self.embed = Embedding(alphabet_size, d_model)
+        self.pe = PositionalEncoder(d_model, dropout=dropout)
+        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), n)
+        self.norm = Norm(d_model)
+
+    def forward(self, trg, e_outputs, src_mask, trg_mask):
+        x = self.embed(trg)
+        x = self.pe(x)
+        for i in range(self.n):
+            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
+        return self.norm(x)
+
+
+class Transformer(nn.Module):
+    def __init__(self, alphabet_size, d_model, n, heads=8, dropout=0.1):
+        super().__init__()
+        self.encoder = Encoder(alphabet_size, d_model, n, heads, dropout)
+        self.decoder = Decoder(alphabet_size, d_model, n, heads, dropout)
+        self.out = nn.Linear(d_model, alphabet_size)
+
+    def forward(self, src, trg, src_mask, trg_mask):
+        e_outputs = self.encoder(src, src_mask)
+        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
+        output = self.out(d_output)
+        return output
+
+
+def nopeak_mask(size, device):
+    np_mask = torch.triu(torch.ones((size, size), dtype=torch.uint8), diagonal=1).unsqueeze(0)
+    np_mask = np_mask == 0
+    np_mask = np_mask.to(device)
+    return np_mask
+
+
+def create_masks(src, trg=None, pad_idx=ord(EXTRA_CHARS['pad']), device=None):
+    src_mask = (src != pad_idx).unsqueeze(-2)
+
+    if trg is not None:
+        trg_mask = (trg != pad_idx).unsqueeze(-2)
+        size = trg.size(1)  # get seq_len for matrix
+        np_mask = nopeak_mask(size, device)
+        np_mask.to(device)
+        trg_mask = trg_mask & np_mask
+        return src_mask, trg_mask
+    return src_mask
diff --git a/examples/molecule_search/utils.py b/examples/molecule_search/utils.py
index 31b58d38..74d3c10a 100644
--- a/examples/molecule_search/utils.py
+++ b/examples/molecule_search/utils.py
@@ -1,12 +1,14 @@
-from copy import deepcopy
+import os
+from typing import Tuple, Set, Optional
 
 import networkx as nx
 from rdkit.Chem import GetPeriodicTable
 from rdkit.Chem.rdchem import Atom, RWMol
-from typing import Tuple, Set
+import requests
 
 from examples.molecule_search.constants import SULFUR_DEFAULT_VALENCE
 from examples.molecule_search.mol_graph import MolGraph
+from golem.core.log import default_log
 
 
 def get_default_valence(atom_type: str) -> int:
@@ -39,3 +41,17 @@ def largest_ring_size(rw_molecule: RWMol) -> int:
     if cycle_list:
         largest_cycle_len = max(map(len, cycle_list))
     return largest_cycle_len
+
+
+def download_from_github(save_path: str, github_url: str, message: Optional[str] = None):
+    """ Checks if the file exists; if not, downloads it from the specified URL. """
+    save_dir = os.path.dirname(save_path)
+    os.makedirs(save_dir, exist_ok=True)
+
+    message = message or f"Downloading a file from {github_url} to {save_dir}..."
+
+    if not os.path.exists(save_path):
+        default_log().message(message)
+        response = requests.get(github_url)
+        with open(save_path, "wb") as new_file:
+            new_file.write(response.content)
diff --git a/golem/core/optimisers/genetic/gp_params.py b/golem/core/optimisers/genetic/gp_params.py
index 42749027..d893cb6e 100644
--- a/golem/core/optimisers/genetic/gp_params.py
+++ b/golem/core/optimisers/genetic/gp_params.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Sequence, Union, Any
+from typing import Sequence, Union, Any, Callable
 
 from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
 from golem.core.optimisers.adaptive.mab_agents.neural_contextual_mab_agent import ContextAgentTypeEnum
@@ -76,7 +76,7 @@ class GPAlgorithmParameters(AlgorithmParameters):
     required_valid_ratio: float = 0.9
 
     adaptive_mutation_type: MutationAgentTypeEnum = MutationAgentTypeEnum.default
-    context_agent_type: ContextAgentTypeEnum = ContextAgentTypeEnum.nodes_num
+    context_agent_type: Union[ContextAgentTypeEnum, Callable] = ContextAgentTypeEnum.nodes_num
 
     selection_types: Sequence[SelectionTypesEnum] = \
         (SelectionTypesEnum.tournament,)
diff --git a/other_requirements/molecules.txt b/other_requirements/molecules.txt
index 73f2a0bc..e8021e9b 100644
--- a/other_requirements/molecules.txt
+++ b/other_requirements/molecules.txt
@@ -1,4 +1,7 @@
 rdkit>=2018.09.1.0
 guacamol>=0.5.4
 joblib>=0.12.5
-requests>=2.30.0
\ No newline at end of file
+requests>=2.30.0
+mol2vec @ git+https://github.com/samoturk/mol2vec
+gensim>=4.3.2
+torch>=2.0.1
\ No newline at end of file
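
Usage note (reviewer sketch, not part of the patch): the gp_params.py change above lets
context_agent_type accept any Callable, so the new encoders from mol_encoders.py can be passed
directly as the context function of the adaptive mutation agent. The snippet below is a minimal
sketch under that assumption; the MutationAgentTypeEnum member for the contextual multi-armed
bandit agent is assumed here to be named contextual_bandit and may differ in the actual GOLEM
version.

    from examples.molecule_search.mol_encoders import ECFP, Mol2Vec
    from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
    from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters

    # Plain-function encoder: ECFP maps an observation (molecular graph) to a fingerprint vector.
    params = GPAlgorithmParameters(
        adaptive_mutation_type=MutationAgentTypeEnum.contextual_bandit,  # assumed member name
        context_agent_type=ECFP,  # any Callable is accepted after this patch
    )

    # Stateful encoders work the same way, since their instances are callable;
    # Mol2Vec() fetches its pretrained word2vec model on first construction.
    params_mol2vec = GPAlgorithmParameters(
        adaptive_mutation_type=MutationAgentTypeEnum.contextual_bandit,  # assumed member name
        context_agent_type=Mol2Vec(),
    )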