Covalent ligand example + bugfix (#244)

* Update README with covalent ligand example; rename some files * Fix dtypes * Improve logging * Typo fix + assert check * Correct restraint * Update README * README updates * Rename folder * Update README * Update README
chaidiscovery · Dec 12, 2024 · 696270c · 696270c
1 parent b6e7fa1
commit 696270c
Show file tree

Hide file tree

Showing 13 changed files with 57 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -102,7 +102,7 @@ We provide a [web server](https://lab.chaidiscovery.com) so you can test the Cha
 </p>
 
 ## Using experimental restraints
-Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) for details.
+Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts or covalent bonds at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) and [covalent bond documentation](examples/covalent_bonds/README.md) for details.
 
 <p align="center">
     <img src='assets/chailab_restraints_screenshot.png' height=400 >

diff --git a/chai_lab/chai1.py b/chai_lab/chai1.py
@@ -400,12 +400,17 @@ def make_all_atom_feature_context(
         if cov_a.numel() > 0 and cov_b.numel() > 0:
             orig_a, orig_b = merged_context.atom_covalent_bond_indices
             if orig_a.numel() == orig_b.numel() == 0:
-                merged_context.atom_covalent_bond_indices = (orig_a, orig_b)
+                merged_context.atom_covalent_bond_indices = (cov_a, cov_b)
             else:
                 merged_context.atom_covalent_bond_indices = (
                     torch.concatenate([orig_a, cov_a]),
                     torch.concatenate([orig_b, cov_b]),
                 )
+            assert (
+                merged_context.atom_covalent_bond_indices[0].numel()
+                == merged_context.atom_covalent_bond_indices[1].numel()
+                > 0
+            )
     else:
         restraint_context = RestraintContext.empty()
 

diff --git a/chai_lab/data/dataset/structure/all_atom_structure_context.py b/chai_lab/data/dataset/structure/all_atom_structure_context.py
@@ -105,7 +105,7 @@ def report_bonds(self) -> None:
             res_idx_b = self.token_residue_index[tok_b]
             resname_a = tensorcode_to_string(self.token_residue_name[tok_a])
             resname_b = tensorcode_to_string(self.token_residue_name[tok_b])
-            logging.info(
+            logger.info(
                 f"Bond {i} (asym res_idx resname): {asym_a} {res_idx_a} {resname_a} <> {asym_b} {res_idx_b} {resname_b}"
             )
 

diff --git a/chai_lab/data/dataset/structure/bond_utils.py b/chai_lab/data/dataset/structure/bond_utils.py
@@ -2,6 +2,7 @@
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 
+
 import torch
 from einops import rearrange
 from torch import Tensor
@@ -129,7 +130,7 @@ def get_atom_covalent_bond_pairs_from_constraints(
                 pass
             case _:
                 raise ValueError(f"Unrecognized pariwise interaction: {ctype}")
-    return torch.tensor(ret_a, dtype=torch.int), torch.tensor(ret_b, dtype=torch.int)
+    return torch.tensor(ret_a, dtype=torch.long), torch.tensor(ret_b, dtype=torch.long)
 
 
 @typecheck

diff --git a/examples/glycosylation/1ac5.fasta → examples/covalent_bonds/1ac5.fasta b/examples/glycosylation/1ac5.fasta → examples/covalent_bonds/1ac5.fasta
diff --git a/examples/glycosylation/bonds.restraints → examples/covalent_bonds/1ac5.restraints b/examples/glycosylation/bonds.restraints → examples/covalent_bonds/1ac5.restraints
diff --git a/examples/covalent_bonds/8cyo.fasta b/examples/covalent_bonds/8cyo.fasta
@@ -0,0 +1,4 @@
+>protein|8cyo-protein
+MKKGHHHHHHGAISLISALVRAHVDSNPAMTSLDYSRFQANPDYQMSGDDTQHIQQFYDLLTGSMEIIRGWAEKIPGFADLPKADQDLLFESAFLELFVLRLAYRSNPVEGKLIFCNGVVLHRLQCVRGFGEWIDSIVEFSSNLQNMNIDISAFSCIAALAMVTERHGLKEPKRVEELQNKIVNTLKDHVTFNNGGLNRPNYLSKLLGKLPELRTLCTQGLQRIFYLKLEDLVPPPAIIDKLFLDTLPF
+>ligand|8cyo-ligand
+c1cc(c(cc1OCC(=O)NCCS)Cl)Cl
diff --git a/examples/covalent_bonds/8cyo.restraints b/examples/covalent_bonds/8cyo.restraints
@@ -0,0 +1,2 @@
+chainA,res_idxA,chainB,res_idxB,connection_type,confidence,min_distance_angstrom,max_distance_angstrom,comment,restraint_id
+A,C217@SG,B,@S1,covalent,1.0,0.0,0.0,protein-ligand,bond1
diff --git a/examples/glycosylation/README.md → examples/covalent_bonds/README.md b/examples/glycosylation/README.md → examples/covalent_bonds/README.md
@@ -1,9 +1,9 @@
 # Working with bond restraints
 
-Chai-1 supports specifying covalent bonds as input, which specify covalent linkages between atoms in the folded complex. This is useful for specifying covalent modifications such as glycosylation events, which we demonstrate below, but can be generally used to specify arbitrary "non-canonical" bonds in a structure. 
+Chai-1 supports specifying covalent bonds as input, which specify covalent linkages between atoms in the folded complex. This is useful for specifying covalent modifications such as glycosylation events, but can be generally used to specify arbitrary "non-canonical" bonds in a structure; we demonstrate both cases below.
 
 A few notes:
-- Chai-1 was not trained on disulfide bonds, and we have not evaluated whether specifying such bond information yields expected behaviors. 
+- Chai-1 was not trained on intra-chain bonds (e.g., disulfides), and we have not evaluated whether specifying such bond information yields expected behaviors.
 - These bond restraints should not be used to specify modified amino acids that already have an associated CCD code; for these examples, include the modified residue's CCD code in parentheses directly in the sequence in place of its canonical residue, e.g., `RKDES(MSE)EES` to specify a selenomethionine at the 6th position.
 
 ## Glycans
@@ -61,12 +61,20 @@ NAG(4-1 NAG(4-1 BMA(3-1 MAN)(6-1 MAN)))
 ```
 This branched example has a root `NAG` ring followed by a `NAG` and a `BMA`, which then branches to two `MAN` rings. For additional examples of this syntax, please refer to the examples in `tests/test_glycans.py`.
 
-### Example
+### Glycan example
 
 We have included an example of how glycans can be specified under `predict_glycosylated.py` in this directory, along with its corresponding `bonds.restraints` csv file. This example is based on the PDB structure [1AC5](https://www.rcsb.org/structure/1ac5). The predicted structrue (colored, glycans in purple and orange, protein in green) from this script should look like the following when aligned with the ground truth 1AC5 structure (gray):
 
 ![glycan example prediction](./output.png)
 
-### A note on leaving atoms
+## Non-glycan ligands
+
+You can also use covalent bonds to "connect" residues to non-glycan ligands. To demonstrate this, we use a single subunit from the homodimer [8CYO](https://www.rcsb.org/structure/8cyo). We specify a fasta file with a protein sequence and SMILES string in `8cyo.fasta` (we use SMILES for demonstration purposes even though this specific ligand has a CCD code) and the corresponding restraints in `8cyo.restraints`. Folding this example with `predict_covalent_ligand.py` yields the following structure (RCSB ground truth structure shown in gray).
+
+![non glycan example prediction](./non_glycan_output.png)
+
+## A note on leaving atoms
+
+One might notice that in the above example, we are specifying CCD codes for sugar rings and connecting them to each other and an amino acid residue via various bonds. A subtle point is that the reference conformer for these sugar rings include OH hydroxyl groups that leave when bonds are formed. Under the hood, Chai-1 tries to automatically find and remove these atoms (see `AllAtomStructureContext.drop_glycan_leaving_atoms_inplace` for implementation), but this logic only drops leaving hydroxyl groups within glycan sugar rings. For other, non-sugar covalently attached ligands, please specify a SMILES string *without* the leaving atoms. If this does not work for your use case, please open a GitHub issue.
+
 
-One might notice that in the above example, we are specifying CCD codes for sugar rings and connecting them to each other and an amino acid residue via various bonds. A subtle point is that the reference conformer for these sugar rings include OH hydroxyl groups that leave when bonds are formed. Under the hood, Chai-1 tries to automatically find and remove these atoms (see `AllAtomStructureContext.drop_glycan_leaving_atoms_inplace` for implementation), but this logic only drops leaving hydroxyl groups within glycan sugar rings. For other, non-sugar covalently attached ligands, please specify a SMILES string without the leaving atoms. If this does not work for your use case, please open a GitHub issue.
diff --git a/examples/covalent_bonds/non_glycan_output.png b/examples/covalent_bonds/non_glycan_output.png
diff --git a/examples/glycosylation/output.png → examples/covalent_bonds/output.png b/examples/glycosylation/output.png → examples/covalent_bonds/output.png
diff --git a/examples/covalent_bonds/predict_covalent_ligand.py b/examples/covalent_bonds/predict_covalent_ligand.py
@@ -0,0 +1,27 @@
+import logging
+import shutil
+from pathlib import Path
+
+from chai_lab.chai1 import run_inference
+
+logging.basicConfig(level=logging.INFO)
+
+# Inference expects an empty directory; enforce this
+output_dir = Path("/workspaces/chai-lab/tmp/outputs")
+if output_dir.exists():
+    logging.warning(f"Removing old output directory: {output_dir}")
+    shutil.rmtree(output_dir)
+output_dir.mkdir(exist_ok=True, parents=True)
+
+candidates = run_inference(
+    fasta_file=Path(__file__).with_name("8cyo.fasta"),
+    output_dir=output_dir,
+    constraint_path=Path(__file__).with_name("8cyo.restraints"),
+    num_trunk_recycles=3,
+    num_diffn_timesteps=200,
+    seed=42,
+    device="cuda:0",
+    use_esm_embeddings=True,
+)
+
+cif_paths = candidates.cif_paths
diff --git a/...les/glycosylation/predict_glycosylated.py → ...es/covalent_bonds/predict_glycosylated.py b/...les/glycosylation/predict_glycosylated.py → ...es/covalent_bonds/predict_glycosylated.py
@@ -16,7 +16,7 @@
 candidates = run_inference(
     fasta_file=Path(__file__).with_name("1ac5.fasta"),
     output_dir=output_dir,
-    constraint_path=Path(__file__).with_name("bonds.restraints"),
+    constraint_path=Path(__file__).with_name("1ac5.restraints"),
     num_trunk_recycles=3,
     num_diffn_timesteps=200,
     seed=42,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		chainA,res_idxA,chainB,res_idxB,connection_type,confidence,min_distance_angstrom,max_distance_angstrom,comment,restraint_id
		A,C217@SG,B,@S1,covalent,1.0,0.0,0.0,protein-ligand,bond1