-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
159 lines (135 loc) · 5.89 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Notebook-only setup: delete this cell if working in PyCharm
# !pip install Bio
from Bio.PDB import *
import numpy as np
import os
from tqdm import tqdm
# Every nanobody sequence is truncated / padded to this many residues.
NB_MAX_LENGTH = 140

# One-letter residue code -> column of the one-hot encoding.
# "X" stands for an unknown residue and "-" is the padding symbol.
AA_DICT = {
    letter: column
    for column, letter in enumerate("ACDEFGHIKLMNPQRSTWYVX-")
}
FEATURE_NUM = len(AA_DICT)

# Atoms whose coordinates are stored per residue; each atom owns three
# consecutive columns (x, y, z) of a coordinate row.
BACKBONE_ATOMS = ["N", "CA", "C", "O", "CB"]
BACKBONE_ATOMS_DICT = {
    atom: [3 * slot, 3 * slot + 1, 3 * slot + 2]
    for slot, atom in enumerate(BACKBONE_ATOMS)
}
OUTPUT_SIZE = 3 * len(BACKBONE_ATOMS)

# Row indices 0..NB_MAX_LENGTH-1, used to fill one-hot matrices in one step.
ROW_IND = np.arange(NB_MAX_LENGTH)

# Chain id that is read from every pdb file.
NB_CHAIN_ID = "H"
def get_seq_aa(pdb_file, chain_id):
    """
    returns the sequence (String) and a list of all the aa residue objects of the given protein chain.
    Residues that are not amino acids, or that have no CA atom, are skipped. Every amino acid
    outside the 20 standard ones (including "UNK") is written as "X" in the sequence.
    :param pdb_file: path to a pdb file
    :param chain_id: chain letter (char)
    :return: sequence, [aa objects] (one residue object per sequence letter, in chain order)
    """
    # load the first model of the structure and pick the requested chain
    chain = PDBParser(QUIET=True).get_structure(pdb_file, pdb_file)[0][chain_id]

    aa_residues = []
    letters = []
    for residue in chain.get_residues():
        aa = residue.get_resname()
        if not is_aa(aa) or not residue.has_id('CA'):  # not an amino acid / no CA atom
            continue
        if is_aa(aa, standard=True):
            # NOTE(review): three_to_one is reported as deprecated in recent Biopython
            # releases - confirm against the version this project pins.
            letters.append(Polypeptide.three_to_one(aa))
        else:
            # is_aa() without standard=True also accepts unknown / modified residues,
            # which three_to_one() cannot translate (KeyError); encode them as "X".
            letters.append("X")
        aa_residues.append(residue)
    return "".join(letters), aa_residues
def generate_input(pdb_file):
    """
    Builds the one-hot encoded sequence matrix of a nanobody pdb file: one row per sequence
    position and one column per entry of AA_DICT (the 20 aa + "X" + the padding symbol "-").
    The sequence is cut to NB_MAX_LENGTH residues, or padded with "-" up to that length.
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H')
    :return: numpy array of shape (NB_MAX_LENGTH, FEATURE_NUM)
    """
    sequence, _ = get_seq_aa(pdb_file, NB_CHAIN_ID)
    # slicing truncates long sequences, ljust pads short ones
    fixed_len_seq = sequence[:NB_MAX_LENGTH].ljust(NB_MAX_LENGTH, '-')
    columns = [AA_DICT[letter] for letter in fixed_len_seq]
    # row k of the identity matrix is the one-hot vector of column k
    return np.eye(FEATURE_NUM)[columns]
def generate_ind(pdb_file):
    """
    Reads the sequence of a nanobody pdb file and converts it to AA_DICT column indices.
    The indices describe the sequence after it was cut / padded with "-" to NB_MAX_LENGTH,
    while the returned sequence itself is left as it was read (no cut, no padding).
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H')
    :return: sequence, list of NB_MAX_LENGTH column indices (one per position)
    """
    raw_seq, _ = get_seq_aa(pdb_file, NB_CHAIN_ID)
    # slicing truncates long sequences, ljust pads short ones
    fixed_len_seq = raw_seq[:NB_MAX_LENGTH].ljust(NB_MAX_LENGTH, '-')
    column_ids = list(map(AA_DICT.__getitem__, fixed_len_seq))
    return raw_seq, column_ids
def generate_label(pdb_file):
    """
    receives a pdb file and returns its backbone + CB coordinates.
    Row i holds the x, y, z coordinates of the atoms of residue i, laid out according to
    BACKBONE_ATOMS_DICT. Only the first NB_MAX_LENGTH residues are used; rows past the end of
    the chain and the CB columns of GLY residues stay zero.
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H') already aligned to a reference nanobody.
    :return: numpy array of shape (NB_MAX_LENGTH, OUTPUT_SIZE).
    """
    # only the residue objects are needed here, the sequence string is ignored
    _, aa_residues = get_seq_aa(pdb_file, NB_CHAIN_ID)

    matrix = np.zeros((NB_MAX_LENGTH, OUTPUT_SIZE))
    # slice with the shared constant so the row count always matches the matrix
    for i, r in enumerate(aa_residues[:NB_MAX_LENGTH]):
        for part, indices in BACKBONE_ATOMS_DICT.items():
            if part == "CB" and r.resname == "GLY":  # GLY lacks CB atom
                continue
            # no special handling for other missing atoms: the lookup r[part] fails for them
            matrix[i, indices] = r[part].coord
    return matrix
# One-letter -> three-letter residue names used when writing ATOM records.
# "X" (the unknown-residue letter produced by get_seq_aa) is written as "UNK".
_ONE_TO_THREE = {
    "A": "ALA", "C": "CYS", "D": "ASP", "E": "GLU", "F": "PHE",
    "G": "GLY", "H": "HIS", "I": "ILE", "K": "LYS", "L": "LEU",
    "M": "MET", "N": "ASN", "P": "PRO", "Q": "GLN", "R": "ARG",
    "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
    "X": "UNK",
}


def matrix_to_pdb(seq, coord_matrix, pdb_name, chain_id=None, atoms=None):
    """
    Receives a sequence (String) and the output matrix of the neural network (coord_matrix, numpy array)
    and creates from them a PDB file named pdb_name.pdb.
    Residues are numbered from 0 and atom serial numbers from 1. No CB atom is written for GLY.
    If seq is longer than coord_matrix, only the residues that have a coordinate row are written.
    :param seq: protein sequence (String), with no padding
    :param coord_matrix: output np array of the nanobody neural network, shape = (NB_MAX_LENGTH, OUTPUT_SIZE)
    :param pdb_name: name of the output PDB file (String), without the ".pdb" extension
    :param chain_id: chain letter written to every ATOM record (default: NB_CHAIN_ID)
    :param atoms: atom names, in the order of their x, y, z column triplets (default: BACKBONE_ATOMS)
    :raises KeyError: if seq contains a letter that is not one of the 20 aa letters or "X"
    """
    if chain_id is None:
        chain_id = NB_CHAIN_ID
    if atoms is None:
        atoms = BACKBONE_ATOMS

    # translate first, so an invalid letter raises before any file is created
    res_names = [_ONE_TO_THREE[aa] for aa in seq]

    b_factor = 0.00
    serial = 1
    with open(f"{pdb_name}.pdb", "w") as pdb_file:
        # zip stops at the shorter input: residues without a coordinate row are dropped
        for i, (res_name, row) in enumerate(zip(res_names, coord_matrix)):
            for j, atom in enumerate(atoms):
                if res_name == "GLY" and atom == "CB":  # GLY lacks CB atom
                    continue
                x, y, z = row[3 * j], row[3 * j + 1], row[3 * j + 2]
                # Fixed-column ATOM record; every gap is spelled out as " " * n so the
                # column positions do not depend on whitespace inside a string literal.
                pdb_file.write(
                    "ATOM" + f"{serial:>7}"            # columns 1-11: record name + serial
                    + " " * 2                           # columns 12-13
                    + f"{atom:<4}"                      # columns 14-17: atom name
                    + f"{res_name:>3}"                  # columns 18-20: residue name
                    + " " + f"{chain_id}"               # columns 21-22: blank + chain id
                    + f"{i:>4}"                         # columns 23-26: residue number
                    + f"{x:12.3f}{y:8.3f}{z:8.3f}"      # columns 27-54: x ends at 38, y at 46, z at 54
                    + f"{1.0:6.2f}{b_factor:6.2f}"      # columns 55-66: occupancy, b-factor
                    + " " * 11                          # columns 67-77
                    + atom[0] + "\n"                    # column 78: element symbol
                )
                serial += 1
        pdb_file.write("END\n")
if __name__ == '__main__':
    # Build the training set: one one-hot input matrix and one coordinate label matrix
    # per file of the data directory, saved as two .npy files in the current directory.
    input_matrix = []
    labels_matrix = []
    data_path = "Ex4Data"

    # os.listdir returns the entries in arbitrary order; sorting keeps the sample
    # order of the saved arrays reproducible between runs and machines.
    for pdb in tqdm(sorted(os.listdir(data_path))):
        pdb_path = os.path.join(data_path, pdb)
        input_matrix.append(generate_input(pdb_path))
        labels_matrix.append(generate_label(pdb_path))

    np.save("train_input.npy", np.array(input_matrix))
    np.save("train_labels.npy", np.array(labels_matrix))

    print(f"Number of samples: {len(input_matrix)}")