From 354a64e958498f116ece288d7b3d54d15f9cee40 Mon Sep 17 00:00:00 2001
From: synthia <124930260+nthiad@users.noreply.github.com>
Date: Thu, 1 Jun 2023 09:41:18 -0700
Subject: [PATCH] ia() and get_parents() (#33)

* ia() and get_parents() for #28

* filtering to ensure children and parents are candidate terms, not actual terms

* ancestors_within_distance for max_distance param to get_parents()
---
Review notes (ignored by git am, not part of the commit message):
- The line structure of this patch was restored from a whitespace-collapsed
  copy. Indentation of context and removed lines assumes 4-space levels;
  verify against the target tree before applying.
- ancestors_within_distance() now walks graph.predecessors() one level at a
  time. The earlier form called networkx.ancestors(), which returns every
  transitive ancestor, so max_distance had no effect. The utils.py hunk
  header and the diffstat below are updated to match; the post-image blob id
  on the utils.py index line is stale.

 manas_cafa5/protein.py | 53 +++++++++++++++++++++++++++++++++++-------
 manas_cafa5/utils.py   | 29 +++++++++++++++++++++++
 2 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/manas_cafa5/protein.py b/manas_cafa5/protein.py
index 44ef2ef..539f73c 100644
--- a/manas_cafa5/protein.py
+++ b/manas_cafa5/protein.py
@@ -1,10 +1,12 @@
 from .structure import Structure, STRUCTURE_TERMS
+from .utils import ancestors_within_distance
 import xml.parsers.expat as xml_parser
 import requests
 import re
 import numpy as np
 import obonet, networkx
 from functools import reduce
+import math
 
 AMINO_ACID_LIST = 'ARNDCEQGHILKMFPSTWYV'
 AMINO_ACID_INDEX = { a: AMINO_ACID_LIST.find(a) for a in AMINO_ACID_LIST }
@@ -76,15 +78,19 @@ def get_term_types(self):
     def get_terms(self, term_type):
         return self.terms.get(term_type.lower()) or []
 
-    def get_children(self, term_type, graph, max_distance):
-        term_set = set()
+    def get_children(self, term_type, graph, max_distance=1):
+        terms_list = self.get_terms(term_type)
+        terms_set = set([ term['id'] for term in terms_list ])
+        children = set()
         for dist in range(1,max_distance+1):
-            term_set = reduce(
-                lambda terms, term: terms.union(
-                    networkx.descendants_at_distance(graph, term['id'], dist)
-                ),
-                self.get_terms(term_type),
-                term_set
+            children = reduce(
+                lambda terms, term: terms.union({
+                    child
+                    for child in networkx.descendants_at_distance(graph, term['id'], dist)
+                    if child not in terms_set
+                }),
+                terms_list,
+                children
             )
         return [
             {
@@ -92,7 +98,28 @@ def get_children(self, term_type, graph, max_distance):
                 'id': term_id,
                 'properties': {},
             }
-            for term_id in term_set
+            for term_id in children
+        ]
+
+    def get_parents(self, term_type, graph, max_distance=1):
+        terms_list = self.get_terms(term_type)
+        terms_set = set([ term['id'] for term in terms_list ])
+        parents = reduce(
+            lambda terms, term: terms.union({
+                parent
+                for parent in ancestors_within_distance(graph, term['id'], max_distance)
+                if parent not in terms_set
+            }),
+            terms_list,
+            set()
+        )
+        return [
+            {
+                'type': 'go',
+                'id': term_id,
+                'properties': {},
+            }
+            for term_id in parents
         ]
 
     def go_terms(self):
@@ -101,11 +128,19 @@ def go_terms(self):
     def go_terms_children(self, graph, max_distance):
         return self.get_children('go', graph, max_distance)
 
+    def go_terms_parents(self, graph, max_distance):
+        return self.get_parents('go', graph, max_distance)
+
     @staticmethod
     def build_graph(url_or_file):
         # example url to use: https://current.geneontology.org/ontology/go-basic.obo
         return obonet.read_obo(url_or_file)
 
+    def ia(self, term_type, graph):
+        parent_count = float(len(self.get_parents(term_type, graph)))
+        term_count = float(len(self.get_terms(term_type)))
+        return math.log2((1.0 + parent_count) / (1.0 + term_count))
+
     def one_hot_sequence(self):
         n = len(self.sequence)
         seq = np.ndarray(shape=(n,20), dtype=float, order='C')
diff --git a/manas_cafa5/utils.py b/manas_cafa5/utils.py
index c5e6608..7a94874 100644
--- a/manas_cafa5/utils.py
+++ b/manas_cafa5/utils.py
@@ -1,5 +1,7 @@
+import networkx
 import requests, ftplib
 from urllib.parse import urlparse
+from functools import reduce
 
 def fetch_url(url):
     if url.find('ftp:') == 0:
@@ -16,3 +18,30 @@ def fetch_url(url):
     if r.status_code != 200:
         raise RuntimeError(f'unexpected status code while fetching {url}: {r.status_code}')
     return r.content
+
+def ancestors_within_distance(graph, term, max_distance):
+    """Return the nodes that reach ``term`` in at most ``max_distance`` edges.
+
+    Walks ``graph.predecessors`` one level at a time, so only nodes within
+    ``max_distance`` directed edges of ``term`` are collected; ``term``
+    itself is never part of the result.
+
+    NOTE(review): obonet graphs appear to point edges from a term to its
+    parent term, which would make graph predecessors the ontology subterms.
+    Confirm the direction matches what get_parents() expects.
+    """
+    seen = {term}
+    frontier = {term}
+    for _ in range(max_distance):
+        # Expand one edge further, keeping only nodes not collected yet.
+        frontier = {
+            parent
+            for node in frontier
+            for parent in graph.predecessors(node)
+            if parent not in seen
+        }
+        if not frontier:
+            break
+        seen |= frontier
+    seen.discard(term)
+    return seen