Skip to content

Commit

Permalink
ia() and get_parents() (#33)
Browse files Browse the repository at this point in the history
* ia() and get_parents() for #28

* filtering to ensure children and parents are candidate terms, not actual terms

* ancestors_within_distance for max_distance param to get_parents()
  • Loading branch information
nthiad authored Jun 1, 2023
1 parent 2db512e commit 354a64e
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 9 deletions.
53 changes: 44 additions & 9 deletions manas_cafa5/protein.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .structure import Structure, STRUCTURE_TERMS
from .utils import ancestors_within_distance
import xml.parsers.expat as xml_parser
import requests
import re
import numpy as np
import obonet, networkx
from functools import reduce
import math

# One-letter amino-acid codes; a letter's position here is its index value below.
AMINO_ACID_LIST = 'ARNDCEQGHILKMFPSTWYV'
# Maps each letter of AMINO_ACID_LIST to its position in that string.
AMINO_ACID_INDEX = {
    letter: position for position, letter in enumerate(AMINO_ACID_LIST)
}
Expand Down Expand Up @@ -76,23 +78,48 @@ def get_term_types(self):
def get_terms(self, term_type):
    """Return the terms stored under ``term_type`` (matched in lower case).

    Looks up ``term_type.lower()`` in ``self.terms``. A missing key or a
    falsy stored value (for example an empty list) yields a new empty list.
    """
    stored = self.terms.get(term_type.lower())
    if not stored:
        return []
    return stored

def get_children(self, term_type, graph, max_distance=1):
    """Return candidate child terms near this object's terms of ``term_type``.

    For every distance from 1 to ``max_distance`` inclusive, collects the
    nodes that ``networkx.descendants_at_distance`` reports for each of the
    terms returned by ``self.get_terms(term_type)``. Ids that are already
    among those terms are left out, so only candidates are returned.

    Args:
        term_type: key passed to ``self.get_terms``.
        graph: graph handed to ``networkx.descendants_at_distance``.
        max_distance: largest distance to include (default 1).

    Returns:
        A list of dicts with keys ``'type'``, ``'id'`` and ``'properties'``,
        one per collected id, in no particular order. ``'type'`` is always
        ``'go'``, whatever ``term_type`` was.
    """
    terms_list = self.get_terms(term_type)
    # Ids the object already has; these must not be reported as candidates.
    terms_set = {term['id'] for term in terms_list}
    children = set()
    for dist in range(1, max_distance + 1):
        for term in terms_list:
            children = children.union({
                child
                for child in networkx.descendants_at_distance(graph, term['id'], dist)
                if child not in terms_set
            })
    return [
        {
            'type': 'go',
            'id': term_id,
            'properties': {},
        }
        for term_id in children
    ]

def get_parents(self, term_type, graph, max_distance=1):
    """Return candidate parent terms near this object's terms of ``term_type``.

    Calls ``ancestors_within_distance(graph, id, max_distance)`` for each
    term returned by ``self.get_terms(term_type)`` and merges the results,
    leaving out ids that are already among those terms.

    Args:
        term_type: key passed to ``self.get_terms``.
        graph: graph handed to ``ancestors_within_distance``.
        max_distance: distance bound forwarded to that helper (default 1).

    Returns:
        A list of dicts with keys ``'type'``, ``'id'`` and ``'properties'``,
        one per collected id, in no particular order. ``'type'`` is always
        ``'go'``, whatever ``term_type`` was.
    """
    own_terms = self.get_terms(term_type)
    own_ids = set([entry['id'] for entry in own_terms])
    collected = set()
    for entry in own_terms:
        nearby = ancestors_within_distance(graph, entry['id'], max_distance)
        # Keep only nodes that are not already one of the object's own terms.
        collected = collected.union({
            node for node in nearby if node not in own_ids
        })
    result = []
    for node_id in collected:
        result.append({
            'type': 'go',
            'id': node_id,
            'properties': {},
        })
    return result

def go_terms(self):
Expand All @@ -101,11 +128,19 @@ def go_terms(self):
def go_terms_children(self, graph, max_distance=1):
    """Return ``self.get_children('go', graph, max_distance)``.

    ``max_distance`` defaults to 1, the same default ``get_children`` uses,
    so both entry points can be called without it.
    """
    return self.get_children('go', graph, max_distance)

def go_terms_parents(self, graph, max_distance=1):
    """Return ``self.get_parents('go', graph, max_distance)``.

    ``max_distance`` defaults to 1, the same default ``get_parents`` uses,
    so both entry points can be called without it.
    """
    return self.get_parents('go', graph, max_distance)

@staticmethod
def build_graph(url_or_file):
    """Read an OBO ontology with ``obonet.read_obo`` and return its result.

    ``url_or_file`` is passed to ``obonet.read_obo`` unchanged.
    Example URL: https://current.geneontology.org/ontology/go-basic.obo
    """
    ontology_graph = obonet.read_obo(url_or_file)
    return ontology_graph

def ia(self, term_type, graph):
    """Return ``log2((1 + P) / (1 + T))`` for this object's terms.

    ``P`` is the number of entries from ``self.get_parents(term_type, graph)``
    (called with its default distance) and ``T`` is the number of entries
    from ``self.get_terms(term_type)``. Adding one to each count keeps both
    sides of the ratio non-zero.
    """
    numerator = 1.0 + float(len(self.get_parents(term_type, graph)))
    denominator = 1.0 + float(len(self.get_terms(term_type)))
    return math.log2(numerator / denominator)

def one_hot_sequence(self):
n = len(self.sequence)
seq = np.ndarray(shape=(n,20), dtype=float, order='C')
Expand Down
10 changes: 10 additions & 0 deletions manas_cafa5/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import networkx
import requests, ftplib
from urllib.parse import urlparse
from functools import reduce

def fetch_url(url):
if url.find('ftp:') == 0:
Expand All @@ -16,3 +18,11 @@ def fetch_url(url):
if r.status_code != 200:
raise RuntimeError(f'unexpected status code while fetching {url}: {r.status_code}')
return r.content

def ancestors_within_distance(graph, term, max_distance):
    """Return the nodes that reach ``term`` in at most ``max_distance`` edges.

    Walks the graph breadth-first, one level of ``graph.predecessors`` per
    step, and stops after ``max_distance`` levels. The previous version
    called ``networkx.ancestors``, which returns every transitive ancestor,
    so any ``max_distance >= 1`` produced the full ancestor set.

    Args:
        graph: object exposing ``predecessors(node)``. Graphs without that
            method (undirected ones) fall back to ``neighbors(node)``.
        term: starting node; it is never part of the result.
        max_distance: maximum number of edges between a returned node and
            ``term``. Zero or a negative value gives an empty set.

    Returns:
        A set of nodes, excluding ``term`` itself.

    NOTE(review): obonet's documentation says its edges run from child term
    to parent term, which would make graph predecessors the ontology's
    sub-terms. Confirm that this is the direction callers intend.
    """
    step = getattr(graph, 'predecessors', None)
    if step is None:
        step = graph.neighbors

    # Seeding `seen` with the start node keeps a cycle from re-adding it.
    seen = {term}
    frontier = {term}
    for _ in range(max_distance):
        next_frontier = set()
        for node in frontier:
            next_frontier.update(step(node))
        next_frontier -= seen
        if not next_frontier:
            break
        seen |= next_frontier
        frontier = next_frontier
    seen.discard(term)
    return seen

0 comments on commit 354a64e

Please sign in to comment.