From d341b2a67fd06aeb9256aa8cd78332e282a3d838 Mon Sep 17 00:00:00 2001 From: Steve Sansom Date: Wed, 14 Feb 2018 00:16:45 +0000 Subject: [PATCH 1/2] Fix GO.py for dumping Ensembl annotations for versions > 88. After version 88 the ID mappings were being lost in translation (in the translation table). Mapping between translation.translation_id and object_xref.ensembl_id no longer works. Instead, mapping between transcript.transcript_id and object_xref.ensembl_id now appears to work as expected (returning slightly more GO IDs for successively newer annotation sets). --- CGAT/GO.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/CGAT/GO.py b/CGAT/GO.py index f94cc4c1..3c7f656e 100644 --- a/CGAT/GO.py +++ b/CGAT/GO.py @@ -430,7 +430,7 @@ def MapGO2Slims(gene2go, go2slim, ontology=None): """filter gene2go lookup by a list of go_ids in go2slim. gene2go: map of genes to go terms - go2slim: map of go categories to goslim go categories + go2slim: map of go categories to goslim go categories If ontology is given, missing descriptions of go entries are added from the ontology. 
@@ -645,7 +645,7 @@ def GetGOStatement(go_type, database, species): go_field = "acc" statement = """SELECT DISTINCTROW g.stable_id, xref.dbprimary_acc, go.name, 'NA' - FROM gene, transcript, translation, + FROM gene, transcript, translation, gene_stable_id as g, object_xref as o, xref, %(go_database)s.term AS go WHERE gene.gene_id = transcript.gene_id @@ -664,7 +664,7 @@ def GetGOStatement(go_type, database, species): statement = """SELECT DISTINCTROW g.stable_id, xref.dbprimary_acc, go.name, 'NA' - FROM gene, transcript, translation, + FROM gene, transcript, translation, gene_stable_id as g, object_xref as o, xref, %(go_database)s.term AS go, %(go_database)s.ontology AS ontology @@ -674,19 +674,18 @@ def GetGOStatement(go_type, database, species): AND translation.translation_id = o.ensembl_id AND xref.xref_id = o.xref_id AND go.%(go_field)s = xref.dbprimary_acc - AND go.ontology_id = ontology.ontology_id + AND go.ontology_id = ontology.ontology_id AND ontology.namespace = '%(go_type)s' AND xref.external_db_id = 1000 """ % locals() - else: - + elif version <= 88: go_database = "ensembl_ontology_%s" % version go_field = "accession" statement = """SELECT DISTINCTROW gene.stable_id, xref.dbprimary_acc, go.name, 'NA' - FROM gene, transcript, translation, + FROM gene, transcript, translation, object_xref as o, xref, %(go_database)s.term AS go, %(go_database)s.ontology AS ontology @@ -695,10 +694,30 @@ def GetGOStatement(go_type, database, species): AND translation.translation_id = o.ensembl_id AND xref.xref_id = o.xref_id AND go.%(go_field)s = xref.dbprimary_acc - AND go.ontology_id = ontology.ontology_id + AND go.ontology_id = ontology.ontology_id AND ontology.namespace = '%(go_type)s' AND xref.external_db_id = 1000 """ % locals() + + else: + go_database = "ensembl_ontology_%s" % version + go_field = "accession" + + statement = """SELECT DISTINCTROW + gene.stable_id, xref.dbprimary_acc, go.name, 'NA' + FROM gene, transcript, + object_xref as o, xref, + 
%(go_database)s.term AS go, + %(go_database)s.ontology AS ontology + WHERE gene.gene_id = transcript.gene_id + AND transcript.transcript_id = o.ensembl_id + AND xref.xref_id = o.xref_id + AND go.%(go_field)s = xref.dbprimary_acc + AND go.ontology_id = ontology.ontology_id + AND ontology.namespace = '%(go_type)s' + AND xref.external_db_id = 1000 + """ % locals() + else: raise "unknown ensmart version %s" % database @@ -875,7 +894,7 @@ def countGOs(gene2gos): def ReadGeneLists(filename_genes, gene_pattern=None): """read gene lists from filename in matrix. - returns a tuple (list of all genes, dictionary of gene lists) + returns a tuple (list of all genes, dictionary of gene lists) """ if filename_genes == "-": @@ -910,7 +929,7 @@ def ReadGeneLists(filename_genes, gene_pattern=None): def buildGO2Genes(gene2gos, ancestors=None): - '''invert the dictionary genes2go. + '''invert the dictionary genes2go. If ancestors is given, add missing ancestral information. ''' @@ -1014,7 +1033,7 @@ def outputResults(outfile, '''output GO results to outfile. If foreground is given, output a list of gene identifiers in the - foreground. + foreground. If gene2name is given, output a columns with gene names (instead of identifiers) @@ -1313,7 +1332,7 @@ def computeFDRs(go_results, def getFileName(options, **kwargs): '''return a filename - Placeholders in filename are string-substituted with the + Placeholders in filename are string-substituted with the dictionary in kwargs. ''' if options.output_filename_pattern: @@ -1438,7 +1457,7 @@ def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info, The purpose of this method is to find if there are categories that are differently enriched in a pair of gene lists. - The appropriate test here is the Chi-Squared test. + The appropriate test here is the Chi-Squared test. The assumption is that the background set is the same in all gene lists. 
From 26afd1ee22263ac438069371e585a36cba0c6831 Mon Sep 17 00:00:00 2001 From: Kevin Rue-Albrecht Date: Wed, 14 Feb 2018 10:16:58 +0000 Subject: [PATCH 2/2] Filter for transcripts present in the translation table --- CGAT/GO.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CGAT/GO.py b/CGAT/GO.py index 3c7f656e..9f9d2edd 100644 --- a/CGAT/GO.py +++ b/CGAT/GO.py @@ -705,12 +705,14 @@ def GetGOStatement(go_type, database, species): statement = """SELECT DISTINCTROW gene.stable_id, xref.dbprimary_acc, go.name, 'NA' - FROM gene, transcript, + FROM gene, transcript, translation, object_xref as o, xref, %(go_database)s.term AS go, %(go_database)s.ontology AS ontology WHERE gene.gene_id = transcript.gene_id + AND transcript.transcript_id = translation.transcript_id AND transcript.transcript_id = o.ensembl_id + AND o.ensembl_object_type = 'Transcript' AND xref.xref_id = o.xref_id AND go.%(go_field)s = xref.dbprimary_acc AND go.ontology_id = ontology.ontology_id