From d341b2a67fd06aeb9256aa8cd78332e282a3d838 Mon Sep 17 00:00:00 2001
From: Steve Sansom <stephen.sansom@kennedy.ox.ac.uk>
Date: Wed, 14 Feb 2018 00:16:45 +0000
Subject: [PATCH 1/2] Fix GO.py for dumping Ensembl annotations for versions >
 88.

For Ensembl versions after 88, the ID mappings were being lost in translation (that is, in the join through the translation table).

Joining translation.translation_id to object_xref.ensembl_id no longer works. Instead, joining transcript.transcript_id to object_xref.ensembl_id now appears to work as expected (returning slightly more GO IDs for successively newer annotation sets).
---
 CGAT/GO.py | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/CGAT/GO.py b/CGAT/GO.py
index f94cc4c1..3c7f656e 100644
--- a/CGAT/GO.py
+++ b/CGAT/GO.py
@@ -430,7 +430,7 @@ def MapGO2Slims(gene2go, go2slim, ontology=None):
     """filter gene2go lookup by a list of go_ids in go2slim.
 
     gene2go: map of genes to go terms
-    go2slim: map of go categories to goslim go categories   
+    go2slim: map of go categories to goslim go categories
 
     If ontology is given, missing descriptions of go entries
     are added from the ontology.
@@ -645,7 +645,7 @@ def GetGOStatement(go_type, database, species):
             go_field = "acc"
             statement = """SELECT DISTINCTROW
         g.stable_id, xref.dbprimary_acc, go.name, 'NA'
-        FROM gene, transcript, translation, 
+        FROM gene, transcript, translation,
         gene_stable_id as g, object_xref as o, xref,
         %(go_database)s.term AS go
         WHERE gene.gene_id = transcript.gene_id
@@ -664,7 +664,7 @@ def GetGOStatement(go_type, database, species):
 
             statement = """SELECT DISTINCTROW
         g.stable_id, xref.dbprimary_acc, go.name, 'NA'
-        FROM gene, transcript, translation, 
+        FROM gene, transcript, translation,
         gene_stable_id as g, object_xref as o, xref,
         %(go_database)s.term AS go,
         %(go_database)s.ontology AS ontology
@@ -674,19 +674,18 @@ def GetGOStatement(go_type, database, species):
         AND translation.translation_id = o.ensembl_id
         AND xref.xref_id = o.xref_id
         AND go.%(go_field)s = xref.dbprimary_acc
-        AND go.ontology_id = ontology.ontology_id 
+        AND go.ontology_id = ontology.ontology_id
         AND ontology.namespace = '%(go_type)s'
         AND xref.external_db_id = 1000
         """ % locals()
 
-        else:
-
+        elif version <= 88:
             go_database = "ensembl_ontology_%s" % version
             go_field = "accession"
 
             statement = """SELECT DISTINCTROW
         gene.stable_id, xref.dbprimary_acc, go.name, 'NA'
-        FROM gene, transcript, translation, 
+        FROM gene, transcript, translation,
         object_xref as o, xref,
         %(go_database)s.term AS go,
         %(go_database)s.ontology AS ontology
@@ -695,10 +694,30 @@ def GetGOStatement(go_type, database, species):
         AND translation.translation_id = o.ensembl_id
         AND xref.xref_id = o.xref_id
         AND go.%(go_field)s = xref.dbprimary_acc
-        AND go.ontology_id = ontology.ontology_id 
+        AND go.ontology_id = ontology.ontology_id
         AND ontology.namespace = '%(go_type)s'
         AND xref.external_db_id = 1000
         """ % locals()
+
+        else:
+            go_database = "ensembl_ontology_%s" % version
+            go_field = "accession"
+
+            statement = """SELECT DISTINCTROW
+        gene.stable_id, xref.dbprimary_acc, go.name, 'NA'
+        FROM gene, transcript,
+        object_xref as o, xref,
+        %(go_database)s.term AS go,
+        %(go_database)s.ontology AS ontology
+        WHERE gene.gene_id = transcript.gene_id
+        AND transcript.transcript_id = o.ensembl_id
+        AND xref.xref_id = o.xref_id
+        AND go.%(go_field)s = xref.dbprimary_acc
+        AND go.ontology_id = ontology.ontology_id
+        AND ontology.namespace = '%(go_type)s'
+        AND xref.external_db_id = 1000
+        """ % locals()
+
     else:
         raise "unknown ensmart version %s" % database
 
@@ -875,7 +894,7 @@ def countGOs(gene2gos):
 def ReadGeneLists(filename_genes, gene_pattern=None):
     """read gene lists from filename in matrix.
 
-    returns a tuple (list of all genes, dictionary of gene lists) 
+    returns a tuple (list of all genes, dictionary of gene lists)
     """
 
     if filename_genes == "-":
@@ -910,7 +929,7 @@ def ReadGeneLists(filename_genes, gene_pattern=None):
 
 
 def buildGO2Genes(gene2gos, ancestors=None):
-    '''invert the dictionary genes2go. 
+    '''invert the dictionary genes2go.
 
     If ancestors is given, add missing ancestral information.
     '''
@@ -1014,7 +1033,7 @@ def outputResults(outfile,
     '''output GO results to outfile.
 
     If foreground is given, output a list of gene identifiers in the
-    foreground. 
+    foreground.
 
     If gene2name is given, output a columns with gene
     names (instead of identifiers)
@@ -1313,7 +1332,7 @@ def computeFDRs(go_results,
 def getFileName(options, **kwargs):
     '''return a filename
 
-    Placeholders in filename are string-substituted with the 
+    Placeholders in filename are string-substituted with the
     dictionary in kwargs.
     '''
     if options.output_filename_pattern:
@@ -1438,7 +1457,7 @@ def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
     The purpose of this method is to find if there are categories that are differently enriched
     in a pair of gene lists.
 
-    The appropriate test here is the Chi-Squared test. 
+    The appropriate test here is the Chi-Squared test.
 
     The assumption is that the background set is the same in all gene lists.
 

From 26afd1ee22263ac438069371e585a36cba0c6831 Mon Sep 17 00:00:00 2001
From: Kevin Rue-Albrecht <kevinrue67@gmail.com>
Date: Wed, 14 Feb 2018 10:16:58 +0000
Subject: [PATCH 2/2] Filter for transcripts present in the translation table

---
 CGAT/GO.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CGAT/GO.py b/CGAT/GO.py
index 3c7f656e..9f9d2edd 100644
--- a/CGAT/GO.py
+++ b/CGAT/GO.py
@@ -705,12 +705,14 @@ def GetGOStatement(go_type, database, species):
 
             statement = """SELECT DISTINCTROW
         gene.stable_id, xref.dbprimary_acc, go.name, 'NA'
-        FROM gene, transcript,
+        FROM gene, transcript, translation,
         object_xref as o, xref,
         %(go_database)s.term AS go,
         %(go_database)s.ontology AS ontology
         WHERE gene.gene_id = transcript.gene_id
+        AND transcript.transcript_id = translation.transcript_id
         AND transcript.transcript_id = o.ensembl_id
+        AND o.ensembl_object_type = 'Transcript'
         AND xref.xref_id = o.xref_id
         AND go.%(go_field)s = xref.dbprimary_acc
         AND go.ontology_id = ontology.ontology_id