From 32dfb29152e8f581398dc6893eecc92347a1f2a8 Mon Sep 17 00:00:00 2001
From: Johannes Rainer <johannes.rainer@gmail.com>
Date: Wed, 21 Aug 2024 11:21:31 +0200
Subject: [PATCH] Update proteins vignette

---
 vignettes/proteins.Rmd | 72 +++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 39 deletions(-)

diff --git a/vignettes/proteins.Rmd b/vignettes/proteins.Rmd
index de460fd..d578f55 100644
--- a/vignettes/proteins.Rmd
+++ b/vignettes/proteins.Rmd
@@ -16,7 +16,7 @@ vignette: >
 ```{r  biocstyle, echo = FALSE, results = "asis", message = FALSE }
 library(BiocStyle)
 library(ensembldb)
-BiocStyle::markdown() 
+BiocStyle::markdown()
 ```
 
 
@@ -55,44 +55,40 @@ databases created through the Ensembl Perl API contain protein annotation, while
 databases created using `ensDbFromAH`, `ensDbFromGff`, `ensDbFromGRanges` and
 `ensDbFromGtf` don't.
 
-```{r  doeval, echo = FALSE, results = "hide" }
+```{r doeval, echo = FALSE, results = "hide"}
 ## Globally switch off execution of code chunks
 evalMe <- TRUE
 haveProt <- FALSE
 ## evalMe <- .Platform$OS.type == "unix"
- 
 ```
 
-```{r  loadlib, message = FALSE, eval = evalMe }
+```{r loadlib, message = FALSE, eval = evalMe}
 library(ensembldb)
 library(EnsDb.Hsapiens.v86)
 edb <- EnsDb.Hsapiens.v86
 ## Evaluate whether we have protein annotation available
 hasProteinData(edb)
- 
 ```
 
-```{r  restrict9, message = FALSE, echo = FALSE }
+```{r restrict9, message = FALSE, echo = FALSE, eval = evalMe}
 ## silently subsetting to chromosome 11
-edb <- filter(edb, filter = ~ seq_name == "11") 
+edb <- filter(edb, filter = ~ seq_name == "11")
 ```
 
 If protein annotation is available, the additional tables and columns are also
 listed by the `listTables` and `listColumns` methods.
 
-```{r  listCols, message = FALSE, eval = evalMe }
+```{r listCols, message = FALSE, eval = evalMe}
 listTables(edb)
- 
 ```
 
 In the following sections we show examples how to 1) fetch protein annotations
 as additional columns to gene/transcript annotations, 2) fetch protein
 annotation data and 3) map proteins to the genome.
 
-```{r  haveprot, echo = FALSE, results = "hide", eval = evalMe }
+```{r haveprot, echo = FALSE, results = "hide", eval = evalMe}
 ## Use this to conditionally disable eval on following chunks
 haveProt <- hasProteinData(edb) & evalMe
- 
 ```
 
 
@@ -102,12 +98,11 @@ Protein annotations for (protein coding) transcripts can be retrieved by simply
 adding the desired annotation columns to the `columns` parameter of the e.g. `genes`
 or `transcripts` methods.
 
-```{r  a_transcripts, eval = haveProt }
+```{r a_transcripts, eval = haveProt}
 ## Get also protein information for ZBTB16 transcripts
 txs <- transcripts(edb, filter = GeneNameFilter("ZBTB16"),
 		   columns = c("protein_id", "uniprot_id", "tx_biotype"))
 txs
- 
 ```
 
 The gene ZBTB16 has protein coding and non-coding transcripts, thus, we get the
@@ -115,11 +110,11 @@ protein ID for the coding- and `NA` for the non-coding transcripts. Note also th
 we have a transcript targeted for nonsense mediated mRNA-decay with a protein ID
 associated with it, but no Uniprot ID.
 
-```{r  a_transcripts_coding_noncoding, eval = haveProt }
+```{r a_transcripts_coding_noncoding, eval = haveProt}
 ## Subset to transcripts with tx_biotype other than protein_coding.
 txs[txs$tx_biotype != "protein_coding", c("uniprot_id", "tx_biotype",
 					  "protein_id")]
- 
+
 ```
 
 While the mapping from a protein coding transcript to a Ensembl protein ID
@@ -130,11 +125,11 @@ each Uniprot ID can be mapped to more than one `protein_id` (and hence
 fetching Uniprot related additional columns or even protein ID features, as in
 such cases a redundant list of transcripts is returned.
 
-```{r  a_transcripts_coding, eval = haveProt }
+```{r a_transcripts_coding, eval = haveProt}
 ## List the protein IDs and uniprot IDs for the coding transcripts
 mcols(txs[txs$tx_biotype == "protein_coding",
 	  c("tx_id", "protein_id", "uniprot_id")])
- 
+
 ```
 
 Some of the n:m mappings for Uniprot IDs can be resolved by restricting either
@@ -144,7 +139,7 @@ certain type of mapping method. The corresponding filters are the
 `uniprot_mapping_type` columns of the `uniprot` database table). In the example
 below we restrict the result to Uniprot IDs with the mapping type *DIRECT*.
 
-```{r  a_transcripts_coding_up, eval = haveProt }
+```{r a_transcripts_coding_up, eval = haveProt}
 ## List all uniprot mapping types in the database.
 listUniprotMappingTypes(edb)
 
@@ -155,7 +150,7 @@ txs <- transcripts(edb, filter = list(GeneNameFilter("ZBTB16"),
 				      UniprotMappingTypeFilter("DIRECT")),
 		   columns = c("protein_id", "uniprot_id", "uniprot_db"))
 mcols(txs)
- 
+
 ```
 
 For this example the use of the `UniprotMappingTypeFilter` resolved the multiple
@@ -174,14 +169,14 @@ protein data to filter the results. In the example below we fetch for example
 all genes from the database that have a certain protein domain in the protein
 encoded by any of its transcripts.
 
-```{r  a_genes_protdomid_filter, eval = haveProt }
-## Get all genes encoded on chromosome 11 which protein contains 
+```{r a_genes_protdomid_filter, eval = haveProt}
+## Get all genes encoded on chromosome 11 which protein contains
 ## a certain protein domain.
 gns <- genes(edb, filter = ~ prot_dom_id == "PS50097" & seq_name == "11")
 length(gns)
 
 sort(gns$gene_name)
- 
+
 ```
 
 So, in total we got 152 genes with that protein domain. In addition to the
@@ -196,21 +191,21 @@ The `select`, `keys` and `mapIds` methods from the `AnnotationDbi` package can a
 used to query `EnsDb` objects for protein annotations. Supported columns and
 key types are returned by the `columns` and `keytypes` methods.
 
-```{r  a_2_annotationdbi, message = FALSE, eval = haveProt }
+```{r a_2_annotationdbi, message = FALSE, eval = haveProt}
 ## Show all columns that are provided by the database
 columns(edb)
 
 ## Show all key types/filters that are supported
 keytypes(edb)
- 
+
 ```
 
 Below we fetch all Uniprot IDs annotated to the gene *ZBTB16*.
 
-```{r  a_2_select, message = FALSE, eval = haveProt }
+```{r a_2_select, message = FALSE, eval = haveProt}
 select(edb, keys = "ZBTB16", keytype = "GENENAME",
        columns = "UNIPROTID")
- 
+
 ```
 
 This returns us all Uniprot IDs of all proteins encoded by the gene's
@@ -219,11 +214,11 @@ annotated to a protein, does not have an Uniprot ID assigned (thus `NA` is
 returned by the above call). As we see below, this transcript is targeted for
 non sense mediated mRNA decay.
 
-```{r  a_2_select_nmd, message = FALSE, eval = haveProt }
+```{r a_2_select_nmd, message = FALSE, eval = haveProt}
 ## Call select, this time providing a GeneNameFilter.
 select(edb, keys = GeneNameFilter("ZBTB16"),
        columns = c("TXBIOTYPE", "UNIPROTID", "PROTEINID"))
- 
+
 ```
 
 Note also that we passed this time a `GeneMameFilter` with the `keys` parameter.
@@ -240,12 +235,12 @@ protein annotations becomes available.
 
 In the code chunk below we fetch all protein annotations for the gene *ZBTB16*.
 
-```{r  b_proteins, message = FALSE, eval = haveProt }
+```{r b_proteins, message = FALSE, eval = haveProt}
 ## Get all proteins and return them as an AAStringSet
 prts <- proteins(edb, filter = GeneNameFilter("ZBTB16"),
 		 return.type = "AAStringSet")
 prts
- 
+
 ```
 
 Besides the amino acid sequence, the `prts` contains also additional annotations
@@ -253,9 +248,9 @@ that can be accessed with the `mcols` method (metadata columns). All additional
 columns provided with the parameter `columns` are also added to the `mcols`
 `DataFrame`.
 
-```{r  b_proteins_mcols, message = FALSE, eval = haveProt }
+```{r b_proteins_mcols, message = FALSE, eval = haveProt}
 mcols(prts)
- 
+
 ```
 
 Note that the `proteins` method will retrieve only gene/transcript annotations of
@@ -266,26 +261,26 @@ previous section are not fetched.
 Querying in addition Uniprot identifiers or protein domain data will result at
 present in a redundant list of proteins as shown in the code block below.
 
-```{r  b_proteins_prot_doms, message = FALSE, eval = haveProt }
+```{r b_proteins_prot_doms, message = FALSE, eval = haveProt}
 ## Get also protein domain annotations in addition to the protein annotations.
 pd <- proteins(edb, filter = GeneNameFilter("ZBTB16"),
 	       columns = c("tx_id", listColumns(edb, "protein_domain")),
 	       return.type = "AAStringSet")
 pd
- 
+
 ```
 
 The result contains one row/element for each protein domain in each of the
 proteins. The number of protein domains per protein and the `mcols` are shown
 below.
 
-```{r  b_proteins_prot_doms_2, message = FALSE, eval = haveProt }
+```{r b_proteins_prot_doms_2, message = FALSE, eval = haveProt}
 ## The number of protein domains per protein:
 table(names(pd))
 
 ## The mcols
 mcols(pd)
- 
+
 ```
 
 As we can see each protein can have several protein domains with the start and
@@ -302,7 +297,6 @@ functions that allow to map between genomic, transcript and protein coordinates.
 
 # Session information
 
-```{r  sessionInfo }
-sessionInfo() 
+```{r sessionInfo}
+sessionInfo()
 ```
-