From 32dfb29152e8f581398dc6893eecc92347a1f2a8 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 21 Aug 2024 11:21:31 +0200 Subject: [PATCH] Update proteins vignette --- vignettes/proteins.Rmd | 72 +++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/vignettes/proteins.Rmd b/vignettes/proteins.Rmd index de460fd..d578f55 100644 --- a/vignettes/proteins.Rmd +++ b/vignettes/proteins.Rmd @@ -16,7 +16,7 @@ vignette: > ```{r biocstyle, echo = FALSE, results = "asis", message = FALSE } library(BiocStyle) library(ensembldb) -BiocStyle::markdown() +BiocStyle::markdown() ``` @@ -55,44 +55,40 @@ databases created through the Ensembl Perl API contain protein annotation, while databases created using `ensDbFromAH`, `ensDbFromGff`, `ensDbFromGRanges` and `ensDbFromGtf` don't. -```{r doeval, echo = FALSE, results = "hide" } +```{r doeval, echo = FALSE, results = "hide"} ## Globally switch off execution of code chunks evalMe <- TRUE haveProt <- FALSE ## evalMe <- .Platform$OS.type == "unix" - ``` -```{r loadlib, message = FALSE, eval = evalMe } +```{r loadlib, message = FALSE, eval = evalMe} library(ensembldb) library(EnsDb.Hsapiens.v86) edb <- EnsDb.Hsapiens.v86 ## Evaluate whether we have protein annotation available hasProteinData(edb) - ``` -```{r restrict9, message = FALSE, echo = FALSE } +```{r restrict9, message = FALSE, echo = FALSE, eval = evalMe} ## silently subsetting to chromosome 11 -edb <- filter(edb, filter = ~ seq_name == "11") +edb <- filter(edb, filter = ~ seq_name == "11") ``` If protein annotation is available, the additional tables and columns are also listed by the `listTables` and `listColumns` methods. -```{r listCols, message = FALSE, eval = evalMe } +```{r listCols, message = FALSE, eval = evalMe} listTables(edb) - ``` In the following sections we show examples how to 1) fetch protein annotations as additional columns to gene/transcript annotations, 2) fetch protein annotation data and 3) map proteins to the genome. -```{r haveprot, echo = FALSE, results = "hide", eval = evalMe } +```{r haveprot, echo = FALSE, results = "hide", eval = evalMe} ## Use this to conditionally disable eval on following chunks haveProt <- hasProteinData(edb) & evalMe - ``` @@ -102,12 +98,11 @@ Protein annotations for (protein coding) transcripts can be retrieved by simply adding the desired annotation columns to the `columns` parameter of the e.g. `genes` or `transcripts` methods. -```{r a_transcripts, eval = haveProt } +```{r a_transcripts, eval = haveProt} ## Get also protein information for ZBTB16 transcripts txs <- transcripts(edb, filter = GeneNameFilter("ZBTB16"), columns = c("protein_id", "uniprot_id", "tx_biotype")) txs - ``` The gene ZBTB16 has protein coding and non-coding transcripts, thus, we get the @@ -115,11 +110,11 @@ protein ID for the coding- and `NA` for the non-coding transcripts. Note also th we have a transcript targeted for nonsense mediated mRNA-decay with a protein ID associated with it, but no Uniprot ID. -```{r a_transcripts_coding_noncoding, eval = haveProt } +```{r a_transcripts_coding_noncoding, eval = haveProt} ## Subset to transcripts with tx_biotype other than protein_coding. txs[txs$tx_biotype != "protein_coding", c("uniprot_id", "tx_biotype", "protein_id")] - + ``` While the mapping from a protein coding transcript to a Ensembl protein ID @@ -130,11 +125,11 @@ each Uniprot ID can be mapped to more than one `protein_id` (and hence fetching Uniprot related additional columns or even protein ID features, as in such cases a redundant list of transcripts is returned. -```{r a_transcripts_coding, eval = haveProt } +```{r a_transcripts_coding, eval = haveProt} ## List the protein IDs and uniprot IDs for the coding transcripts mcols(txs[txs$tx_biotype == "protein_coding", c("tx_id", "protein_id", "uniprot_id")]) - + ``` Some of the n:m mappings for Uniprot IDs can be resolved by restricting either @@ -144,7 +139,7 @@ certain type of mapping method. The corresponding filters are the `uniprot_mapping_type` columns of the `uniprot` database table). In the example below we restrict the result to Uniprot IDs with the mapping type *DIRECT*. -```{r a_transcripts_coding_up, eval = haveProt } +```{r a_transcripts_coding_up, eval = haveProt} ## List all uniprot mapping types in the database. listUniprotMappingTypes(edb) @@ -155,7 +150,7 @@ txs <- transcripts(edb, filter = list(GeneNameFilter("ZBTB16"), UniprotMappingTypeFilter("DIRECT")), columns = c("protein_id", "uniprot_id", "uniprot_db")) mcols(txs) - + ``` For this example the use of the `UniprotMappingTypeFilter` resolved the multiple @@ -174,14 +169,14 @@ protein data to filter the results. In the example below we fetch for example all genes from the database that have a certain protein domain in the protein encoded by any of its transcripts. -```{r a_genes_protdomid_filter, eval = haveProt } -## Get all genes encoded on chromosome 11 which protein contains +```{r a_genes_protdomid_filter, eval = haveProt} +## Get all genes encoded on chromosome 11 which protein contains ## a certain protein domain. gns <- genes(edb, filter = ~ prot_dom_id == "PS50097" & seq_name == "11") length(gns) sort(gns$gene_name) - + ``` So, in total we got 152 genes with that protein domain. In addition to the @@ -196,21 +191,21 @@ The `select`, `keys` and `mapIds` methods from the `AnnotationDbi` package can a used to query `EnsDb` objects for protein annotations. Supported columns and key types are returned by the `columns` and `keytypes` methods. -```{r a_2_annotationdbi, message = FALSE, eval = haveProt } +```{r a_2_annotationdbi, message = FALSE, eval = haveProt} ## Show all columns that are provided by the database columns(edb) ## Show all key types/filters that are supported keytypes(edb) - + ``` Below we fetch all Uniprot IDs annotated to the gene *ZBTB16*. -```{r a_2_select, message = FALSE, eval = haveProt } +```{r a_2_select, message = FALSE, eval = haveProt} select(edb, keys = "ZBTB16", keytype = "GENENAME", columns = "UNIPROTID") - + ``` This returns us all Uniprot IDs of all proteins encoded by the gene's @@ -219,11 +214,11 @@ annotated to a protein, does not have an Uniprot ID assigned (thus `NA` is returned by the above call). As we see below, this transcript is targeted for non sense mediated mRNA decay. -```{r a_2_select_nmd, message = FALSE, eval = haveProt } +```{r a_2_select_nmd, message = FALSE, eval = haveProt} ## Call select, this time providing a GeneNameFilter. select(edb, keys = GeneNameFilter("ZBTB16"), columns = c("TXBIOTYPE", "UNIPROTID", "PROTEINID")) - + ``` Note also that we passed this time a `GeneMameFilter` with the `keys` parameter. @@ -240,12 +235,12 @@ protein annotations becomes available. In the code chunk below we fetch all protein annotations for the gene *ZBTB16*. -```{r b_proteins, message = FALSE, eval = haveProt } +```{r b_proteins, message = FALSE, eval = haveProt} ## Get all proteins and return them as an AAStringSet prts <- proteins(edb, filter = GeneNameFilter("ZBTB16"), return.type = "AAStringSet") prts - + ``` Besides the amino acid sequence, the `prts` contains also additional annotations @@ -253,9 +248,9 @@ that can be accessed with the `mcols` method (metadata columns). All additional columns provided with the parameter `columns` are also added to the `mcols` `DataFrame`. -```{r b_proteins_mcols, message = FALSE, eval = haveProt } +```{r b_proteins_mcols, message = FALSE, eval = haveProt} mcols(prts) - + ``` Note that the `proteins` method will retrieve only gene/transcript annotations of @@ -266,26 +261,26 @@ previous section are not fetched. Querying in addition Uniprot identifiers or protein domain data will result at present in a redundant list of proteins as shown in the code block below. -```{r b_proteins_prot_doms, message = FALSE, eval = haveProt } +```{r b_proteins_prot_doms, message = FALSE, eval = haveProt} ## Get also protein domain annotations in addition to the protein annotations. pd <- proteins(edb, filter = GeneNameFilter("ZBTB16"), columns = c("tx_id", listColumns(edb, "protein_domain")), return.type = "AAStringSet") pd - + ``` The result contains one row/element for each protein domain in each of the proteins. The number of protein domains per protein and the `mcols` are shown below. -```{r b_proteins_prot_doms_2, message = FALSE, eval = haveProt } +```{r b_proteins_prot_doms_2, message = FALSE, eval = haveProt} ## The number of protein domains per protein: table(names(pd)) ## The mcols mcols(pd) - + ``` As we can see each protein can have several protein domains with the start and @@ -302,7 +297,6 @@ functions that allow to map between genomic, transcript and protein coordinates. # Session information -```{r sessionInfo } -sessionInfo() +```{r sessionInfo} +sessionInfo() ``` -