forked from VariantEffect/mavevis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f78f520
Showing
29 changed files
with
1,977 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
*.csv | ||
*.fasta |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
Package: mavevis | ||
Title: Visualization for MaveDB | ||
Version: 0.0.0.9000 | ||
Authors@R: person("Jochen", "Weile", email = "jochenweile@gmail.com", role = c("aut", "cre")) | ||
Description: Query data from MaveDB and visualize as genophenograms with added tracks for structure information. | ||
Depends: R (>= 3.2.3) | ||
License: GPL | ||
Encoding: UTF-8 | ||
LazyData: true | ||
Imports: rapimave, hgvsParseR, yogitools, httr, gdata, hash | ||
Suggests: testthat | ||
RoxygenNote: 6.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(calc.conservation) | ||
export(calc.strucfeats) | ||
export(dashboard) | ||
export(find.pdbs) | ||
export(genophenogram) | ||
export(getUniprotSeq) | ||
export(new.amasLite) | ||
export(new.structure) | ||
export(new.trackdrawer) | ||
export(query.pdb) | ||
export(run.dssp) | ||
export(run.sasa) | ||
export(subcomplex.combos) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
|
||
#' Calculate AMAS conservation | ||
#' | ||
#' Uses the AMAS method (Livingstone & Barton 1993) method to calculate position-wise | ||
#' conservation from a protein multiple sequence alignment. Creates an object of type | ||
#' amasLite, which features a single method: \code{run(alignmentFile)}. | ||
#' | ||
#' The \code{run(alignmentFile)} method takes the name of a file as its parameter. | ||
#' The file must be a FASTA formatted multiple sequence alignment. The method returns | ||
#' a numerical vector listing the position-wise conservation with respect to the first | ||
#' entry in the alignment. | ||
#' | ||
#' @return an object of type amasLite | ||
#' @export | ||
new.amasLite <- function() { | ||
|
||
# .props <- read.csv(amasFile) | ||
# colnames(.props)[[ncol(.props)]] <- "-" | ||
|
||
data("amasProps") | ||
|
||
toChars <- function(s) sapply(1:nchar(s),function(i)substr(s,i,i)) | ||
|
||
readFASTA <- function(fastaFile) { | ||
|
||
con <- file(fastaFile,open="r") | ||
|
||
currSeq <- character(0) | ||
|
||
out <- list() | ||
outnames <- character(0) | ||
|
||
while (length(line <- readLines(con,1))>0) { | ||
if (substr(line,1,1)==">") { | ||
#store name | ||
outnames[length(outnames)+1] <- line | ||
#write old sequence | ||
if (length(currSeq) > 0) { | ||
out[[length(out)+1]] <- currSeq | ||
} | ||
#prep for next sequence | ||
currSeq <- character(0) | ||
} else { | ||
#append to current sequence | ||
currSeq <- c(currSeq,toChars(line)) | ||
} | ||
} | ||
#write the last sequence | ||
out[[length(out)+1]] <- currSeq | ||
close(con) | ||
|
||
out <- do.call(rbind,out) | ||
rownames(out) <- outnames | ||
out | ||
} | ||
|
||
run <- function(alignmentFile) { | ||
|
||
alignment <- readFASTA(alignmentFile) | ||
scores <- apply(alignment,2,function(x) { | ||
aas <- unique(x) | ||
sum(apply(amasProps[,aas,drop=FALSE],1,function(p) { | ||
all(p) | !any(p) | ||
})) | ||
}) | ||
|
||
human.pos <- which(alignment[1,] != "-") | ||
scores[human.pos] | ||
} | ||
|
||
|
||
list(run=run) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
options(stringsAsFactors=FALSE) | ||
|
||
library("httr") | ||
suppressMessages(library("gdata")) | ||
# source("lib/libpdb.R") | ||
# source("lib/cliargs.R") | ||
# source("lib/trackdrawer.R") | ||
# source("lib/amasLite.R") | ||
|
||
|
||
|
||
#' Query PDB | ||
#' | ||
#' Queries the Protein Data Bank (PDB) for protein structure data for | ||
#' a given database accession and writes the result to file | ||
#' | ||
#' @param pdb.acc the PDB accession to query | ||
#' @param pdb.file the name of the file to which the output will be written. | ||
#' @return the name of the output file. | ||
#' @examples | ||
#' \dontrun{ | ||
#' #with a pre-determined output file | ||
#' outfile <- "sumo_conjugase_structure.pdb" | ||
#' query.pdb("3UIP",outfile) | ||
#' | ||
#' #with an autogenerated output file | ||
#' outfile <- query.pdb("3UIP") | ||
#' } | ||
#' @export | ||
query.pdb <- function(pdb.acc,pdb.file=paste0(pdb.acc,".pdb")) { | ||
pdb.url <- paste0("https://files.rcsb.org/download/",pdb.acc,".pdb") | ||
htr <- GET(pdb.url) | ||
if (http_status(htr)$category == "Success") { | ||
writeBin(content(htr, "raw"), pdb.file) | ||
return(pdb.file) | ||
} else { | ||
# logger$fatal("Unable to download PDB file") | ||
stop("Unable to download PDB file") | ||
} | ||
} | ||
|
||
#' Run FreeSASA on a PDB structure | ||
#' | ||
#' Calculates solvent accessible surface area on a given structure using the | ||
#' external software FreeSASA, which must be installed and available via $PATH. | ||
#' | ||
#' @param pdb.file The PDB file on which to run FreeSASA | ||
#' @return a \code{data.frame} with the output of FreeSASA | ||
#' @examples | ||
#' \dontrun{ | ||
#' sasa <- run.sasa("3UIP.pdb") | ||
#' } | ||
#' @export | ||
run.sasa <- function(pdb.file) { | ||
widths <- c( | ||
type=3,res=4,chain=2,pos=4,all.abs=9,all.rel=6,side.abs=7,side.rel=6, | ||
main.abs=7,main.rel=6,nonpolar.abs=7,nonpolar.rel=6,polar.abs=7,polar.rel=6 | ||
) | ||
lines <- system(paste("freesasa --format=rsa",pdb.file),intern=TRUE) | ||
lines <- lines[regexpr("^RES",lines) > 0] | ||
con <- textConnection(lines) | ||
sasa.out <- read.fwf(con,widths,strip.white=TRUE) | ||
close(con) | ||
colnames(sasa.out) <- names(widths) | ||
for (i in 4:14) sasa.out[,i] <- as.numeric(sasa.out[,i]) | ||
return(sasa.out) | ||
} | ||
|
||
#' Run DSSP on a PDB structure | ||
#' | ||
#' Calculates secondary structure information from a PDB file using the | ||
#' external software DSSP, which must be installed and available via $PATH | ||
#' | ||
#' @param pdb.file The PDB file on which to run DSSP | ||
#' @return a \code{data.frame} with the output of DSSP | ||
#' @examples | ||
#' \dontrun{ | ||
#' dssp.out <- run.dssp(pdb.file) | ||
#' } | ||
#' @export | ||
run.dssp <- function(pdb.file) { | ||
cols <- c(num=5,residue=5,chain=2,aa=3,struc=3,struc.flags=7, | ||
bp1=5,bp2=4,acc=4,no1=12,on1=11,no2=11,on2=11,tco=8, | ||
kappa=6,alpha=6,phi=6,psi=6,x.ca=7,y.ca=7,z.ca=7) | ||
lines <- system(paste("dssp",pdb.file),intern=TRUE) | ||
lines <- lines[regexpr("^\\s+\\d+\\s+\\d+ \\w{1} \\w{1}",lines) > 0] | ||
con <- textConnection(lines) | ||
dssp.out <- read.fwf( | ||
con, | ||
widths=cols, | ||
stringsAsFactors=FALSE, | ||
strip.white=TRUE | ||
) | ||
close(con) | ||
colnames(dssp.out) <- names(cols) | ||
substr(dssp.out$structure,1,1) | ||
return(dssp.out) | ||
} | ||
|
||
|
||
#' Generate PDB files for complex components | ||
#' | ||
#' This function finds the component chains of a given PDB structure and generates | ||
#' new PDB files for any desired set of combinations of these components | ||
#' | ||
#' @param pdb.file the input PDB file for the complex | ||
#' @param chain.sets a list of character vectors, detailing the combinations of chains to | ||
#' produce. E.g. \code{list("A",c("A","B"),c("A","C"))} produces one file for | ||
#' chain A, one for the combination of chains A and B, and one for the combination | ||
#' of chains A and C. | ||
#' @return a vector of file names corresponding to the elements of \code{chain.sets} | ||
#' @examples | ||
#' \dontrun{ | ||
#' subcplx.files <- sub.complex.combos("3UIP.pdb",chain.sets=list("A",c("A","B"),c("A","C"))) | ||
#' } | ||
#' @export | ||
subcomplex.combos <- function(pdb.file,chain.sets) { | ||
lines <- scan(pdb.file,what="character",sep="\n") | ||
atom.lines <- which(grepl("^ATOM|^TER",lines)) | ||
model.lines <- which(grepl("^MODEL",lines)) | ||
if (length(model.lines) > 0) { | ||
endmdl.lines <- which(grepl("^ENDMDL",lines)) | ||
header.lines <- 1:model.lines[[1]] | ||
footer.lines <- tail(endmdl.lines,1):length(lines) | ||
applicable.atom.lines <- atom.lines[atom.lines > model.lines[[1]] & atom.lines < endmdl.lines[[1]]] | ||
} else { | ||
header.lines <- 1:(atom.lines[[1]]-1) | ||
footer.lines <- (tail(atom.lines,1)+1):length(lines) | ||
applicable.atom.lines <- atom.lines | ||
} | ||
applicable.chains <- substr(lines[applicable.atom.lines],22,22) | ||
subfiles <- sapply(chain.sets,function(cset) { | ||
outfile <- tempfile(fileext=".pdb") | ||
con <- file(outfile,open="w") | ||
writeLines(lines[header.lines],con) | ||
writeLines(lines[applicable.atom.lines[applicable.chains %in% cset]],con) | ||
writeLines(lines[footer.lines],con) | ||
close(con) | ||
return(outfile) | ||
}) | ||
return(subfiles) | ||
} | ||
|
||
|
||
#' Calculate structural features | ||
#' | ||
#' Calculates structural features of a protein from a given PDB entry | ||
#' | ||
#' @param pdb.acc The PDB accession to use, e.g. "3UIP" | ||
#' @param main.chain The chain identifier in the PDB file that corresponds | ||
#' to the protein of interest. Should be a single uppercase letter, e.g. "A". | ||
#' @return a \code{data.frame} detailing secondary structure, solvent accessibility, | ||
#' and burial in interfaces. | ||
#' @examples | ||
#' \dontrun{ | ||
#' sfeats <- calc.strucfeats("3UIP","A") | ||
#' } | ||
#' @export | ||
calc.strucfeats <- function(pdb.acc,main.chain) { | ||
|
||
set_config(config(ssl_verifypeer = 0L)) | ||
|
||
cat("Querying PDB...\n") | ||
#set up cache file for structure | ||
pdb.file <- paste0(pdb.acc,".pdb") | ||
# pdb.file <- tempfile(fileext=".pdb") | ||
|
||
if (!file.exists(pdb.file)) { | ||
query.pdb(pdb.acc,pdb.file) | ||
} | ||
cat("Parsing PDB file...\n") | ||
|
||
cplx.struc <- new.structure(pdb.file) | ||
chains <- cplx.struc$get.chains() | ||
|
||
if (!(main.chain %in% chains)) { | ||
stop("The requested chain ",main.chain," doesn't exist in ",pdb.acc) | ||
} | ||
|
||
chain.info <- cplx.struc$get.chain.info() | ||
# rownames(chain.info) <- chain.info$chain | ||
|
||
chain.names <- sapply(chains,function(ch) with(chain.info,as.character(name[db=="UNP" & chain == ch]))) | ||
chain.names <- sub("_HUMAN","",chain.names) | ||
|
||
|
||
if (length(chains) > 1) { | ||
cat("Splitting protein complex...\n") | ||
|
||
#list of chain pairs | ||
other.chains <- setdiff(chains,main.chain) | ||
chain.sets <- c(main.chain,lapply(other.chains,c,main.chain)) | ||
#Subdivide into chain pairs | ||
# subfiles <- cplx.struc$subcomplex.combos(main.chain) | ||
subfiles <- subcomplex.combos(pdb.file,chain.sets) | ||
|
||
cat("Calculating surface area...\n") | ||
acctables <- mapply(function(cset,subfile) { | ||
#re-order chain ids to original file order | ||
cset <- intersect(chains,cset) | ||
acctable <- run.sasa(subfile) | ||
acctable[acctable$chain==main.chain,] | ||
},cset=chain.sets,subfile=subfiles,SIMPLIFY=FALSE) | ||
|
||
cat("Calculating interface areas...\n") | ||
#calculate burial based on accessibities | ||
burial <- cbind(acctables[[1]],do.call(cbind,lapply(2:length(acctables),function(i) { | ||
before <- acctables[[1]][,"all.abs"] | ||
data.frame( | ||
abs.burial=before-acctables[[i]][,"all.abs"], | ||
rel.burial=(before-acctables[[i]][,"all.abs"])/(before+.000001) | ||
) | ||
}))) | ||
colnames(burial)[seq(15,ncol(burial),2)] <- paste0("abs.burial.",chain.names[other.chains]) | ||
colnames(burial)[seq(16,ncol(burial),2)] <- paste0("rel.burial.",chain.names[other.chains]) | ||
|
||
} else { | ||
acctable <- run.sasa(pdb.file) | ||
burial <- acctable | ||
} | ||
|
||
cat("Calculating secondary structures...\n") | ||
#calculate secondary structure | ||
dssp.out <- run.dssp(pdb.file) | ||
secstruc <- dssp.out[dssp.out$chain==main.chain,c("residue","struc")] | ||
|
||
|
||
cat("Compiling results...\n") | ||
rownames(burial) <- burial$pos | ||
rownames(secstruc) <- secstruc$residue | ||
|
||
allpos <- 1:max(burial$pos) | ||
burial.all <- cbind(burial[as.character(allpos),],secstruc=secstruc[as.character(allpos),"struc"]) | ||
rownames(burial.all) <- burial.all$pos <- allpos | ||
|
||
# cat("Deleting temporary files...\n") | ||
# #clean up temp files | ||
# file.remove(pdb.file) | ||
# file.remove(subfiles) | ||
|
||
return(burial.all) | ||
} |
Oops, something went wrong.