Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jweile committed Feb 27, 2018
0 parents commit f78f520
Show file tree
Hide file tree
Showing 29 changed files with 1,977 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.csv
*.fasta
12 changes: 12 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Package: mavevis
Title: Visualization for MaveDB
Version: 0.0.0.9000
Authors@R: person("Jochen", "Weile", email = "jochenweile@gmail.com", role = c("aut", "cre"))
Description: Query data from MaveDB and visualize as genophenograms with added tracks for structure information.
Depends: R (>= 3.2.3)
License: GPL
Encoding: UTF-8
LazyData: true
Imports: rapimave, hgvsParseR, yogitools, httr, gdata, hash
Suggests: testthat
RoxygenNote: 6.0.1
15 changes: 15 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by roxygen2: do not edit by hand

export(calc.conservation)
export(calc.strucfeats)
export(dashboard)
export(find.pdbs)
export(genophenogram)
export(getUniprotSeq)
export(new.amasLite)
export(new.structure)
export(new.trackdrawer)
export(query.pdb)
export(run.dssp)
export(run.sasa)
export(subcomplex.combos)
73 changes: 73 additions & 0 deletions R/amasLite.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@

#' Calculate AMAS conservation
#'
#' Uses the AMAS method (Livingstone & Barton 1993) method to calculate position-wise
#' conservation from a protein multiple sequence alignment. Creates an object of type
#' amasLite, which features a single method: \code{run(alignmentFile)}.
#'
#' The \code{run(alignmentFile)} method takes the name of a file as its parameter.
#' The file must be a FASTA formatted multiple sequence alignment. The method returns
#' a numerical vector listing the position-wise conservation with respect to the first
#' entry in the alignment.
#'
#' @return an object of type amasLite
#' @export
new.amasLite <- function() {

# .props <- read.csv(amasFile)
# colnames(.props)[[ncol(.props)]] <- "-"

data("amasProps")

toChars <- function(s) sapply(1:nchar(s),function(i)substr(s,i,i))

readFASTA <- function(fastaFile) {

con <- file(fastaFile,open="r")

currSeq <- character(0)

out <- list()
outnames <- character(0)

while (length(line <- readLines(con,1))>0) {
if (substr(line,1,1)==">") {
#store name
outnames[length(outnames)+1] <- line
#write old sequence
if (length(currSeq) > 0) {
out[[length(out)+1]] <- currSeq
}
#prep for next sequence
currSeq <- character(0)
} else {
#append to current sequence
currSeq <- c(currSeq,toChars(line))
}
}
#write the last sequence
out[[length(out)+1]] <- currSeq
close(con)

out <- do.call(rbind,out)
rownames(out) <- outnames
out
}

run <- function(alignmentFile) {

alignment <- readFASTA(alignmentFile)
scores <- apply(alignment,2,function(x) {
aas <- unique(x)
sum(apply(amasProps[,aas,drop=FALSE],1,function(p) {
all(p) | !any(p)
}))
})

human.pos <- which(alignment[1,] != "-")
scores[human.pos]
}


list(run=run)
}
242 changes: 242 additions & 0 deletions R/calcStrucFeats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
options(stringsAsFactors=FALSE)

library("httr")
suppressMessages(library("gdata"))
# source("lib/libpdb.R")
# source("lib/cliargs.R")
# source("lib/trackdrawer.R")
# source("lib/amasLite.R")



#' Query PDB
#'
#' Queries the Protein Data Bank (PDB) for protein structure data for
#' a given database accession and writes the result to file
#'
#' @param pdb.acc the PDB accession to query
#' @param pdb.file the name of the file to which the output will be written.
#' @return the name of the output file.
#' @examples
#' \dontrun{
#' #with a pre-determined output file
#' outfile <- "sumo_conjugase_structure.pdb"
#' query.pdb("3UIP",outfile)
#'
#' #with an autogenerated output file
#' outfile <- query.pdb("3UIP")
#' }
#' @export
query.pdb <- function(pdb.acc,pdb.file=paste0(pdb.acc,".pdb")) {
pdb.url <- paste0("https://files.rcsb.org/download/",pdb.acc,".pdb")
htr <- GET(pdb.url)
if (http_status(htr)$category == "Success") {
writeBin(content(htr, "raw"), pdb.file)
return(pdb.file)
} else {
# logger$fatal("Unable to download PDB file")
stop("Unable to download PDB file")
}
}

#' Run FreeSASA on a PDB structure
#'
#' Calculates solvent accessible surface area on a given structure using the
#' external software FreeSASA, which must be installed and available via $PATH.
#'
#' @param pdb.file The PDB file on which to run FreeSASA
#' @return a \code{data.frame} with the output of FreeSASA
#' @examples
#' \dontrun{
#' sasa <- run.sasa("3UIP.pdb")
#' }
#' @export
run.sasa <- function(pdb.file) {
widths <- c(
type=3,res=4,chain=2,pos=4,all.abs=9,all.rel=6,side.abs=7,side.rel=6,
main.abs=7,main.rel=6,nonpolar.abs=7,nonpolar.rel=6,polar.abs=7,polar.rel=6
)
lines <- system(paste("freesasa --format=rsa",pdb.file),intern=TRUE)
lines <- lines[regexpr("^RES",lines) > 0]
con <- textConnection(lines)
sasa.out <- read.fwf(con,widths,strip.white=TRUE)
close(con)
colnames(sasa.out) <- names(widths)
for (i in 4:14) sasa.out[,i] <- as.numeric(sasa.out[,i])
return(sasa.out)
}

#' Run DSSP on a PDB structure
#'
#' Calculates secondary structure information from a PDB file using the
#' external software DSSP, which must be installed and available via $PATH
#'
#' @param pdb.file The PDB file on which to run DSSP
#' @return a \code{data.frame} with the output of DSSP
#' @examples
#' \dontrun{
#' dssp.out <- run.dssp(pdb.file)
#' }
#' @export
run.dssp <- function(pdb.file) {
cols <- c(num=5,residue=5,chain=2,aa=3,struc=3,struc.flags=7,
bp1=5,bp2=4,acc=4,no1=12,on1=11,no2=11,on2=11,tco=8,
kappa=6,alpha=6,phi=6,psi=6,x.ca=7,y.ca=7,z.ca=7)
lines <- system(paste("dssp",pdb.file),intern=TRUE)
lines <- lines[regexpr("^\\s+\\d+\\s+\\d+ \\w{1} \\w{1}",lines) > 0]
con <- textConnection(lines)
dssp.out <- read.fwf(
con,
widths=cols,
stringsAsFactors=FALSE,
strip.white=TRUE
)
close(con)
colnames(dssp.out) <- names(cols)
substr(dssp.out$structure,1,1)
return(dssp.out)
}


#' Generate PDB files for complex components
#'
#' This function finds the component chains of a given PDB structure and generates
#' new PDB files for any desired set of combinations of these components
#'
#' @param pdb.file the input PDB file for the complex
#' @param chain.sets a list of character vectors, detailing the combinations of chains to
#' produce. E.g. \code{list("A",c("A","B"),c("A","C"))} produces one file for
#' chain A, one for the combination of chains A and B, and one for the combination
#' of chains A and C.
#' @return a vector of file names corresponding to the elements of \code{chain.sets}
#' @examples
#' \dontrun{
#' subcplx.files <- sub.complex.combos("3UIP.pdb",chain.sets=list("A",c("A","B"),c("A","C")))
#' }
#' @export
subcomplex.combos <- function(pdb.file,chain.sets) {
lines <- scan(pdb.file,what="character",sep="\n")
atom.lines <- which(grepl("^ATOM|^TER",lines))
model.lines <- which(grepl("^MODEL",lines))
if (length(model.lines) > 0) {
endmdl.lines <- which(grepl("^ENDMDL",lines))
header.lines <- 1:model.lines[[1]]
footer.lines <- tail(endmdl.lines,1):length(lines)
applicable.atom.lines <- atom.lines[atom.lines > model.lines[[1]] & atom.lines < endmdl.lines[[1]]]
} else {
header.lines <- 1:(atom.lines[[1]]-1)
footer.lines <- (tail(atom.lines,1)+1):length(lines)
applicable.atom.lines <- atom.lines
}
applicable.chains <- substr(lines[applicable.atom.lines],22,22)
subfiles <- sapply(chain.sets,function(cset) {
outfile <- tempfile(fileext=".pdb")
con <- file(outfile,open="w")
writeLines(lines[header.lines],con)
writeLines(lines[applicable.atom.lines[applicable.chains %in% cset]],con)
writeLines(lines[footer.lines],con)
close(con)
return(outfile)
})
return(subfiles)
}


#' Calculate structural features
#'
#' Calculates structural features of a protein from a given PDB entry
#'
#' @param pdb.acc The PDB accession to use, e.g. "3UIP"
#' @param main.chain The chain identifier in the PDB file that corresponds
#' to the protein of interest. Should be a single uppercase letter, e.g. "A".
#' @return a \code{data.frame} detailing secondary structure, solvent accessibility,
#' and burial in interfaces.
#' @examples
#' \dontrun{
#' sfeats <- calc.strucfeats("3UIP","A")
#' }
#' @export
calc.strucfeats <- function(pdb.acc,main.chain) {

set_config(config(ssl_verifypeer = 0L))

cat("Querying PDB...\n")
#set up cache file for structure
pdb.file <- paste0(pdb.acc,".pdb")
# pdb.file <- tempfile(fileext=".pdb")

if (!file.exists(pdb.file)) {
query.pdb(pdb.acc,pdb.file)
}
cat("Parsing PDB file...\n")

cplx.struc <- new.structure(pdb.file)
chains <- cplx.struc$get.chains()

if (!(main.chain %in% chains)) {
stop("The requested chain ",main.chain," doesn't exist in ",pdb.acc)
}

chain.info <- cplx.struc$get.chain.info()
# rownames(chain.info) <- chain.info$chain

chain.names <- sapply(chains,function(ch) with(chain.info,as.character(name[db=="UNP" & chain == ch])))
chain.names <- sub("_HUMAN","",chain.names)


if (length(chains) > 1) {
cat("Splitting protein complex...\n")

#list of chain pairs
other.chains <- setdiff(chains,main.chain)
chain.sets <- c(main.chain,lapply(other.chains,c,main.chain))
#Subdivide into chain pairs
# subfiles <- cplx.struc$subcomplex.combos(main.chain)
subfiles <- subcomplex.combos(pdb.file,chain.sets)

cat("Calculating surface area...\n")
acctables <- mapply(function(cset,subfile) {
#re-order chain ids to original file order
cset <- intersect(chains,cset)
acctable <- run.sasa(subfile)
acctable[acctable$chain==main.chain,]
},cset=chain.sets,subfile=subfiles,SIMPLIFY=FALSE)

cat("Calculating interface areas...\n")
#calculate burial based on accessibities
burial <- cbind(acctables[[1]],do.call(cbind,lapply(2:length(acctables),function(i) {
before <- acctables[[1]][,"all.abs"]
data.frame(
abs.burial=before-acctables[[i]][,"all.abs"],
rel.burial=(before-acctables[[i]][,"all.abs"])/(before+.000001)
)
})))
colnames(burial)[seq(15,ncol(burial),2)] <- paste0("abs.burial.",chain.names[other.chains])
colnames(burial)[seq(16,ncol(burial),2)] <- paste0("rel.burial.",chain.names[other.chains])

} else {
acctable <- run.sasa(pdb.file)
burial <- acctable
}

cat("Calculating secondary structures...\n")
#calculate secondary structure
dssp.out <- run.dssp(pdb.file)
secstruc <- dssp.out[dssp.out$chain==main.chain,c("residue","struc")]


cat("Compiling results...\n")
rownames(burial) <- burial$pos
rownames(secstruc) <- secstruc$residue

allpos <- 1:max(burial$pos)
burial.all <- cbind(burial[as.character(allpos),],secstruc=secstruc[as.character(allpos),"struc"])
rownames(burial.all) <- burial.all$pos <- allpos

# cat("Deleting temporary files...\n")
# #clean up temp files
# file.remove(pdb.file)
# file.remove(subfiles)

return(burial.all)
}
Loading

0 comments on commit f78f520

Please sign in to comment.