initial commit

danieljrichard · Feb 27, 2018 · f78f520 · f78f520
commit f78f520
Show file tree

Hide file tree

Showing 29 changed files with 1,977 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.fasta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: mavevis
+Title: Visualization for MaveDB
+Version: 0.0.0.9000
+Authors@R: person("Jochen", "Weile", email = "jochenweile@gmail.com", role = c("aut", "cre"))
+Description: Query data from MaveDB and visualize as genophenograms with added tracks for structure information.
+Depends: R (>= 3.2.3)
+License: GPL
+Encoding: UTF-8
+LazyData: true
+Imports: rapimave, hgvsParseR, yogitools, httr, gdata, hash
+Suggests: testthat
+RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,15 @@
+# Generated by roxygen2: do not edit by hand
+
+export(calc.conservation)
+export(calc.strucfeats)
+export(dashboard)
+export(find.pdbs)
+export(genophenogram)
+export(getUniprotSeq)
+export(new.amasLite)
+export(new.structure)
+export(new.trackdrawer)
+export(query.pdb)
+export(run.dssp)
+export(run.sasa)
+export(subcomplex.combos)
diff --git a/R/amasLite.R b/R/amasLite.R
@@ -0,0 +1,73 @@
+
+#' Calculate AMAS conservation
+#' 
+#' Uses the AMAS method (Livingstone & Barton 1993) method to calculate position-wise
+#' conservation from a protein multiple sequence alignment. Creates an object of type
+#' amasLite, which features a single method: \code{run(alignmentFile)}.
+#' 
+#' The \code{run(alignmentFile)} method takes the name of a file as its parameter. 
+#' The file must be a FASTA formatted multiple sequence alignment. The method returns
+#' a numerical vector listing the position-wise conservation with respect to the first
+#' entry in the alignment.
+#' 
+#' @return an object of type amasLite
+#' @export
+new.amasLite <- function() {
+
+	# .props <- read.csv(amasFile)
+	# colnames(.props)[[ncol(.props)]] <- "-"
+
+	data("amasProps")
+
+	toChars <- function(s) sapply(1:nchar(s),function(i)substr(s,i,i))
+
+	readFASTA <- function(fastaFile) {
+
+		con <- file(fastaFile,open="r")
+
+		currSeq <- character(0)
+
+		out <- list()
+		outnames <- character(0)
+
+		while (length(line <- readLines(con,1))>0) {
+			if (substr(line,1,1)==">") {
+				#store name
+				outnames[length(outnames)+1] <- line
+				#write old sequence
+				if (length(currSeq) > 0) {
+					out[[length(out)+1]] <- currSeq
+				}
+				#prep for next sequence
+				currSeq <- character(0)
+			} else {
+				#append to current sequence
+				currSeq <- c(currSeq,toChars(line))
+			}
+		}
+		#write the last sequence
+		out[[length(out)+1]] <- currSeq
+		close(con)
+
+		out <- do.call(rbind,out)
+		rownames(out) <- outnames
+		out
+	}
+
+	run <- function(alignmentFile) {
+
+		alignment <- readFASTA(alignmentFile)
+		scores <- apply(alignment,2,function(x) {
+			aas <- unique(x)
+			sum(apply(amasProps[,aas,drop=FALSE],1,function(p) {
+				all(p) | !any(p)
+			}))
+		})
+
+		human.pos <- which(alignment[1,] != "-")
+		scores[human.pos]
+	}
+
+
+	list(run=run)
+}
diff --git a/R/calcStrucFeats.R b/R/calcStrucFeats.R
@@ -0,0 +1,242 @@
+options(stringsAsFactors=FALSE)
+
+library("httr")
+suppressMessages(library("gdata"))
+# source("lib/libpdb.R")
+# source("lib/cliargs.R")
+# source("lib/trackdrawer.R")
+# source("lib/amasLite.R")
+
+
+
+#' Query PDB
+#' 
+#' Queries the Protein Data Bank (PDB) for protein structure data for
+#' a given database accession and writes the result to file
+#' 
+#' @param pdb.acc the PDB accession to query
+#' @param pdb.file the name of the file to which the output will be written.
+#' @return the name of the output file.
+#' @examples
+#' \dontrun{
+#' #with a pre-determined output file
+#' outfile <- "sumo_conjugase_structure.pdb"
+#' query.pdb("3UIP",outfile)
+#' 
+#' #with an autogenerated output file
+#' outfile <- query.pdb("3UIP")
+#' }
+#' @export
+query.pdb <- function(pdb.acc,pdb.file=paste0(pdb.acc,".pdb")) {
+	pdb.url <- paste0("https://files.rcsb.org/download/",pdb.acc,".pdb")
+	htr <- GET(pdb.url)
+	if (http_status(htr)$category == "Success") {
+		writeBin(content(htr, "raw"), pdb.file)
+		return(pdb.file)
+	} else {
+		# logger$fatal("Unable to download PDB file")
+		stop("Unable to download PDB file")
+	}
+}
+
+#' Run FreeSASA on a PDB structure
+#' 
+#' Calculates solvent accessible surface area on a given structure using the 
+#' external software FreeSASA, which must be installed and available via $PATH.
+#' 
+#' @param pdb.file The PDB file on which to run FreeSASA
+#' @return a \code{data.frame} with the output of FreeSASA
+#' @examples
+#' \dontrun{
+#' sasa <- run.sasa("3UIP.pdb")
+#' }
+#' @export
+run.sasa <- function(pdb.file) {
+	widths <- c(
+		type=3,res=4,chain=2,pos=4,all.abs=9,all.rel=6,side.abs=7,side.rel=6,
+		main.abs=7,main.rel=6,nonpolar.abs=7,nonpolar.rel=6,polar.abs=7,polar.rel=6
+	)
+	lines <- system(paste("freesasa --format=rsa",pdb.file),intern=TRUE)
+	lines <- lines[regexpr("^RES",lines) > 0]
+	con <- textConnection(lines)
+	sasa.out <- read.fwf(con,widths,strip.white=TRUE)
+	close(con)
+	colnames(sasa.out) <- names(widths)
+	for (i in 4:14) sasa.out[,i] <- as.numeric(sasa.out[,i])
+	return(sasa.out)
+}
+
+#' Run DSSP on a PDB structure
+#' 
+#' Calculates secondary structure information from a PDB file using the
+#' external software DSSP, which must be installed and available via $PATH
+#' 
+#' @param pdb.file The PDB file on which to run DSSP
+#' @return a \code{data.frame} with the output of DSSP
+#' @examples
+#' \dontrun{
+#' dssp.out <- run.dssp(pdb.file)
+#' }
+#' @export
+run.dssp <- function(pdb.file) {
+	cols <- c(num=5,residue=5,chain=2,aa=3,struc=3,struc.flags=7,
+		bp1=5,bp2=4,acc=4,no1=12,on1=11,no2=11,on2=11,tco=8,
+		kappa=6,alpha=6,phi=6,psi=6,x.ca=7,y.ca=7,z.ca=7)
+	lines <- system(paste("dssp",pdb.file),intern=TRUE)
+	lines <- lines[regexpr("^\\s+\\d+\\s+\\d+ \\w{1} \\w{1}",lines) > 0]
+	con <- textConnection(lines)	
+	dssp.out <- read.fwf(
+		con,
+		widths=cols,
+		stringsAsFactors=FALSE,
+		strip.white=TRUE
+	)
+	close(con)
+	colnames(dssp.out) <- names(cols)
+	substr(dssp.out$structure,1,1)
+	return(dssp.out)
+}
+
+
+#' Generate PDB files for complex components
+#' 
+#' This function finds the component chains of a given PDB structure and generates
+#' new PDB files for any desired set of combinations of these components
+#' 
+#' @param pdb.file the input PDB file for the complex
+#' @param chain.sets a list of character vectors, detailing the combinations of chains to 
+#'     produce. E.g. \code{list("A",c("A","B"),c("A","C"))} produces one file for 
+#'     chain A, one for the combination of chains A and B, and one for the combination
+#'     of chains A and C.
+#' @return a vector of file names corresponding to the elements of \code{chain.sets}
+#' @examples
+#' \dontrun{
+#' subcplx.files <- sub.complex.combos("3UIP.pdb",chain.sets=list("A",c("A","B"),c("A","C")))
+#' }
+#' @export
+subcomplex.combos <- function(pdb.file,chain.sets) {
+	lines <- scan(pdb.file,what="character",sep="\n")
+	atom.lines <- which(grepl("^ATOM|^TER",lines))
+	model.lines <- which(grepl("^MODEL",lines))
+	if (length(model.lines) > 0) {
+		endmdl.lines <- which(grepl("^ENDMDL",lines))
+		header.lines <- 1:model.lines[[1]]
+		footer.lines <- tail(endmdl.lines,1):length(lines)
+		applicable.atom.lines <- atom.lines[atom.lines > model.lines[[1]] & atom.lines < endmdl.lines[[1]]]		
+	} else {
+		header.lines <- 1:(atom.lines[[1]]-1)
+		footer.lines <- (tail(atom.lines,1)+1):length(lines)
+		applicable.atom.lines <- atom.lines
+	}
+	applicable.chains <- substr(lines[applicable.atom.lines],22,22)
+	subfiles <- sapply(chain.sets,function(cset) {
+		outfile <- tempfile(fileext=".pdb")
+		con <- file(outfile,open="w")
+		writeLines(lines[header.lines],con)
+		writeLines(lines[applicable.atom.lines[applicable.chains %in% cset]],con)
+		writeLines(lines[footer.lines],con)
+		close(con)
+		return(outfile)
+	})
+	return(subfiles)
+}
+
+
+#' Calculate structural features
+#' 
+#' Calculates structural features of a protein from a given PDB entry
+#' 
+#' @param pdb.acc The PDB accession to use, e.g. "3UIP"
+#' @param main.chain The chain identifier in the PDB file that corresponds 
+#'    to the protein of interest. Should be a single uppercase letter, e.g. "A".
+#' @return a \code{data.frame} detailing secondary structure, solvent accessibility,
+#'    and burial in interfaces.
+#' @examples
+#' \dontrun{
+#' sfeats <- calc.strucfeats("3UIP","A")
+#' }
+#' @export
+calc.strucfeats <- function(pdb.acc,main.chain) {
+
+	set_config(config(ssl_verifypeer = 0L))
+
+	cat("Querying PDB...\n")
+	#set up cache file for structure
+	pdb.file <- paste0(pdb.acc,".pdb")
+	# pdb.file <- tempfile(fileext=".pdb")
+
+	if (!file.exists(pdb.file)) {
+		query.pdb(pdb.acc,pdb.file)
+	}
+	cat("Parsing PDB file...\n")
+
+	cplx.struc <- new.structure(pdb.file)
+	chains <- cplx.struc$get.chains()
+
+	if (!(main.chain %in% chains)) {
+		stop("The requested chain ",main.chain," doesn't exist in ",pdb.acc)
+	}
+
+	chain.info <- cplx.struc$get.chain.info()
+	# rownames(chain.info) <- chain.info$chain
+
+	chain.names <- sapply(chains,function(ch) with(chain.info,as.character(name[db=="UNP" & chain == ch])))
+	chain.names <- sub("_HUMAN","",chain.names)
+
+
+	if (length(chains) > 1) {
+		cat("Splitting protein complex...\n")
+
+		#list of chain pairs
+		other.chains <- setdiff(chains,main.chain)
+		chain.sets <- c(main.chain,lapply(other.chains,c,main.chain))
+		#Subdivide into chain pairs
+		# subfiles <- cplx.struc$subcomplex.combos(main.chain)
+		subfiles <- subcomplex.combos(pdb.file,chain.sets)
+
+		cat("Calculating surface area...\n")
+		acctables <- mapply(function(cset,subfile) {
+			#re-order chain ids to original file order
+			cset <- intersect(chains,cset)
+			acctable <- run.sasa(subfile)
+			acctable[acctable$chain==main.chain,]
+		},cset=chain.sets,subfile=subfiles,SIMPLIFY=FALSE)
+
+		cat("Calculating interface areas...\n")
+		#calculate burial based on accessibities
+		burial <- cbind(acctables[[1]],do.call(cbind,lapply(2:length(acctables),function(i) {
+			before <- acctables[[1]][,"all.abs"]
+			data.frame(
+				abs.burial=before-acctables[[i]][,"all.abs"],
+				rel.burial=(before-acctables[[i]][,"all.abs"])/(before+.000001)
+			)
+		})))
+		colnames(burial)[seq(15,ncol(burial),2)] <- paste0("abs.burial.",chain.names[other.chains])
+		colnames(burial)[seq(16,ncol(burial),2)] <- paste0("rel.burial.",chain.names[other.chains])
+
+	} else {
+		acctable <- run.sasa(pdb.file)
+		burial <- acctable
+	}
+
+	cat("Calculating secondary structures...\n")
+	#calculate secondary structure
+	dssp.out <- run.dssp(pdb.file)
+	secstruc <- dssp.out[dssp.out$chain==main.chain,c("residue","struc")]
+
+
+	cat("Compiling results...\n")
+	rownames(burial) <- burial$pos
+	rownames(secstruc) <- secstruc$residue
+
+	allpos <- 1:max(burial$pos)
+	burial.all <- cbind(burial[as.character(allpos),],secstruc=secstruc[as.character(allpos),"struc"])
+	rownames(burial.all) <- burial.all$pos <- allpos
+
+	# cat("Deleting temporary files...\n")
+	# #clean up temp files
+	# file.remove(pdb.file)
+	# file.remove(subfiles)
+
+	return(burial.all)
+}