Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docker wrapper #1

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions compute_image_similarities.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
library(optparse)
library(rhdf5)
library(proxy)


option_list = list(
make_option(c("-i", "--input"), action = "store", default = NA, type='character',
help="Path to the HDF5 file with the WND-CHRM features"),
make_option(c("-o", "--output"), action = "store", default = NA, type='character',
help="Basename to use for the output files (PCs and similarity matrix)"),
make_option(c("-n", "--nPCs"), action = "store", default = NA, type = "integer",
help = "Number of principal components to keep and use")
)

opt_parser <- OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

if (is.na(opt$input)){
print_help(opt_parser)
stop("Argument --input required.", call.=FALSE)
}


# Helper function to find elbow in a 2D curve represented by a list of ordered values
# This function finds the point with maximum distance from the line between
# the first and last points.
# Adapted from StackOverflow:
# http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve

find_elbow <- function(values) {

n <- length(values)

# Crude check to see if values are ordered
if (values[1]<=values[n]) {
stop("Values must be in decreasing order")
}

coords <- cbind(seq.int(n), as.matrix(values))
# Vector between first and last point
line <- coords[n,] - coords[1,]
# Normalize the vector
line <- line/sqrt(sum(line^2));
# Vector between all points and first point
V1 <- sweep(coords, 2, coords[1,]) # default function in sweep is - (minus)
# To calculate the distance to the line, we split V1 into two
# components, V2 that is parallel to the line and V3 that is perpendicular.
# The distance is the norm of the part that is perpendicular to the line.
# We find V2 by projecting V1 onto the line by taking the scalar product
# of V1 with the unit vector on the line (this gives us the length of the
# projection of V1 onto the line).
# We get V2 by multiplying this scalar product by the unit vector.
# The perpendicular vector is V1 - V2
V2 <- (V1 %*% line) %*% line
V3 <- V1 - V2
# Distance to line is the norm of V3
dist <- sqrt(rowSums(V3^2))
idx <- which.max(dist)

return(coords[idx,])
}


# Input file with WND_CHRM features
filename <- opt$input
fields <- h5ls(filename)

# Read HDF5 file
group_name <- paste(fields$group, fields$name, sep ="/")
h5data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE)
H5close()

# Get metadata
measures <- h5data$Measurements
imageID <- measures$ImageID
wellID <- measures$WellID

# Features are stored in a list of matrices
listOfFeatureMatrices <- measures[11:length(measures)]

# Make matrix of images x features
featureMatrix <- t(do.call(rbind, listOfFeatureMatrices))
row.names(featureMatrix) <- imageID

# There are multiple rows with the same imageID because of tiling
# We average all rows from the same image
featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean)
rownames(featureMatrix) <- featureMatrix[,1]

nImages <- nrow(featureMatrix) # 388372950 images
randomSetSize <- 0.01*nImages
featureMatrix <- featureMatrix[sample(nrow(featureMatrix), randomSetSize), ]

# Remove constant features
featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))]
rm(fields, group_name, h5data, measures, imageID, wellID, listOfFeatureMatrices)

# PCA
pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE)

# Select PCs
k <- NULL
if (!is.na(opt$nPCs)) {
k <- opt$nPCs
} else {
k <- find_elbow(pca$sdev)[1]
}
saveRDS(pca$x[,1:k], paste0(opt$output,"-PCs.rds"))

# Cosine similarity
S <- as.matrix(simil(pca$x[,1:k], method = "cosine", by_rows = TRUE), diag = 1)
saveRDS(S, paste0(opt$output,"-similarities.rds"))
8 changes: 8 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM r-base
MAINTAINER ome-devel@lists.openmicroscopy.org.uk

ADD r-base-docker.R /
RUN Rscript /r-base-docker.R
RUN useradd -m user -d /scratch
WORKDIR /scratch
ENTRYPOINT ["/usr/bin/Rscript"]
13 changes: 13 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Serrano Remining R Docker

A plain R Docker image for running R scripts in this repository.
Example:

docker build -t serrano-remining-rbase .

docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \
serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5

docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \
serrano-remining-rbase /src/compute_image_similarities.R \
-i /src/example_features.h5 -o /scratch/out
15 changes: 15 additions & 0 deletions docker/r-base-docker.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
install.packages(c(
"ggplot2",
"knitr",
"lsa",
"mclust",
"optparse",
"plotly",
"proxy",
"pryr",
"reshape",
"sysfonts",
"viridis",
"xkcd"))
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
52 changes: 52 additions & 0 deletions test_HDF5.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
args = commandArgs(trailingOnly=TRUE)
if (length(args)!=1) {
stop("One argument required (h5-file)", call.=FALSE)
}

#### 1) Install and load *rhdf5*
if (!("rhdf5" %in% rownames(installed.packages())))
{
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
}

library(rhdf5)

#### 2) Explore the structure
filename <- args[1]
fields <- h5ls(filename)
str(fields)

#### 3) Read *example.h5*
group_name <- paste0(fields$group, fields$name)
data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE)
H5close()

#### 4) Get metadata
measures <- data$Measurements
imageID <- measures$ImageID
wellID <- measures$WellID

#### 5) Get feature values (imageID: `r imageID`; well: `r wellID`)
# Features are stored in a list of matrices
featureListOfMatrices <- measures[11:length(measures)]

## 1) Feature values
# Feature Vector has 2919 values
featureVector <- as.vector(do.call(rbind, featureListOfMatrices))

## 2) Feature names: Build new ID for each feature
# Length of each feature type
feat_size <- lapply(featureListOfMatrices, length)
feat_size <- data.frame(name=names(feat_size), size=unlist(feat_size))
rownames(feat_size) <- seq(1:nrow(feat_size))
# Repeat "length" times the name
a <- rep(feat_size$name, feat_size$size)
# Build sequences of number to create the ids
b <- c()
for(nElem in feat_size$size)
{
b <- c(b, seq(1:nElem))
}

(feature_value <- data.frame(feature=paste(a,b,sep="_"), value=featureVector))