Skip to content

Commit

Permalink
add strobealign to Dockerfile for further testing
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Dec 19, 2024
1 parent 03a33c4 commit dcac331
Show file tree
Hide file tree
Showing 5 changed files with 466 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ RUN apt-get -y install \
python3-dev \
python3-pip \
libjemalloc-dev \
libisal-dev \
cmake \
make \
g++ \
Expand Down Expand Up @@ -81,6 +82,15 @@ RUN wget https://github.com/samtools/samtools/releases/download/1.21/samtools-1.
&& cd .. \
&& rm -rf samtools-1.21

##install strobealign
# Build strobealign from source with CMake; -msse4.2 is passed to both the C
# and the C++ compiler. The build tree is kept in strobealign/build.
# NOTE(review): the clone is not pinned to a tag or commit, so image rebuilds
# may pick up a different strobealign revision.
RUN git clone https://github.com/ksahlin/strobealign \
&& cd strobealign \
&& cmake -B build -DCMAKE_C_FLAGS="-msse4.2" -DCMAKE_CXX_FLAGS="-msse4.2" \
&& cmake --build build -j 8 \
&& cd ..

# Expose the build directory on PATH (key=value form; the whitespace-separated
# "ENV key value" form is legacy syntax).
# NOTE(review): assumes the clone above runs with /opt as the working
# directory - confirm against the WORKDIR set earlier in this Dockerfile.
ENV PATH=/opt/strobealign/build:$PATH

##install bwa-mem
RUN git clone https://github.com/lh3/bwa.git \
&& cd bwa \
Expand Down Expand Up @@ -187,4 +197,4 @@ RUN conda create -y -n renv -c conda-forge -c bioconda \
bioconductor-rtracklayer=1.62.0 \
r-randomcolor=1.1.0.1
RUN echo "source activate renv" > ~/.bashrc
ENV PATH /miniconda/envs/renv/bin:$PATH
ENV PATH /miniconda/envs/renv/bin:$PATH
50 changes: 50 additions & 0 deletions cosigt_smk/workflow/rules/odgi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,56 @@ rule odgi_paths_matrix:
cut -f 1,4- | gzip > {output}
'''

# Tabulate per-node sequence lengths from the output of rule odgi_view.
# The shell step keeps the lines that start with 'S' and, for each of them,
# writes "node.<field 2>" and the character length of field 3 as two
# tab-separated columns to {region}.len.tsv.
# (presumably the input is GFA text and the 'S' lines are segment records -
# confirm against rule odgi_view)
rule odgi_view_len:
'''
https://github.com/pangenome/odgi
'''
input:
rules.odgi_view.output
output:
config['output'] + '/odgi/view/{region}.len.tsv'
threads:
1
resources:
# memory and time requests grow linearly with the retry attempt number
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1726671973'
conda:
'../envs/odgi.yaml'
benchmark:
'benchmarks/{region}.odgi_view_len.benchmark.txt'
shell:
# doubled braces are literal braces for awk after Snakemake formatting;
# "\\t" reaches awk as the two characters \t, which awk reads as a tab in OFS
'''
grep '^S' {input} | \
awk '{{print("node."$2,length($3))}}' OFS="\\t" > {output}
'''

# Run the `flt` command on the `coverage` and `size` inputs and gzip its
# standard output into matrix_flt/{region}.tsv.gz.
# (`flt` is presumably provided by the cosigt_workflow container - confirm)
rule filter_odgi_matrix:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
# NOTE(review): `coverage` is taken from rules.odgi_chop.output - confirm this
# is the file `flt` expects as its first argument
coverage=rules.odgi_chop.output,
# per-node length table written by rule odgi_view_len
size=rules.odgi_view_len.output
output:
config['output'] + '/odgi/paths/matrix_flt/{region}.tsv.gz'
threads:
1
resources:
# memory and time requests grow linearly with the retry attempt number
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
# NOTE(review): the `latest` tag is not pinned, so the image may change between runs
'docker://davidebolo1993/cosigt_workflow:latest'
# the conda directive is commented out, so this rule declares no conda environment
#conda:
#'../envs/odgi.yaml'
benchmark:
'benchmarks/{region}.filter_odgi_matrix.benchmark.txt'
shell:
'''
flt {input.coverage} {input.size} | gzip > {output}
'''

rule odgi_similarity:
'''
https://github.com/pangenome/odgi
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/workflow/scripts/cluster.r
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ regularMatrix[is.na(regularMatrix)]<-1
distanceMatrix <- as.dist(regularMatrix)

# Calculate silhouette score and best partition
max_cluster <- round(length(unique(df$group.a)) / 3) ##control
max_cluster <- round(length(unique(df$group.a)) / 5) ##control
res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette",
distance = NULL, max.nc = max_cluster)$Best.partition

Expand Down
51 changes: 51 additions & 0 deletions cosigt_smk/workflow/scripts/cluster2.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
library(data.table)
library(dbscan)
library(rjson)
library(reshape2)
library(reshape2)
library(NbClust)

# Cluster haplotypes from pairwise dissimilarities, once per metric, with two
# strategies: DBSCAN (eps picked from the k-NN distance curve) and
# average-linkage clustering with the number of clusters chosen by silhouette.
#
# Input : long-format table with columns group.a, group.b and one column per
#         dissimilarity metric listed in `metrics`.
# Output: knn.<metric>.pdf, dbscan.<metric>.json, agglomerative.<metric>.json
#         in the working directory.
input_file <- "filt.tsv.gz"
df <- fread(input_file)

# Dissimilarity columns of `df` to cluster on, one pass per column.
metrics <- c(
  "euclidean.dist", "jaccard.dist", "cosine.dissim", "manhattan.dist"
)

# DBSCAN density threshold; the k-NN curve is evaluated at k = min_pts - 1.
min_pts <- 3

# Upper bound on the number of clusters NbClust evaluates. It depends only on
# the number of samples, so it is computed once rather than per metric.
max_cluster <- round(length(unique(df$group.a)) / 5) ##control

# Write a named vector of cluster ids as a JSON object mapping each sample
# name to "HaploGroup<id>". Both clustering strategies share this formatting.
write_haplogroups <- function(assignments, output_file) {
  res.list <- lapply(split(assignments, names(assignments)), unname)
  named_res <- lapply(
    res.list,
    function(x, prefix) paste0(prefix, x),
    prefix = "HaploGroup"
  )
  write(toJSON(named_res), output_file)
}

for (d in metrics) {
  regularMatrix <- acast(df, group.a ~ group.b, value.var = d)
  distanceMatrix <- as.dist(regularMatrix)

  # Diagnostic plot of the sorted k-NN distances
  pdf(paste0("knn.", d, ".pdf"))
  kNNdistplot(distanceMatrix, k = min_pts - 1)
  dev.off()

  # Pick eps at the point of maximum curvature of the sorted k-NN distances.
  sorted_kNN <- sort(kNNdist(distanceMatrix, k = min_pts - 1))
  if (length(sorted_kNN) < 3) {
    stop(
      "Need at least 3 samples to estimate eps for metric '", d, "'",
      call. = FALSE
    )
  }
  second_derivative <- diff(sorted_kNN, differences = 2)
  # second_derivative[i] is centred on sorted_kNN[i + 1], hence the + 1
  optimal_eps <- sorted_kNN[[which.max(second_derivative) + 1]]

  # Use the eps estimated for this metric. A single fixed eps cannot fit all
  # four metrics, because they live on very different scales.
  db <- dbscan(distanceMatrix, minPts = min_pts, eps = optimal_eps)
  cl <- db$cluster
  names(cl) <- labels(distanceMatrix)
  # dbscan reports noise points as cluster 0, so they end up as "HaploGroup0"
  write_haplogroups(cl, paste0("dbscan.", d, ".json"))

  # Calculate silhouette score and best partition
  res <- NbClust(
    diss = distanceMatrix, method = "average", index = "silhouette",
    distance = NULL, max.nc = max_cluster
  )$Best.partition
  write_haplogroups(res, paste0("agglomerative.", d, ".json"))
}
Loading

0 comments on commit dcac331

Please sign in to comment.