From dee91a4727fdef3cb2f081beef842f855cef9011 Mon Sep 17 00:00:00 2001
From: davidebolo1993
Date: Wed, 8 Jan 2025 10:22:34 +0100
Subject: [PATCH] switch to impg latest, add rule to output node length

---
 Dockerfile                             |  11 +-
 cosigt_smk/workflow/Snakefile          |   1 +
 cosigt_smk/workflow/envs/impg.yaml     |   2 +-
 cosigt_smk/workflow/rules/gafpack.smk  |   3 +-
 cosigt_smk/workflow/rules/impg.smk     |   1 +
 cosigt_smk/workflow/rules/odgi.smk     |  31 +--
 cosigt_smk/workflow/scripts/cluster2.r |  51 ----
 cosigt_smk/workflow/scripts/filt.go    | 353 ------------------------
 8 files changed, 9 insertions(+), 444 deletions(-)
 delete mode 100644 cosigt_smk/workflow/scripts/cluster2.r
 delete mode 100644 cosigt_smk/workflow/scripts/filt.go

diff --git a/Dockerfile b/Dockerfile
index 592d44a..f020df9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -82,15 +82,6 @@ RUN wget https://github.com/samtools/samtools/releases/download/1.21/samtools-1.
 && cd .. \
 && rm -rf samtools-1.21
 
-##install strobealign
-RUN git clone https://github.com/ksahlin/strobealign \
-&& cd strobealign \
-&& cmake -B build -DCMAKE_C_FLAGS="-msse4.2" -DCMAKE_CXX_FLAGS="-msse4.2" \
-&& cmake --build build -j 8 \
-&& cd ..
-
-ENV PATH /opt/strobealign/build:$PATH
-
 ##install bwa-mem
 RUN git clone https://github.com/lh3/bwa.git \
 && cd bwa \
@@ -158,7 +149,7 @@ RUN git clone https://github.com/AndreaGuarracino/gfainject \
 ##install impg
 RUN git clone https://github.com/pangenome/impg \
 && cd impg \
-&& git checkout 37b18e18123d92fa5fa824a4e15d8aa7cad3b6db \
+&& git checkout 15248982d5a873b36e5fe4547b265f9f172cfb54 \
 && cargo install --force --path . \
 && cp target/release/impg ../impg-tmp \
 && cd .. \
diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile
index 6a8ecef..85425b1 100644
--- a/cosigt_smk/workflow/Snakefile
+++ b/cosigt_smk/workflow/Snakefile
@@ -28,6 +28,7 @@ for region in config['region']:
 		cosigt_input.append(config['output'] + '/cosigt/'+ sample + '/' + region + '/cosigt_genotype.tsv')
 		cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf')
 		cosigt_input.append(config['output'] + '/odgi/viz/' + region + '.png')
+		cosigt_input.append(config['output'] + '/odgi/view/' + region + '.node.length.tsv')
 	if region in config['annotations']:
 		cosigt_input.append(config['output'] + '/odgi/untangle/' + region + '.gggenes.pdf')
diff --git a/cosigt_smk/workflow/envs/impg.yaml b/cosigt_smk/workflow/envs/impg.yaml
index d320665..848f13a 100644
--- a/cosigt_smk/workflow/envs/impg.yaml
+++ b/cosigt_smk/workflow/envs/impg.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - impg=0.2.2
\ No newline at end of file
+  - impg=0.2.3
\ No newline at end of file
diff --git a/cosigt_smk/workflow/rules/gafpack.smk b/cosigt_smk/workflow/rules/gafpack.smk
index 7102ba3..c787f41 100644
--- a/cosigt_smk/workflow/rules/gafpack.smk
+++ b/cosigt_smk/workflow/rules/gafpack.smk
@@ -23,5 +23,6 @@ rule gafpack_coverage:
 		gafpack \
 		--gfa {input.gfa} \
 		--gaf {input.gaf} \
-		--len-scale --weight-queries | gzip > {output}
+		--len-scale \
+		--weight-queries | gzip > {output}
 	'''
diff --git a/cosigt_smk/workflow/rules/impg.smk b/cosigt_smk/workflow/rules/impg.smk
index b802489..adfa3a5 100644
--- a/cosigt_smk/workflow/rules/impg.smk
+++ b/cosigt_smk/workflow/rules/impg.smk
@@ -23,6 +23,7 @@
 	shell:
 		'''
 		impg \
+		query \
 		-p {input.paf} \
 		-b {input.bed} | \
 		grep -v \
diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk
index a97cd03..86a0d22 100644
--- a/cosigt_smk/workflow/rules/odgi.smk
+++ b/cosigt_smk/workflow/rules/odgi.smk
@@ -78,14 +78,14 @@ rule odgi_paths_matrix:
 		cut -f 1,4- | gzip > {output}
 	'''
 
-rule odgi_view_len:
+rule odgi_view_node_length:
 	'''
 	https://github.com/pangenome/odgi
 	'''
 	input:
 		rules.odgi_view.output
 	output:
-		config['output'] + '/odgi/view/{region}.len.tsv'
+		config['output'] + '/odgi/view/{region}.node.length.tsv'
 	threads:
 		1
 	resources:
@@ -96,38 +96,13 @@
 		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
 		time=lambda wildcards, attempt: attempt * config['default']['time']
 	conda:
 		'../envs/odgi.yaml'
 	benchmark:
-		'benchmarks/{region}.odgi_view_len.benchmark.txt'
+		'benchmarks/{region}.odgi_view_node_length.benchmark.txt'
 	shell:
 		'''
 		grep '^S' {input} | \
 		awk '{{print("node."$2,length($3))}}' OFS="\\t" > {output}
 		'''
 
-rule filter_odgi_matrix:
-	'''
-	https://github.com/davidebolo1993/cosigt
-	'''
-	input:
-		coverage=rules.odgi_chop.output,
-		size=rules.odgi_view_len.output
-	output:
-		config['output'] + '/odgi/paths/matrix_flt/{region}.tsv.gz'
-	threads:
-		1
-	resources:
-		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
-		time=lambda wildcards, attempt: attempt * config['default']['time']
-	container:
-		'docker://davidebolo1993/cosigt_workflow:latest'
-	#conda:
-	#'../envs/odgi.yaml'
-	benchmark:
-		'benchmarks/{region}.filter_odgi_matrix.benchmark.txt'
-	shell:
-		'''
-		flt {input.coverage} {input.size} | gzip > {output}
-		'''
-
 rule odgi_similarity:
 	'''
 	https://github.com/pangenome/odgi
diff --git a/cosigt_smk/workflow/scripts/cluster2.r b/cosigt_smk/workflow/scripts/cluster2.r
deleted file mode 100644
index 02498e4..0000000
--- a/cosigt_smk/workflow/scripts/cluster2.r
+++ /dev/null
@@ -1,51 +0,0 @@
-library(data.table)
-library(dbscan)
-library(rjson)
-library(reshape2)
-library(reshape2)
-library(NbClust)
-
-input_file<-"filt.tsv.gz"
-df<-fread(input_file)
-
-for (d in c("euclidean.dist","jaccard.dist","cosine.dissim","manhattan.dist")) {
-
-  regularMatrix <- acast(df, group.a ~ group.b, value.var = d)
-  distanceMatrix<-as.dist(regularMatrix)
-  pdf(paste0("knn.",d,".pdf"))
-  kNNdistplot(distanceMatrix,k=2)
-  dev.off()
-  kNN_distances <- kNNdist(distanceMatrix, k = 2)
-  sorted_kNN <- sort(kNN_distances)
-  first_derivative <- diff(sorted_kNN)
-  # Step 2: Compute the second derivative
-  second_derivative <- diff(first_derivative)
-  # Step 3: Identify the index with the maximum second derivative
-  optimal_index <- which.max(second_derivative)
-  # Step 4: Retrieve the corresponding `eps` value
-  optimal_eps <- sorted_kNN[optimal_index + 1] # +1 d
-  db<-dbscan(distanceMatrix,minPts=3, eps=4.3)
-  cl<-db$cluster
-  names(cl)<-labels(distanceMatrix)
-  res.list <- lapply(split(cl, names(cl)), unname)
-  named_res <- lapply(cl, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
-  jout <- toJSON(named_res)
-  # Write JSON output
-  output_file<-paste0("dbscan.",d,".json")
-  write(jout, output_file)
-
-
-  max_cluster <- round(length(unique(df$group.a)) / 5) ##control
-  res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette",
-    distance = NULL, max.nc = max_cluster)$Best.partition
-
-  # Format results
-  res.list <- lapply(split(res, names(res)), unname)
-  named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
-  jout <- toJSON(named_res)
-
-  # Write JSON output
-  output_file<-paste0("agglomerative.",d,".json")
-  write(jout, output_file)
-
-}
\ No newline at end of file
diff --git a/cosigt_smk/workflow/scripts/filt.go b/cosigt_smk/workflow/scripts/filt.go
deleted file mode 100644
index 040b6a4..0000000
--- a/cosigt_smk/workflow/scripts/filt.go
+++ /dev/null
@@ -1,353 +0,0 @@
-package main
-
-import (
-	"bufio"
-	"compress/gzip"
-	"fmt"
-	"io"
-	"log"
-	"math"
-	"os"
-	"sort"
-	"strconv"
-	"strings"
-	"sync"
-)
-
-// Result represents the distance metrics between two groups
-type Result struct {
-	GroupA        string
-	GroupB        string
-	EuclideanDist float64
-	JaccardDist   float64
-	CosineDissim  float64
-	ManhattanDist float64
-}
-
-func main() {
-	// Configure logging to write to stderr
-	log.SetOutput(os.Stderr)
-
-	// Validate command-line arguments
-	if len(os.Args) < 3 {
-		log.Fatalf("Usage: %s ", os.Args[0])
-	}
-
-	inputFile := os.Args[1]
-	nodeLengthFile := os.Args[2]
-
-	// Read input data
-	df, err := readGzippedTSV(inputFile)
-	if err != nil {
-		log.Fatalf("Error reading input file: %v", err)
-	}
-
-	// Read node lengths
-	nodelengths, err := readNodeLengths(nodeLengthFile)
-	if err != nil {
-		log.Fatalf("Error reading node length file: %v", err)
-	}
-
-	// Filter nodes based on coverage differences
-	filtered := filterNodes(df, nodelengths)
-
-	// Generate group combinations
-	combinations := generateCombinations(filtered["path.name"])
-
-	// Compute distances concurrently
-	results := computeDistances(filtered, combinations)
-
-	// Track printed groups to add self-combinations
-	printedGroups := make(map[string]bool)
-
-	// Print header
-	fmt.Println("group.a\tgroup.b\teuclidean.dist\tjaccard.dist\tcosine.dissim\tmanhattan.dist")
-
-	// Print results with self-combinations
-	for _, res := range results {
-		// Add self-combinations with zero distances
-		for _, group := range []string{res.GroupA, res.GroupB} {
-			if !printedGroups[group] {
-				log.Printf("Adding self-combination for group: %s", group)
-				fmt.Printf("%s\t%s\t0.0000\t0.0000\t0.0000\t0.0000\n", group, group)
-				printedGroups[group] = true
-			}
-		}
-
-		// Print group pair results in both directions
-		fmt.Printf("%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n", res.GroupA, res.GroupB, res.EuclideanDist, res.JaccardDist, res.CosineDissim, res.ManhattanDist)
-		fmt.Printf("%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n", res.GroupB, res.GroupA, res.EuclideanDist, res.JaccardDist, res.CosineDissim, res.ManhattanDist)
-	}
-}
-
-// readGzippedTSV reads a gzipped TSV file into a map of string slices
-func readGzippedTSV(filename string) (map[string][]string, error) {
-	file, err := os.Open(filename)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
-
-	gzReader, err := gzip.NewReader(file)
-	if err != nil {
-		return nil, err
-	}
-	defer gzReader.Close()
-
-	reader := bufio.NewReader(gzReader)
-
-	var headers []string
-	data := make(map[string][]string)
-	lineNum := 0
-
-	for {
-		line, err := reader.ReadString('\n')
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			return nil, err
-		}
-
-		line = strings.TrimSpace(line)
-		columns := strings.Split(line, "\t")
-
-		if lineNum == 0 {
-			// Capture headers
-			headers = columns
-			for _, header := range headers {
-				data[header] = []string{}
-			}
-		} else {
-			// Store data
-			for i, col := range columns {
-				data[headers[i]] = append(data[headers[i]], col)
-			}
-		}
-		lineNum++
-	}
-
-	return data, nil
-}
-
-// readNodeLengths reads node length file and returns a set of valid nodes
-func readNodeLengths(filename string) (map[string]bool, error) {
-	file, err := os.Open(filename)
-	if err != nil {
-		return nil, fmt.Errorf("failed to open node length file: %w", err)
-	}
-	defer file.Close()
-
-	reader := bufio.NewReader(file)
-	data := make(map[string]bool)
-
-	for {
-		line, err := reader.ReadString('\n')
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			return nil, fmt.Errorf("error reading node length line: %w", err)
-		}
-
-		columns := strings.Fields(line)
-		if len(columns) < 2 {
-			log.Printf("Skipping invalid line: %s", line)
-			continue
-		}
-
-		nodeName := columns[0]
-		length, err := strconv.Atoi(columns[1])
-		if err != nil {
-			log.Printf("Invalid length for node %s: %v", nodeName, err)
-			continue
-		}
-
-		if length > 1 {
-			data[nodeName] = true
-			log.Printf("Valid node: %s (length: %d)", nodeName, length)
-		}
-	}
-
-	return data, nil
-}
-
-// filterNodes filters nodes based on differences in coverage and node lengths
-func filterNodes(data map[string][]string, validNodes map[string]bool) map[string][]string {
-	filtered := make(map[string][]string)
-	filtered["path.name"] = data["path.name"]
-
-	for id, values := range data {
-		if id == "path.name" {
-			continue
-		}
-
-		// Check node length validity
-		if _, ok := validNodes[id]; !ok {
-			log.Printf("Discarded node %s: too small", id)
-			continue
-		}
-
-		// Check for differences in the column
-		var diffExists bool
-		for i := 1; i < len(values); i++ {
-			if values[i] != values[i-1] {
-				diffExists = true
-				break
-			}
-		}
-
-		if diffExists {
-			filtered[id] = values
-		} else {
-			log.Printf("Discarded node %s: no difference", id)
-		}
-	}
-
-	return filtered
-}
-
-// generateCombinations creates all unique group pair combinations
-func generateCombinations(groups []string) [][2]string {
-	var combinations [][2]string
-	for i := 0; i < len(groups); i++ {
-		for j := i + 1; j < len(groups); j++ {
-			combinations = append(combinations, [2]string{groups[i], groups[j]})
-		}
-	}
-	return combinations
-}
-
-// computeDistances calculates various distance metrics concurrently
-func computeDistances(data map[string][]string, combinations [][2]string) []Result {
-	results := make([]Result, 0, len(combinations))
-	var mu sync.Mutex
-	var wg sync.WaitGroup
-
-	for _, comb := range combinations {
-		wg.Add(1)
-		go func(pair [2]string) {
-			defer wg.Done()
-
-			groupA, groupB := pair[0], pair[1]
-			h1 := extractValues(data, groupA)
-			h2 := extractValues(data, groupB)
-
-			// Parse values into floats
-			v1 := make([]float64, len(h1))
-			v2 := make([]float64, len(h2))
-			for i := range h1 {
-				v1[i], _ = strconv.ParseFloat(h1[i], 64)
-				v2[i], _ = strconv.ParseFloat(h2[i], 64)
-			}
-
-			// Compute Euclidean distance
-			eucDist := computeEuclideanDistance(v1, v2)
-
-			// Compute Jaccard distance
-			jaccDist := computeJaccardDistance(h1, h2)
-
-			// Compute Cosine Dissimilarity
-			cosineDissim := computeCosineDissimilarity(v1, v2)
-
-			// Compute Manhattan distance
-			manhattanDist := computeManhattanDistance(v1, v2)
-
-			// Add result to the list
-			res := Result{
-				GroupA:        groupA,
-				GroupB:        groupB,
-				EuclideanDist: eucDist,
-				JaccardDist:   jaccDist,
-				CosineDissim:  cosineDissim,
-				ManhattanDist: manhattanDist,
-			}
-
-			mu.Lock()
-			results = append(results, res)
-			mu.Unlock()
-		}(comb)
-	}
-
-	wg.Wait()
-	return results
-}
-
-// extractValues retrieves values for a specific group
-func extractValues(data map[string][]string, group string) []string {
-	// Find the row index where "path.name" matches the group
-	rowIndex := -1
-	for i, name := range data["path.name"] {
-		if strings.TrimSpace(name) == strings.TrimSpace(group) {
-			rowIndex = i
-			break
-		}
-	}
-
-	if rowIndex == -1 {
-		log.Fatalf("Group %s not found in path.name", group)
-	}
-
-	// Extract and sort node columns
-	var nodeColumns []string
-	for key := range data {
-		if strings.HasPrefix(key, "node.") {
-			nodeColumns = append(nodeColumns, key)
-		}
-	}
-
-	sort.Slice(nodeColumns, func(i, j int) bool {
-		numI, _ := strconv.Atoi(strings.TrimPrefix(nodeColumns[i], "node."))
-		numJ, _ := strconv.Atoi(strings.TrimPrefix(nodeColumns[j], "node."))
-		return numI < numJ
-	})
-
-	// Extract values for sorted node columns
-	var values []string
-	for _, key := range nodeColumns {
-		values = append(values, data[key][rowIndex])
-	}
-
-	return values
-}
-
-// Computation helper functions
-func computeEuclideanDistance(v1, v2 []float64) float64 {
-	var sumSquared float64
-	for i := range v1 {
-		diff := v1[i] - v2[i]
-		sumSquared += diff * diff
-	}
-	return math.Sqrt(sumSquared)
-}
-
-func computeJaccardDistance(h1, h2 []string) float64 {
-	intersection := 0
-	union := len(h1) + len(h2)
-	for i := range h1 {
-		if h1[i] == h2[i] {
-			intersection++
-		}
-	}
-	return 1.0 - float64(intersection)/float64(union-intersection)
-}
-
-func computeCosineDissimilarity(v1, v2 []float64) float64 {
-	dotProduct, magA, magB := 0.0, 0.0, 0.0
-	for i := range v1 {
-		dotProduct += v1[i] * v2[i]
-		magA += v1[i] * v1[i]
-		magB += v2[i] * v2[i]
-	}
-	magA, magB = math.Sqrt(magA), math.Sqrt(magB)
-	cosineSim := dotProduct / (magA * magB)
-	return 1 - cosineSim
-}
-
-func computeManhattanDistance(v1, v2 []float64) float64 {
-	var sumAbsDiff float64
-	for i := range v1 {
-		sumAbsDiff += math.Abs(v1[i] - v2[i])
-	}
-	return sumAbsDiff
-}
\ No newline at end of file