From dee91a4727fdef3cb2f081beef842f855cef9011 Mon Sep 17 00:00:00 2001
From: davidebolo1993
Date: Wed, 8 Jan 2025 10:22:34 +0100
Subject: [PATCH] switch to impg latest, add rule to output node length

---
 Dockerfile                             |  11 +-
 cosigt_smk/workflow/Snakefile          |   1 +
 cosigt_smk/workflow/envs/impg.yaml     |   2 +-
 cosigt_smk/workflow/rules/gafpack.smk  |   3 +-
 cosigt_smk/workflow/rules/impg.smk     |   1 +
 cosigt_smk/workflow/rules/odgi.smk     |  31 +--
 cosigt_smk/workflow/scripts/cluster2.r |  51 ----
 cosigt_smk/workflow/scripts/filt.go    | 353 ------------------------
 8 files changed, 9 insertions(+), 444 deletions(-)
 delete mode 100644 cosigt_smk/workflow/scripts/cluster2.r
 delete mode 100644 cosigt_smk/workflow/scripts/filt.go

diff --git a/Dockerfile b/Dockerfile
index 592d44a..f020df9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -82,15 +82,6 @@ RUN wget https://github.com/samtools/samtools/releases/download/1.21/samtools-1.
 && cd .. \
 && rm -rf samtools-1.21
 
-##install strobealign
-RUN git clone https://github.com/ksahlin/strobealign \
-&& cd strobealign \
-&& cmake -B build -DCMAKE_C_FLAGS="-msse4.2" -DCMAKE_CXX_FLAGS="-msse4.2" \
-&& cmake --build build -j 8 \
-&& cd ..
-
-ENV PATH /opt/strobealign/build:$PATH
-
 ##install bwa-mem
 RUN git clone https://github.com/lh3/bwa.git \
 && cd bwa \
@@ -158,7 +149,7 @@ RUN git clone https://github.com/AndreaGuarracino/gfainject \
 ##install impg
 RUN git clone https://github.com/pangenome/impg \
 && cd impg \
-&& git checkout 37b18e18123d92fa5fa824a4e15d8aa7cad3b6db \
+&& git checkout 15248982d5a873b36e5fe4547b265f9f172cfb54 \
 && cargo install --force --path . \
 && cp target/release/impg ../impg-tmp \
 && cd .. \
diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile
index 6a8ecef..85425b1 100644
--- a/cosigt_smk/workflow/Snakefile
+++ b/cosigt_smk/workflow/Snakefile
@@ -28,6 +28,7 @@ for region in config['region']:
 		cosigt_input.append(config['output'] + '/cosigt/'+ sample + '/' + region + '/cosigt_genotype.tsv')
 		cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf')
 		cosigt_input.append(config['output'] + '/odgi/viz/' + region + '.png')
+		cosigt_input.append(config['output'] + '/odgi/view/' + region + '.node.length.tsv')
 	if region in config['annotations']:
 		cosigt_input.append(config['output'] + '/odgi/untangle/' + region + '.gggenes.pdf')
diff --git a/cosigt_smk/workflow/envs/impg.yaml b/cosigt_smk/workflow/envs/impg.yaml
index d320665..848f13a 100644
--- a/cosigt_smk/workflow/envs/impg.yaml
+++ b/cosigt_smk/workflow/envs/impg.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - impg=0.2.2
\ No newline at end of file
+  - impg=0.2.3
\ No newline at end of file
diff --git a/cosigt_smk/workflow/rules/gafpack.smk b/cosigt_smk/workflow/rules/gafpack.smk
index 7102ba3..c787f41 100644
--- a/cosigt_smk/workflow/rules/gafpack.smk
+++ b/cosigt_smk/workflow/rules/gafpack.smk
@@ -23,5 +23,6 @@ rule gafpack_coverage:
 		gafpack \
 		--gfa {input.gfa} \
 		--gaf {input.gaf} \
-		--len-scale --weight-queries | gzip > {output}
+		--len-scale \
+		--weight-queries | gzip > {output}
 	'''
diff --git a/cosigt_smk/workflow/rules/impg.smk b/cosigt_smk/workflow/rules/impg.smk
index b802489..adfa3a5 100644
--- a/cosigt_smk/workflow/rules/impg.smk
+++ b/cosigt_smk/workflow/rules/impg.smk
@@ -23,6 +23,7 @@
 	shell:
 		'''
 		impg \
+		query \
 		-p {input.paf} \
 		-b {input.bed} | \
 		grep -v \
diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk
index a97cd03..86a0d22 100644
--- a/cosigt_smk/workflow/rules/odgi.smk
+++ b/cosigt_smk/workflow/rules/odgi.smk
@@ -78,14 +78,14 @@ rule odgi_paths_matrix:
 		cut -f 1,4- | gzip > {output}
 	'''
 
-rule odgi_view_len:
+rule odgi_view_node_length:
 	'''
 	https://github.com/pangenome/odgi
 	'''
 	input:
 		rules.odgi_view.output
 	output:
-		config['output'] + '/odgi/view/{region}.len.tsv'
+		config['output'] + '/odgi/view/{region}.node.length.tsv'
 	threads:
 		1
 	resources:
@@ -96,38 +96,13 @@
 		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
 		time=lambda wildcards, attempt: attempt * config['default']['time']
 	conda:
 		'../envs/odgi.yaml'
 	benchmark:
-		'benchmarks/{region}.odgi_view_len.benchmark.txt'
+		'benchmarks/{region}.odgi_view_node_length.benchmark.txt'
 	shell:
 		'''
 		grep '^S' {input} | \
 		awk '{{print("node."$2,length($3))}}' OFS="\\t" > {output}
 		'''
 
-rule filter_odgi_matrix:
-	'''
-	https://github.com/davidebolo1993/cosigt
-	'''
-	input:
-		coverage=rules.odgi_chop.output,
-		size=rules.odgi_view_len.output
-	output:
-		config['output'] + '/odgi/paths/matrix_flt/{region}.tsv.gz'
-	threads:
-		1
-	resources:
-		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
-		time=lambda wildcards, attempt: attempt * config['default']['time']
-	container:
-		'docker://davidebolo1993/cosigt_workflow:latest'
-	#conda:
-	#'../envs/odgi.yaml'
-	benchmark:
-		'benchmarks/{region}.filter_odgi_matrix.benchmark.txt'
-	shell:
-		'''
-		flt {input.coverage} {input.size} | gzip > {output}
-		'''
-
 rule odgi_similarity:
 	'''
 	https://github.com/pangenome/odgi
diff --git a/cosigt_smk/workflow/scripts/cluster2.r b/cosigt_smk/workflow/scripts/cluster2.r
deleted file mode 100644
index 02498e4..0000000
--- a/cosigt_smk/workflow/scripts/cluster2.r
+++ /dev/null
@@ -1,51 +0,0 @@
-library(data.table)
-library(dbscan)
-library(rjson)
-library(reshape2)
-library(reshape2)
-library(NbClust)
-
-input_file<-"filt.tsv.gz"
-df<-fread(input_file)
-
-for (d in c("euclidean.dist","jaccard.dist","cosine.dissim","manhattan.dist")) {
-
-  regularMatrix <- acast(df, group.a ~ group.b, value.var = d)
-  distanceMatrix<-as.dist(regularMatrix)
-  pdf(paste0("knn.",d,".pdf"))
-  kNNdistplot(distanceMatrix,k=2)
-  dev.off()
-  kNN_distances <- kNNdist(distanceMatrix, k = 2)
-  sorted_kNN <- sort(kNN_distances)
-  first_derivative <- diff(sorted_kNN)
-  # Step 2: Compute the second derivative
-  second_derivative <- diff(first_derivative)
-  # Step 3: Identify the index with the maximum second derivative
-  optimal_index <- which.max(second_derivative)
-  # Step 4: Retrieve the corresponding `eps` value
-  optimal_eps <- sorted_kNN[optimal_index + 1] # +1 d
-  db<-dbscan(distanceMatrix,minPts=3, eps=4.3)
-  cl<-db$cluster
-  names(cl)<-labels(distanceMatrix)
-  res.list <- lapply(split(cl, names(cl)), unname)
-  named_res <- lapply(cl, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
-  jout <- toJSON(named_res)
-  # Write JSON output
-  output_file<-paste0("dbscan.",d,".json")
-  write(jout, output_file)
-
-
-  max_cluster <- round(length(unique(df$group.a)) / 5) ##control
-  res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette",
-    distance = NULL, max.nc = max_cluster)$Best.partition
-
-  # Format results
-  res.list <- lapply(split(res, names(res)), unname)
-  named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
-  jout <- toJSON(named_res)
-
-  # Write JSON output
-  output_file<-paste0("agglomerative.",d,".json")
-  write(jout, output_file)
-
-}
\ No newline at end of file
diff --git a/cosigt_smk/workflow/scripts/filt.go b/cosigt_smk/workflow/scripts/filt.go
deleted file mode 100644
index 040b6a4..0000000
--- a/cosigt_smk/workflow/scripts/filt.go
+++ /dev/null
@@ -1,353 +0,0 @@
-package main
-
-import (
-	"bufio"
-	"compress/gzip"
-	"fmt"
-	"io"
-	"log"
-	"math"
-	"os"
-	"sort"
-	"strconv"
-	"strings"
-	"sync"
-)
-
-// Result represents the distance metrics between two groups
-type Result struct {
-	GroupA        string
-	GroupB        string
-	EuclideanDist float64
-	JaccardDist   float64
-	CosineDissim  float64
-	ManhattanDist float64
-}
-
-func main() {
-	// Configure logging to write to stderr
-	log.SetOutput(os.Stderr)
-
-	// Validate command-line arguments
-	if len(os.Args) < 3 {
-		log.Fatalf("Usage: %s ", os.Args[0])
-	}
-
-	inputFile := os.Args[1]
-	nodeLengthFile := os.Args[2]
-
-	// Read input data
-	df, err := readGzippedTSV(inputFile)
-	if err != nil {
-		log.Fatalf("Error reading input file: %v", err)
-	}
-
-	// Read node lengths
-	nodelengths, err := readNodeLengths(nodeLengthFile)
-	if err != nil {
-		log.Fatalf("Error reading node length file: %v", err)
-	}
-
-	// Filter nodes based on coverage differences
-	filtered := filterNodes(df, nodelengths)
-
-	// Generate group combinations
-	combinations := generateCombinations(filtered["path.name"])
-
-	// Compute distances concurrently
-	results := computeDistances(filtered, combinations)
-
-	// Track printed groups to add self-combinations
-	printedGroups := make(map[string]bool)
-
-	// Print header
-	fmt.Println("group.a\tgroup.b\teuclidean.dist\tjaccard.dist\tcosine.dissim\tmanhattan.dist")
-
-	// Print results with self-combinations
-	for _, res := range results {
-		// Add self-combinations with zero distances
-		for _, group := range []string{res.GroupA, res.GroupB} {
-			if !printedGroups[group] {
-				log.Printf("Adding self-combination for group: %s", group)
-				fmt.Printf("%s\t%s\t0.0000\t0.0000\t0.0000\t0.0000\n", group, group)
-				printedGroups[group] = true
-			}
-		}
-
-		// Print group pair results in both directions
-		fmt.Printf("%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n", res.GroupA, res.GroupB, res.EuclideanDist, res.JaccardDist, res.CosineDissim, res.ManhattanDist)
-		fmt.Printf("%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n", res.GroupB, res.GroupA, res.EuclideanDist, res.JaccardDist, res.CosineDissim, res.ManhattanDist)
-	}
-}
-
-// readGzippedTSV reads a gzipped TSV file into a map of string slices
-func readGzippedTSV(filename string) (map[string][]string, error) {
-	file, err := os.Open(filename)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
-
-	gzReader, err := gzip.NewReader(file)
-	if err != nil {
-		return nil, err
-	}
-	defer gzReader.Close()
-
-	reader := bufio.NewReader(gzReader)
-
-	var headers []string
-	data := make(map[string][]string)
-	lineNum := 0
-
-	for {
-		line, err := reader.ReadString('\n')
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			return nil, err
-		}
-
-		line = strings.TrimSpace(line)
-		columns := strings.Split(line, "\t")
-
-		if lineNum == 0 {
-			// Capture headers
-			headers = columns
-			for _, header := range headers {
-				data[header] = []string{}
-			}
-		} else {
-			// Store data
-			for i, col := range columns {
-				data[headers[i]] = append(data[headers[i]], col)
-			}
-		}
-		lineNum++
-	}
-
-	return data, nil
-}
-
-// readNodeLengths reads node length file and returns a set of valid nodes
-func readNodeLengths(filename string) (map[string]bool, error) {
-	file, err := os.Open(filename)
-	if err != nil {
-		return nil, fmt.Errorf("failed to open node length file: %w", err)
-	}
-	defer file.Close()
-
-	reader := bufio.NewReader(file)
-	data := make(map[string]bool)
-
-	for {
-		line, err := reader.ReadString('\n')
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			return nil, fmt.Errorf("error reading node length line: %w", err)
-		}
-
-		columns := strings.Fields(line)
-		if len(columns) < 2 {
-			log.Printf("Skipping invalid line: %s", line)
-			continue
-		}
-
-		nodeName := columns[0]
-		length, err := strconv.Atoi(columns[1])
-		if err != nil {
-			log.Printf("Invalid length for node %s: %v", nodeName, err)
-			continue
-		}
-
-		if length > 1 {
-			data[nodeName] = true
-			log.Printf("Valid node: %s (length: %d)", nodeName, length)
-		}
-	}
-
-	return data, nil
-}
-
-// filterNodes filters nodes based on differences in coverage and node lengths
-func filterNodes(data map[string][]string, validNodes map[string]bool) map[string][]string {
-	filtered := make(map[string][]string)
-	filtered["path.name"] = data["path.name"]
-
-	for id, values := range data {
-		if id == "path.name" {
-			continue
-		}
-
-		// Check node length validity
-		if _, ok := validNodes[id]; !ok {
-			log.Printf("Discarded node %s: too small", id)
-			continue
-		}
-
-		// Check for differences in the column
-		var diffExists bool
-		for i := 1; i < len(values); i++ {
-			if values[i] != values[i-1] {
-				diffExists = true
-				break
-			}
-		}
-
-		if diffExists {
-			filtered[id] = values
-		} else {
-			log.Printf("Discarded node %s: no difference", id)
-		}
-	}
-
-	return filtered
-}
-
-// generateCombinations creates all unique group pair combinations
-func generateCombinations(groups []string) [][2]string {
-	var combinations [][2]string
-	for i := 0; i < len(groups); i++ {
-		for j := i + 1; j < len(groups); j++ {
-			combinations = append(combinations, [2]string{groups[i], groups[j]})
-		}
-	}
-	return combinations
-}
-
-// computeDistances calculates various distance metrics concurrently
-func computeDistances(data map[string][]string, combinations [][2]string) []Result {
-	results := make([]Result, 0, len(combinations))
-	var mu sync.Mutex
-	var wg sync.WaitGroup
-
-	for _, comb := range combinations {
-		wg.Add(1)
-		go func(pair [2]string) {
-			defer wg.Done()
-
-			groupA, groupB := pair[0], pair[1]
-			h1 := extractValues(data, groupA)
-			h2 := extractValues(data, groupB)
-
-			// Parse values into floats
-			v1 := make([]float64, len(h1))
-			v2 := make([]float64, len(h2))
-			for i := range h1 {
-				v1[i], _ = strconv.ParseFloat(h1[i], 64)
-				v2[i], _ = strconv.ParseFloat(h2[i], 64)
-			}
-
-			// Compute Euclidean distance
-			eucDist := computeEuclideanDistance(v1, v2)
-
-			// Compute Jaccard distance
-			jaccDist := computeJaccardDistance(h1, h2)
-
-			// Compute Cosine Dissimilarity
-			cosineDissim := computeCosineDissimilarity(v1, v2)
-
-			// Compute Manhattan distance
-			manhattanDist := computeManhattanDistance(v1, v2)
-
-			// Add result to the list
-			res := Result{
-				GroupA:        groupA,
-				GroupB:        groupB,
-				EuclideanDist: eucDist,
-				JaccardDist:   jaccDist,
-				CosineDissim:  cosineDissim,
-				ManhattanDist: manhattanDist,
-			}
-
-			mu.Lock()
-			results = append(results, res)
-			mu.Unlock()
-		}(comb)
-	}
-
-	wg.Wait()
-	return results
-}
-
-// extractValues retrieves values for a specific group
-func extractValues(data map[string][]string, group string) []string {
-	// Find the row index where "path.name" matches the group
-	rowIndex := -1
-	for i, name := range data["path.name"] {
-		if strings.TrimSpace(name) == strings.TrimSpace(group) {
-			rowIndex = i
-			break
-		}
-	}
-
-	if rowIndex == -1 {
-		log.Fatalf("Group %s not found in path.name", group)
-	}
-
-	// Extract and sort node columns
-	var nodeColumns []string
-	for key := range data {
-		if strings.HasPrefix(key, "node.") {
-			nodeColumns = append(nodeColumns, key)
-		}
-	}
-
-	sort.Slice(nodeColumns, func(i, j int) bool {
-		numI, _ := strconv.Atoi(strings.TrimPrefix(nodeColumns[i], "node."))
-		numJ, _ := strconv.Atoi(strings.TrimPrefix(nodeColumns[j], "node."))
-		return numI < numJ
-	})
-
-	// Extract values for sorted node columns
-	var values []string
-	for _, key := range nodeColumns {
-		values = append(values, data[key][rowIndex])
-	}
-
-	return values
-}
-
-// Computation helper functions
-func computeEuclideanDistance(v1, v2 []float64) float64 {
-	var sumSquared float64
-	for i := range v1 {
-		diff := v1[i] - v2[i]
-		sumSquared += diff * diff
-	}
-	return math.Sqrt(sumSquared)
-}
-
-func computeJaccardDistance(h1, h2 []string) float64 {
-	intersection := 0
-	union := len(h1) + len(h2)
-	for i := range h1 {
-		if h1[i] == h2[i] {
-			intersection++
-		}
-	}
-	return 1.0 - float64(intersection)/float64(union-intersection)
-}
-
-func computeCosineDissimilarity(v1, v2 []float64) float64 {
-	dotProduct, magA, magB := 0.0, 0.0, 0.0
-	for i := range v1 {
-		dotProduct += v1[i] * v2[i]
-		magA += v1[i] * v1[i]
-		magB += v2[i] * v2[i]
-	}
-	magA, magB = math.Sqrt(magA), math.Sqrt(magB)
-	cosineSim := dotProduct / (magA * magB)
-	return 1 - cosineSim
-}
-
-func computeManhattanDistance(v1, v2 []float64) float64 {
-	var sumAbsDiff float64
-	for i := range v1 {
-		sumAbsDiff += math.Abs(v1[i] - v2[i])
-	}
-	return sumAbsDiff
-}
\ No newline at end of file