fix parselogs merge df

ggabernet · Oct 1, 2024 · 54a63f4 · 54a63f4
1 parent a8baf6a
commit 54a63f4
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,14 +10,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - [#334](https://github.com/nf-core/airrflow/pull/334) Added TRUST4 support.
 - [#344](https://github.com/nf-core/airrflow/pull/344) Added option to remove V and C primer region when sequence is unknown.
 - [#344](https://github.com/nf-core/airrflow/pull/344) Added option to save non-productive sequences.
+- [#344](https://github.com/nf-core/airrflow/pull/344) Allow for maskprimers align with UMI on both V gene and C gene primers.
 
 ### `Fixed`
 
 - [#344](https://github.com/nf-core/airrflow/pull/344) Avoid saving pRESTO intermediate fastq files in results directory.
 - [#344](https://github.com/nf-core/airrflow/pull/344) Simplified pRESTO Maskprimers score and Maskprimers extract processes.
 - [#344](https://github.com/nf-core/airrflow/pull/344) Fix clustersets approach, consider both reads clusters annotation.
 - [#344](https://github.com/nf-core/airrflow/pull/344) Allow for partial alignments in MakeDB for mouse TCR data, to bypass junction check for TRAV germlines with additional positions (e.g. 84A).
-- [#344](https://github.com/nf-core/airrflow/pull/344) Allow for maskprimers align with UMI on both V gene and C gene primers.
+- [#344](https://github.com/nf-core/airrflow/pull/344) Fix log parsing: merge the per-process dataframes with an outer join on the sample index (`pd.concat(..., join="outer")`), so that the parsed logs table is still produced when some samples fail.
+
 
 ### `Dependencies`
 

diff --git a/bin/log_parsing.py b/bin/log_parsing.py
@@ -244,7 +244,7 @@
             with open(logfile, "r") as f:
                 for line in f:
                     if "PASS>" in line:
-                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
+                        s_code.append(logfile.split("/")[1].split("_makedb_command_log")[0])
                         pass_blast.append(line.strip().removeprefix("PASS> "))
                     elif "FAIL>" in line:
                         fail_blast.append(line.strip().removeprefix("FAIL> "))
@@ -368,7 +368,6 @@
 # Getting table colnames
 
 colnames = [
-    "Sample",
     "Sequences",
     "Filtered_quality_R1",
     "Filtered_quality_R2",
@@ -385,23 +384,24 @@
 print(df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype"))
 
 values = [
-    df_process_list[2].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
-    df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
+    df_process_list[0].pivot(index="Sample", columns="readtype")["start"]["R1"],
+    df_process_list[0].pivot(index="Sample", columns="readtype")["pass"]["R1"],
+    df_process_list[0].pivot(index="Sample", columns="readtype")["pass"]["R2"],
+    df_process_list[1].pivot(index="Sample", columns="readtype")["pass"]["R1"],
+    df_process_list[1].pivot(index="Sample", columns="readtype")["pass"]["R2"],
+    df_process_list[2].set_index("Sample").loc[:, "pass_pairs"],
+    df_process_list[4].set_index("Sample").loc[:, "pass_pairs"],
+    df_process_list[5].set_index("Sample").loc[:, "pass_pairs"],
+    df_process_list[6].set_index("Sample").loc[:, "unique"],
+    df_process_list[7].set_index("Sample").loc[:, "repres_2"],
+    df_process_list[7].set_index("Sample").loc[:, "pass_igblast"],
 ]
 
+final_table = pd.concat(values, axis=1, join="outer")
+final_table.columns = colnames
+final_table = final_table.reset_index().rename(columns={"index": "Sample"})
+
 
-final_table = dict(zip(colnames, values))
-print(final_table)
 df_final_table = pd.DataFrame.from_dict(final_table)
 df_final_table = df_final_table.sort_values(["Sample"], ascending=[1])