From 4fcea9c6e8135e4736c6b8b7d0194ed325111d4d Mon Sep 17 00:00:00 2001
From: Akash Bahai <akashbahai@gmail.com>
Date: Mon, 28 Aug 2023 17:51:54 +0800
Subject: [PATCH 1/3] Process MSA to remove non-standard bases

---
 run_RF2NA.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/run_RF2NA.sh b/run_RF2NA.sh
index add3c8d..16429bd 100755
--- a/run_RF2NA.sh
+++ b/run_RF2NA.sh
@@ -16,8 +16,8 @@ SCRIPT=`realpath -s $0`
 export PIPEDIR=`dirname $SCRIPT`
 HHDB="$PIPEDIR/pdb100_2021Mar03/pdb100_2021Mar03"
 
-CPU="8"  # number of CPUs to use
-MEM="64" # max memory (in GB)
+CPU="128"  # number of CPUs to use
+MEM="500" # max memory (in GB)
 
 WDIR=`realpath -s $1`  # working folder
 mkdir -p $WDIR/log
@@ -106,6 +106,12 @@ do
     fi
 done
 
+############################################################
+# Clean RNA MSA of non-standard bases (NOTE(review): confirm $tag names the RNA input here)
+############################################################
+echo "Cleaning RNA MSA from non-standard bases"
+bash "$PIPEDIR/process_msa_rf.sh" "$WDIR/$tag.afa"  # helper is added with mode 100644, so run it via bash
+
 ############################################################
 # Merge MSAs based on taxonomy ID
 ############################################################

From cc6deb0dcc968536a07f104d1f33965204c311a5 Mon Sep 17 00:00:00 2001
From: Akash Bahai <akashbahai@gmail.com>
Date: Mon, 28 Aug 2023 17:53:19 +0800
Subject: [PATCH 2/3] Bash script to process and clean the MSA

---
 process_msa_rf.sh | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 process_msa_rf.sh

diff --git a/process_msa_rf.sh b/process_msa_rf.sh
new file mode 100644
index 0000000..a5712f2
--- /dev/null
+++ b/process_msa_rf.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+# Return 0 (true) when $1 is NOT one of A, U, G, C, N or the gap '-'; return 1 otherwise
+function is_non_standard_character() {
+    local char=$1
+    case "$char" in
+        A|U|G|C|-|N)
+            return 1 # Standard base, gap, or ambiguous base found
+            ;;
+        *)
+            return 0 # Anything else (e.g. T or a lowercase letter) is non-standard
+            ;;
+    esac
+}
+
+# Check the input file and handle errors
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <input_file.afa>" >&2
+    exit 1
+fi
+
+input_file="$1"
+
+# Check if the input file exists
+if [ ! -f "$input_file" ]; then
+    echo "Error: The input file '$input_file' does not exist." >&2
+    exit 1
+fi
+
+# Sibling paths derived from the input file (same directory, extension stripped):
+#   <name>_log.txt  - report of removed sequences
+#   <name>_temp.afa - filtered MSA while it is being built
+#   <name>_old.afa  - backup of the unfiltered input
+input_dir=$(dirname "$input_file")
+base_name=$(basename "${input_file%.*}")
+log_file="${input_dir}/${base_name}_log.txt"
+temp_output_file="${input_dir}/${base_name}_temp.afa"
+backup_file="${input_dir}/${base_name}_old.afa"
+
+# Start from an empty temporary file: records are appended below, so a stale
+# file left by an interrupted run would otherwise leak into the result, and
+# the final rename would fail if no sequence passed the filter.
+: > "$temp_output_file" || exit 1
+
+# Redirect all further stdout to the log file (truncating any previous log)
+exec > "$log_file"
+
+# Header and concatenated sequence lines of the record being read
+current_header=""
+current_sequence=""
+
+# Emit the buffered record: append it to the temporary file when every
+# character passes is_non_standard_character, otherwise report the first
+# offending character to the log. Records with an empty header or an empty
+# sequence are dropped silently.
+function flush_record() {
+    local i char
+    if [ -z "$current_header" ] || [ -z "$current_sequence" ]; then
+        return 0
+    fi
+    for (( i=0; i<${#current_sequence}; i++ )); do
+        char=${current_sequence:i:1}
+        if is_non_standard_character "$char"; then
+            echo "Non-standard character '$char' found in the sequence:"
+            printf '%s\n' "$current_header"
+            printf '%s\n' "$current_sequence"
+            return 0
+        fi
+    done
+    printf '%s\n%s\n' "$current_header" "$current_sequence" >> "$temp_output_file"
+}
+
+# Process the input file line by line; the `|| [ -n "$line" ]` clause keeps a
+# final line that is not terminated by a newline.
+while IFS= read -r line || [ -n "$line" ]; do
+    if [[ "$line" == ">"* ]]; then
+        # A new header closes the previous record
+        flush_record
+        current_header="$line"
+        current_sequence=""
+    else
+        # Accumulate sequence lines under the same header
+        current_sequence+="$line"
+    fi
+done < "$input_file"
+
+# Emit the last record after reaching the end of the file
+flush_record
+
+# An empty result is legal but almost certainly unwanted; make it visible
+if [ ! -s "$temp_output_file" ]; then
+    echo "Warning: no sequence passed the filter; $input_file will be empty."
+fi
+
+# Keep the unfiltered input as <name>_old.afa. Stop if the backup fails so the
+# original is never overwritten without a copy.
+if ! mv -- "$input_file" "$backup_file"; then
+    echo "Error: could not back up '$input_file' to '$backup_file'." >&2
+    exit 1
+fi
+
+# Move the filtered MSA into place under the original name
+if ! mv -- "$temp_output_file" "$input_file"; then
+    echo "Error: could not replace '$input_file' with the filtered MSA." >&2
+    exit 1
+fi
+
+# Provide a summary message in the log file (stdout already points there)
+echo "Processed $input_file. Removed sequences with non-standard bases and saved the results in $input_file."

From 395fd536d88ca9b0574c90ec5a066631f6d82d3a Mon Sep 17 00:00:00 2001
From: Akash Bahai <akashbahai@gmail.com>
Date: Sat, 6 Apr 2024 09:21:29 +0200
Subject: [PATCH 3/3] Update run_RF2NA.sh

---
 run_RF2NA.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_RF2NA.sh b/run_RF2NA.sh
index 16429bd..a2b8e9a 100755
--- a/run_RF2NA.sh
+++ b/run_RF2NA.sh
@@ -16,8 +16,8 @@ SCRIPT=`realpath -s $0`
 export PIPEDIR=`dirname $SCRIPT`
 HHDB="$PIPEDIR/pdb100_2021Mar03/pdb100_2021Mar03"
 
-CPU="128"  # number of CPUs to use
-MEM="500" # max memory (in GB)
+CPU="8"  # number of CPUs to use
+MEM="64" # max memory (in GB)
 
 WDIR=`realpath -s $1`  # working folder
 mkdir -p $WDIR/log