From 4fcea9c6e8135e4736c6b8b7d0194ed325111d4d Mon Sep 17 00:00:00 2001
From: Akash Bahai
Date: Mon, 28 Aug 2023 17:51:54 +0800
Subject: [PATCH 1/3] Process MSA to remove non-standard bases

---
Review notes (text here is not part of the commit message):
* This series was restored from a copy whose line breaks and runs of
  whitespace had been collapsed. Context lines are shown with single
  spaces, so `git apply --ignore-whitespace` may be needed.
* The From: header carries no e-mail address, which `git am` requires.

 run_RF2NA.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/run_RF2NA.sh b/run_RF2NA.sh
index add3c8d..16429bd 100755
--- a/run_RF2NA.sh
+++ b/run_RF2NA.sh
@@ -16,8 +16,8 @@ SCRIPT=`realpath -s $0`
 export PIPEDIR=`dirname $SCRIPT`
 HHDB="$PIPEDIR/pdb100_2021Mar03/pdb100_2021Mar03"
 
-CPU="8" # number of CPUs to use
-MEM="64" # max memory (in GB)
+CPU="128" # number of CPUs to use
+MEM="500" # max memory (in GB)
 
 WDIR=`realpath -s $1` # working folder
 mkdir -p $WDIR/log
@@ -106,6 +106,12 @@ do
     fi
 done
 
+############################################################
+# Clean RNA msa from non-standard bases
+############################################################
+echo "CLeaning RNA MSA from non-standard bases"
+$PIPEDIR/process_msa_rf.sh $WDIR/$tag.afa
+
 ############################################################
 # Merge MSAs based on taxonomy ID
 ############################################################
From cc6deb0dcc968536a07f104d1f33965204c311a5 Mon Sep 17 00:00:00 2001
From: Akash Bahai
Date: Mon, 28 Aug 2023 17:53:19 +0800
Subject: [PATCH 2/3] Bashscript to process and clean the MSA

---
Changes against the script as first submitted:
* File mode is 100755: run_RF2NA.sh executes the script directly, which a
  100644 file does not permit.
* The temporary file is created empty up front. An alignment in which every
  record is rejected no longer loses the input file on the final mv, and a
  stale *_temp.afa is no longer appended to.
* The record check lives in one helper (flush_record) and uses a single
  pattern substitution instead of a per-character loop.
* A last line without trailing newline and CRLF line endings are handled.
* Usage and error messages go to stderr; usage names the expected argument.
* The blob id was dropped from the diff header because the content was
  edited by hand.

 process_msa_rf.sh | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100755 process_msa_rf.sh

diff --git a/process_msa_rf.sh b/process_msa_rf.sh
new file mode 100755
--- /dev/null
+++ b/process_msa_rf.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# process_msa_rf.sh - drop MSA records that contain non-standard bases.
+#
+# Usage: process_msa_rf.sh <input.afa>
+#
+# Reads a FASTA-style alignment (header lines start with '>', sequence text
+# may span several lines) and keeps only the records whose sequence is made
+# up solely of A, U, G, C, N and '-'. The input file is rewritten in place;
+# next to it the script leaves:
+#   <name>_old.afa  the original, unfiltered alignment
+#   <name>_log.txt  every rejected record followed by a summary line
+
+# Log and drop, or keep, one record.
+#   $1 - header line (starts with '>')
+#   $2 - sequence text with all of its lines already joined
+# Records with an empty header or an empty sequence are skipped silently.
+# Uses the global temp_output_file; rejected records go to stdout, which the
+# main code has pointed at the log file by the time this is called.
+function flush_record() {
+    local header=$1
+    local sequence=$2
+    local leftover
+
+    if [ -z "$header" ] || [ -z "$sequence" ]; then
+        return 0
+    fi
+
+    # Deleting every accepted character leaves only the offending ones, so a
+    # single expansion replaces a character-by-character scan of the sequence.
+    leftover=${sequence//[-AUGCN]/}
+    if [ -n "$leftover" ]; then
+        printf '%s\n' "Non-standard character '${leftover:0:1}' found in the sequence:"
+        printf '%s\n' "$header" "$sequence"
+        return 0
+    fi
+
+    printf '%s\n' "$header" "$sequence" >> "$temp_output_file"
+}
+
+# Exactly one argument is required: the alignment to clean
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <input.afa>" >&2
+    exit 1
+fi
+
+input_file="$1"
+
+# Check if the input file exists
+if [ ! -f "$input_file" ]; then
+    echo "Error: The input file '$input_file' does not exist." >&2
+    exit 1
+fi
+
+# All by-products live next to the input file. The extension is stripped from
+# the file name only, so a dot in a directory name cannot truncate the path.
+input_dir=$(dirname "$input_file")
+base_name=$(basename "$input_file")
+base_name=${base_name%.*}
+log_file="${input_dir}/${base_name}_log.txt"
+temp_output_file="${input_dir}/${base_name}_temp.afa"
+backup_file="${input_dir}/${base_name}_old.afa"
+
+# From here on everything written to stdout lands in the log file
+exec > "$log_file" || exit 1
+
+# Start from an empty temporary file: this discards leftovers of an interrupted
+# run and guarantees the file exists even when every record is rejected.
+: > "$temp_output_file" || exit 1
+
+current_header=""
+current_sequence=""
+
+# The "|| [ -n ... ]" part keeps a final line that lacks a trailing newline
+while IFS= read -r line || [ -n "$line" ]; do
+    line=${line%$'\r'}  # tolerate CRLF line endings
+    if [[ "$line" =~ ^\> ]]; then
+        # A new header closes the record collected so far
+        flush_record "$current_header" "$current_sequence"
+        current_header="$line"
+        current_sequence=""
+    else
+        # Accumulate sequence lines under the same header
+        current_sequence+="$line"
+    fi
+done < "$input_file"
+
+# The last record has no following header to trigger its flush
+flush_record "$current_header" "$current_sequence"
+
+if [ ! -s "$temp_output_file" ]; then
+    echo "Warning: no sequence in '$input_file' passed the filter." >&2
+fi
+
+# Keep the original as <name>_old.afa, then move the filtered alignment into
+# its place; if that second step fails, put the original back.
+mv -- "$input_file" "$backup_file" || exit 1
+if ! mv -- "$temp_output_file" "$input_file"; then
+    mv -- "$backup_file" "$input_file"
+    exit 1
+fi
+
+# Summary line (stdout is already the log file)
+echo "Processed $input_file. Removed sequences with non-standard bases and saved the results in $input_file."
From 395fd536d88ca9b0574c90ec5a066631f6d82d3a Mon Sep 17 00:00:00 2001
From: Akash Bahai
Date: Sat, 6 Apr 2024 09:21:29 +0200
Subject: [PATCH 3/3] Update run_RF2NA.sh

---
 run_RF2NA.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_RF2NA.sh b/run_RF2NA.sh
index 16429bd..a2b8e9a 100755
--- a/run_RF2NA.sh
+++ b/run_RF2NA.sh
@@ -16,8 +16,8 @@ SCRIPT=`realpath -s $0`
 export PIPEDIR=`dirname $SCRIPT`
 HHDB="$PIPEDIR/pdb100_2021Mar03/pdb100_2021Mar03"
 
-CPU="128" # number of CPUs to use
-MEM="500" # max memory (in GB)
+CPU="8" # number of CPUs to use
+MEM="64" # max memory (in GB)
 
 WDIR=`realpath -s $1` # working folder
 mkdir -p $WDIR/log