seqimpSummarise

#!/bin/bash
#	Tommaso Leonardi, 2016
#	Reads a SeqIMP description file and produces
#	a properly formatted summary table of counts.

# Check proper number of arguments: exactly two are required, the analysis
# folder and the description file.  The usage text is a diagnostic, so it is
# written to stderr (stdout is reserved for the summary table) and the script
# exits non-zero so that callers can detect the misuse.  The script name is
# taken from $0 so the usage line matches however the script was invoked.
if (( $# != 2 )); then
	cat >&2 <<EOF

Usage: ${0##*/} analysis_folder description.txt
	analysis folder: path to the analysis folder created by SequenceImp
	description.txt: description file submitted to SequenceImp

EOF
	exit 1
fi

# Check if the analysis directory exists.
# The argument is quoted so that an empty value or a path containing blanks is
# really tested: unquoted, `[ ! -d ]` is false for an empty argument and a
# path with blanks is a syntax error, and both would let the script carry on.
# Diagnostics go to stderr because stdout carries the summary table.
if [[ ! -d "$1" ]]; then
	printf '%s\n' "$1: doesn't exist or is not a directory" >&2
	printf '%s\n' "Please provide a valid path to the analysis directory" >&2
	exit 1
fi
ANALYSIS_DIR=$1

# Check if the description file exists.
# The argument is quoted so that an empty value or a path containing blanks is
# really tested instead of slipping through the check.  Diagnostics go to
# stderr because stdout carries the summary table.
if [[ ! -f "$2" ]]; then
	printf '%s\n' "$2: No such file" >&2
	printf '%s\n' "Please provide a valid description file" >&2
	exit 1
fi
DESCRIPTION_FILE=$2

# Load file names in array: FILES receives the first whitespace-separated
# field of every non-blank line of the description file, skipping its first
# (header) line.  The names are read one per line instead of through an
# unquoted command substitution, so a name containing glob characters is kept
# literally rather than being expanded against the current directory.
FILES=()
while IFS= read -r sample_name; do
	FILES+=("$sample_name")
done < <(awk 'NR > 1 && NF > 0 { print $1 }' "$DESCRIPTION_FILE")

# Check that all count files exist.
#
# require_count_files ANALYSIS_DIR DESCRIPTION_FILE SAMPLE...
#   Verifies that ANALYSIS_DIR/SAMPLE/miRNA_ANALYSIS/SAMPLE.mature.counts.txt
#   is a regular file for every SAMPLE.  On the first missing file it prints
#   a diagnostic to stderr (stdout is reserved for the summary table) and
#   returns 1; DESCRIPTION_FILE is only used in that message.  Returns 0 when
#   every file exists, including when no sample is given.
#   All paths are quoted so directories containing blanks are handled.
require_count_files() {
	local analysis_dir=$1 description=$2
	shift 2
	local sample counts
	for sample in "$@"; do
		counts="$analysis_dir/$sample/miRNA_ANALYSIS/$sample.mature.counts.txt"
		if [[ ! -f "$counts" ]]; then
			printf '%s\n' "$counts: No such file" >&2
			printf '%s\n' "Please make sure that all files described in $description have a corresponding count files" >&2
			return 1
		fi
	done
	return 0
}

require_count_files "$ANALYSIS_DIR" "$DESCRIPTION_FILE" "${FILES[@]}" || exit 1


# Scratch files used while merging: TMP holds the table merged so far and
# TMP2 receives the result of each further join.  Both start empty so that
# cleanup can tell which ones were actually created.
TMP=
TMP2=

# Remove whichever scratch files have been created.  Installed as the EXIT
# trap before the files are made, so nothing is left behind even if creating
# the second file fails.
cleanup() {
	if [[ -n "$TMP" ]]; then
		rm -f -- "$TMP"
	fi
	if [[ -n "$TMP2" ]]; then
		rm -f -- "$TMP2"
	fi
	return 0
}
trap cleanup EXIT

# Stop right away if a scratch file cannot be created, rather than carrying
# on with an empty file name.
TMP="$(mktemp -p /tmp/)" || {
	echo "Could not create a temporary file in /tmp" >&2
	exit 1
}
TMP2="$(mktemp -p /tmp/)" || {
	echo "Could not create a temporary file in /tmp" >&2
	exit 1
}

# Join the count files of all samples into TMP.
#
# merge_count_tables ANALYSIS_DIR MERGED SCRATCH SAMPLE...
#   Merges ANALYSIS_DIR/SAMPLE/miRNA_ANALYSIS/SAMPLE.mature.counts.txt of
#   every SAMPLE, in the order given, into the file MERGED.  The files are
#   joined on their 4th tab-separated column; in MERGED that key is column 1,
#   followed by the remaining columns of each sample in turn.  Rows of the
#   table built so far that have no partner in a later sample are kept
#   (join -a 1), so they end up with fewer columns.  SCRATCH is a work file
#   that is moved over MERGED after every step.
#   Returns 0 on success (with no SAMPLE it only warns and leaves MERGED
#   untouched) and 1 if a count file is unreadable or join/mv fails.
merge_count_tables() {
	local analysis_dir=$1 merged=$2 scratch=$3
	shift 3
	if (( $# == 0 )); then
		echo "Warning: the description file lists no samples, nothing to merge" >&2
		return 0
	fi

	local sample counts seeded=0
	for sample in "$@"; do
		counts="$analysis_dir/$sample/miRNA_ANALYSIS/$sample.mature.counts.txt"
		if [[ ! -r "$counts" ]]; then
			printf '%s\n' "$counts: cannot be read" >&2
			return 1
		fi
		# sort is given the same tab delimiter as join: with the default
		# blank-based splitting an empty or blank-containing column shifts
		# the sort key away from the field join compares.
		if (( ! seeded )); then
			# Joining the first file with an empty input leaves every row
			# unpaired, which just moves the key to column 1.  This yields the
			# same layout as joining the first two files directly and also
			# makes a single sample work.
			join -t $'\t' -1 4 -2 4 -a 1 \
				<(sort -t $'\t' -k4,4 -- "$counts") /dev/null > "$merged" || return 1
			seeded=1
		else
			join -t $'\t' -1 1 -2 4 -a 1 \
				"$merged" <(sort -t $'\t' -k4,4 -- "$counts") > "$scratch" || return 1
			mv -- "$scratch" "$merged" || return 1
		fi
	done
	return 0
}

merge_count_tables "$ANALYSIS_DIR" "$TMP" "$TMP2" "${FILES[@]}" || {
	echo "Could not merge the count files" >&2
	exit 1
}

# Do some basic error checking on the merged table.
#
# check_matching_rows TABLE NSAMPLES
#   For every row of the tab-separated TABLE, compares column 2 with columns
#   8, 14, ... (one every 6 columns, below NSAMPLES*6+1), i.e. with the same
#   annotation column contributed by each further sample.  On the first row
#   where they differ it reports the row number on stderr and returns
#   non-zero; otherwise it returns 0.
#   NOTE(review): the original comment called these the 'precursor' columns;
#   confirm which annotation column 2 actually holds.
check_matching_rows() {
	local table=$1 nsamples=$2
	# With fewer than two samples there is no further column to compare.
	(( nsamples > 1 )) || return 0
	awk -v nfiles="$nsamples" '
		BEGIN { OFS = FS = "\t" }
		{
			reference = $2
			for (i = 8; i < nfiles * 6 + 1; i += 6) {
				if ($i != reference) {
					printf("There was an error with matching lines: %s\n", NR) | "cat >&2"
					exit 1
				}
			}
		}
	' "$table"
}

# The failure notice goes to stderr: stdout is reserved for the summary table.
if ! check_matching_rows "$TMP" "${#FILES[@]}"; then
	echo "Exiting" >&2
	exit 1
fi

# Print only the interesting columns with an appropriate header.
#
# print_summary_table TABLE SAMPLE...
#   Writes the summary to stdout: columns 1-3 of every row of the
#   tab-separated TABLE followed, for each SAMPLE, by the pair of columns
#   6/7, 12/13, ... (labelled <SAMPLE>_Mapped_Reads and <SAMPLE>_Unique_Reads
#   in the header).
#   The header row of TABLE (column 1 equal to "Unique_ID") was sorted along
#   with the data, so it can sit anywhere in TABLE.  TABLE is therefore read
#   twice: the first pass only emits the header if such a row exists, the
#   second pass emits the data rows in table order, so the header always
#   comes first.  TABLE must be a regular file for this to work.
#   With no SAMPLE nothing is printed.
print_summary_table() {
	local table=$1
	shift
	(( $# > 0 )) || return 0
	# Sample names are handed to awk as one space-separated string.
	local IFS=' '
	awk -v FILES="$*" '
		BEGIN { OFS = FS = "\t"; NFILES = split(FILES, F, " ") }

		# First pass: look for the header row and print the new header once.
		NR == FNR {
			if ($1 == "Unique_ID" && !header_printed) {
				printf("Unique_ID\tMature\tPrecursor")
				for (i = 1; i <= NFILES; i++) {
					printf("\t%s_Mapped_Reads\t%s_Unique_Reads", F[i], F[i])
				}
				printf("\n")
				header_printed = 1
			}
			next
		}

		# Second pass: print the data rows.
		$1 != "Unique_ID" {
			printf("%s\t%s\t%s", $1, $2, $3)
			for (i = 6; i < NFILES * 6 + 1; i += 6) {
				printf("\t%s\t%s", $i, $(i + 1))
			}
			printf("\n")
		}
	' "$table" "$table"
}

print_summary_table "$TMP" "${FILES[@]}"