From 4285d6656a5c0db3b38fb38211297dfce24a75d0 Mon Sep 17 00:00:00 2001 From: Nathaniel Pope Date: Tue, 5 Mar 2019 15:48:01 -0800 Subject: [PATCH 1/2] added mutual information via whichFst --- misc/realSFS.cpp | 18 +++++++++++++++++- misc/safstat.cpp | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/misc/realSFS.cpp b/misc/realSFS.cpp index 71f57e83..0fed309d 100644 --- a/misc/realSFS.cpp +++ b/misc/realSFS.cpp @@ -404,7 +404,23 @@ int fst_index(int argc,char **argv){ int fst(int argc,char**argv){ if(argc==0){ - fprintf(stderr,"\t-> Possible options: index print\n"); + fprintf(stderr,"realSFS fst [index|print|stats|stats2]\n"); + fprintf(stderr,"--------------------------------------\n"); + fprintf(stderr,"\tindex [.saf.idx .saf.idx ...] -sfs [.sfs] -fstout [] -whichFst 0\n"); + fprintf(stderr,"\t\t-sfs \tFlattened pairwise 2d site frequency spectra (if more than two populations, use multiple times)\n"); + fprintf(stderr,"\t\t-fstout \tOutput prefix\n"); + fprintf(stderr,"\t\t-whichFst\t0: default, Fst estimator from Reynolds et al (1983) Genetics 3: 767-779\n"); + fprintf(stderr,"\t\t \t1: Fst estimator from Bhatia et al (2013) Genome Res 23: 1514-1521\n"); + fprintf(stderr,"\t\t \t2: mutual information/Shannon differentiation (not an Fst estimator)\n"); + fprintf(stderr,"\tprint [.fst.idx]\n"); + fprintf(stderr,"\tstats [.fst.idx]\n"); + fprintf(stderr,"\tstats2 [.fst.idx]\n"); + fprintf(stderr,"\t\t-win \tWindow size\n"); + fprintf(stderr,"\t\t-step \tStep size, e.g. shift in bp from one window to next\n"); + fprintf(stderr,"\t\t-type \t0: default, split genome into blocks and use the first window that has data for the entire window\n"); + fprintf(stderr,"\t\t \t1: use first position with data as leftmost position of first window\n"); + fprintf(stderr,"\t\t \t2: use first coordinate as leftmost position of first window, regardless of whether there is data\n"); + fprintf(stderr,"Examples and output formats at www.popgen.dk/angsd/index.php/Fst\n"); return 0; } if(!strcasecmp(*argv,"index")) diff --git a/misc/safstat.cpp b/misc/safstat.cpp index 0df6d3dc..9f25a754 100644 --- a/misc/safstat.cpp +++ b/misc/safstat.cpp @@ -72,11 +72,52 @@ void bhatiaFst(int sfs1,int sfs2,double **aMat,double **bMat){ at++; } } +//nspope; mutual information across 2d sfs bins +//the mutual information is saved in aMat +//the maximum possible mutual information is saved in bMat +//thus fst_stat calculates the normalized mutual information (shannon differentiation) which is bounded [0,1] +//note that mutual information isn't fst, although the "labels" from fst_stat will say fst +void mutualInf(int sfs1,int sfs2,double **aMat,double **bMat){ + fprintf(stderr,"\t-> [%s] sfs1:%d sfs2:%d dimspace:%d \n",__FUNCTION__,sfs1,sfs2,(sfs1+1)*(sfs2+1)); + fprintf(stderr,"\t-> WARNING! calculating mutual information instead of FST, pbs statistics will be meaningless"); + *aMat = new double[(sfs1+1)*(sfs2+1)]; + *bMat = new double[(sfs1+1)*(sfs2+1)]; + double rat = static_cast(sfs1)/static_cast(sfs1+sfs2); + double K = -rat * log(rat) - (1.-rat) * log(1.-rat); + int at=0; + for(int a1=0;a1<=sfs1;a1++) + for(int a2=0;a2<=sfs2;a2++){ + double p1 = static_cast(a1); + double p2 = static_cast(a2); + double q1 = static_cast(sfs1-a1); + double q2 = static_cast(sfs2-a2); + double z1 = static_cast(a1+a2); + double z2 = static_cast(sfs1+sfs2)-z1; + double f11 = z1*rat; + double f12 = z2*rat; + double f21 = z1*(1.-rat); + double f22 = z2*(1.-rat); + double lrt = 0.; + if (p1*f11 > 0.0) lrt += p1*log(p1/f11); + if (p2*f21 > 0.0) lrt += p2*log(p2/f21); + if (q1*f12 > 0.0) lrt += q1*log(q1/f12); + if (q2*f22 > 0.0) lrt += q2*log(q2/f22); + lrt /= (z2 + z1); + + (*aMat)[at] = lrt; + (*bMat)[at] = K; + + //fprintf(stderr,"(%d,%d) al:%f bal:%f\n",a1,a2,(*aMat)[at],(*bMat)[at]); + at++; + } +} void calcCoef(int sfs1,int sfs2,double **aMat,double **bMat,int whichFst){ if(whichFst==0) reynoldFst(sfs1,sfs2,aMat,bMat); else if(whichFst==1) bhatiaFst(sfs1,sfs2,aMat,bMat); + else if(whichFst==2)//nspope + mutualInf(sfs1,sfs2,aMat,bMat); else{ fprintf(stderr,"\t-> Fst option is not implemented\n"); exit(0); @@ -171,7 +212,7 @@ int fst_print(int argc,char **argv){ for(size_t s=first;sfirst,ppos[s]+1); for(int i=0;inames.size(),2);i++) - fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); + fprintf(stdout,"\t%.7g\t%.7g",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); } for(int i=0;inames.size(),2);i++){ From 8c9525b17787dae336826b32001112475d60d731 Mon Sep 17 00:00:00 2001 From: Nathaniel Pope Date: Tue, 5 Mar 2019 21:52:54 -0800 Subject: [PATCH 2/2] fing newlines --- misc/safstat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/safstat.cpp b/misc/safstat.cpp index 9f25a754..df8b2f5c 100644 --- a/misc/safstat.cpp +++ b/misc/safstat.cpp @@ -79,7 +79,7 @@ void bhatiaFst(int sfs1,int sfs2,double **aMat,double **bMat){ //note that mutual information isn't fst, although the "labels" from fst_stat will say fst void mutualInf(int sfs1,int sfs2,double **aMat,double **bMat){ fprintf(stderr,"\t-> [%s] sfs1:%d sfs2:%d dimspace:%d \n",__FUNCTION__,sfs1,sfs2,(sfs1+1)*(sfs2+1)); - fprintf(stderr,"\t-> WARNING! calculating mutual information instead of FST, pbs statistics will be meaningless"); + fprintf(stderr,"\t-> WARNING! calculating mutual information instead of FST, pbs statistics will be meaningless\n"); *aMat = new double[(sfs1+1)*(sfs2+1)]; *bMat = new double[(sfs1+1)*(sfs2+1)]; double rat = static_cast(sfs1)/static_cast(sfs1+sfs2);