diff --git a/CHANGES.md b/CHANGES.md index 9852536..76ea096 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,8 @@ v0.2.11 (dev) ============= + more informative error message on bad sample name (#53) + allow setting SOMALIER_AB_HOM_CUTOFF to change which calls are considered hom-ref (#56) ++ adjust (fix) relatedness calculation which was off when the number of shared sites was low (#55). + many thanks to @fgvieira who found and diagnosed this problem. v0.2.10 ======= diff --git a/src/somalierpkg/bitset.nim b/src/somalierpkg/bitset.nim index 9e8fe72..12f3321 100644 --- a/src/somalierpkg/bitset.nim +++ b/src/somalierpkg/bitset.nim @@ -15,12 +15,13 @@ proc set*(b:var bitset, i:SomeInteger) {.inline.} = b[idx] = b[idx] or (1'u64 shl (i mod 64)) type genotypes* = tuple[hom_ref:bitset, het:bitset, hom_alt:bitset] -type IBSResult* = tuple[IBS0: int32, IBS2:int32, N:int32, shared_hets:int32, shared_hom_alts:int32] +type IBSResult* = tuple[IBS0: int32, IBS2:int32, N:int32, shared_hets:int32, shared_hom_alts:int32, het_ab:int32] proc IBS*(a: genotypes, b: genotypes): IBSResult = doAssert a.hom_ref.len == b.hom_ref.len var sh_het:uint64 # shared het var sh_ha:uint64 # shared hom-alt + var n:uint64 for k in 0..a.hom_ref.high: result.IBS0 += countSetBits((a.hom_ref[k] and b.hom_alt[k]) or (a.hom_alt[k] and b.hom_ref[k])).int32 sh_het = a.het[k] and b.het[k] @@ -28,7 +29,14 @@ proc IBS*(a: genotypes, b: genotypes): IBSResult = result.IBS2 += countSetBits(sh_ha or sh_het or (a.hom_ref[k] and b.hom_ref[k])).int32 result.shared_hets += countSetBits(sh_het).int32 result.shared_hom_alts += countSetBits(sh_ha).int32 - result.N += countSetBits((a.hom_ref[k] or a.het[k] or a.hom_alt[k]) and (b.hom_ref[k] or b.het[k] or b.hom_alt[k])).int32 + n = (a.hom_ref[k] or a.het[k] or a.hom_alt[k]) and (b.hom_ref[k] or b.het[k] or b.hom_alt[k]) + # het_ab is count of sites that are het in either sample and not unknown in + # the other sample. see equation 9 and 7 from king paper (PMID: 20926424) where they + # state: + # where NAa,Aa, NAa(i) and NAa(j) are the total numbers of SNPs at which both individuals of the pair are heterozygous, and the total number of heterozygotes for the i-th and j-th individual, respectively, excluding those SNPs with missing genotypes in either individual of the pair. + # Here, NAa(i) and NAa(j) are a.het[k] and b.het[k]. + result.het_ab += (countSetBits(b.het[k] and n) + countSetBits(a.het[k] and n)).int32 + result.N += countSetBits(n).int32 proc XIBS*(a: genotypes, b: genotypes): IBSResult = doAssert a.hom_ref.len == b.hom_ref.len diff --git a/src/somalierpkg/relate.nim b/src/somalierpkg/relate.nim index 6e83257..51e59cd 100644 --- a/src/somalierpkg/relate.nim +++ b/src/somalierpkg/relate.nim @@ -57,6 +57,7 @@ type relation = object hom_alts_b: uint16 shared_hom_alts: uint16 shared_hets: uint16 + het_ab: uint16 ibs0: uint16 ibs2: uint16 x_ibs0: uint16 @@ -85,7 +86,7 @@ proc hom_alt_concordance(r: relation): float64 {.inline.} = return (r.shared_hom_alts.float64 - 2 * r.ibs0.float64) / max(1'u16, min(r.hom_alts_a, r.hom_alts_b)).float64 proc rel(r:relation): float64 {.inline.} = - return (r.shared_hets.float64 - 2 * r.ibs0.float64) / min(r.hets_a, r.hets_b).float64 + return 2 * (r.shared_hets.float64 - 2 * r.ibs0.float64) / r.het_ab.float64 proc add*(rt:var seq[relations], rel:relation, expected_relatedness:float) = @@ -247,6 +248,7 @@ proc relatedness(r: var relation_matrices, j: int, k:int): relation {.inline.} = r.n[j * r.n_samples + k] = ir.N.uint16 r.n[k * r.n_samples + j] = ir.IBS2.uint16 r.shared_hom_alts[j * r.n_samples + k] = ir.shared_hom_alts.uint16 + r.shared_hom_alts[k * r.n_samples + j] = ir.het_ab.uint16 let xir = r.x_genotypes[j].XIBS(r.x_genotypes[k]) r.x[j * r.n_samples + k] = xir.IBS0.uint16 @@ -261,6 +263,7 @@ proc relatedness(r: var relation_matrices, j: int, k:int): relation {.inline.} = shared_hom_alts: ir.shared_hom_alts.uint16, ibs2: ir.IBS2.uint16, n: ir.N.uint16, + het_ab: ir.het_ab.uint16, x_ibs0: xir.IBS0.uint16, x_ibs2: xir.IBS2.uint16, )