Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge dev into master #12

Merged
merged 4 commits into from
Oct 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ func alignParamCheck() error {
func runAlign() {
// set up profiling
if *profiling == true {
//defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
defer profile.Start(profile.ProfilePath("./")).Stop()
defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
//defer profile.Start(profile.ProfilePath("./")).Stop()
}
// start logging
logFH := misc.StartLogging(*logFile)
Expand Down
14 changes: 9 additions & 5 deletions cmd/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ var (
outDir *string // directory to save index files and log to
defaultOutDir = "./groot-index-" + string(time.Now().Format("20060102150405")) // a default dir to store the index files
containment *bool // use lshEnsemble instead of lshForest -- allows for variable read length
maxK *int // the maxK for LSH Ensemble (only active for --containment)
numPart *int // the number of partitions for LSH Ensemble (only active for --containment)
)

// the index command (used by cobra)
Expand All @@ -69,13 +71,15 @@ var indexCmd = &cobra.Command{

// a function to initialise the command line arguments
func init() {
kSize = indexCmd.Flags().IntP("kmerSize", "k", 7, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 128, "size of MinHash signature")
kSize = indexCmd.Flags().IntP("kmerSize", "k", 31, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 42, "size of MinHash signature")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "max length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded (note: this is used as a containment theshold when --containment set")
msaDir = indexCmd.Flags().StringP("msaDir", "i", "", "directory containing the clustered references (MSA files) - required")
outDir = indexCmd.PersistentFlags().StringP("outDir", "o", defaultOutDir, "directory to save index files to")
containment = indexCmd.Flags().BoolP("containment", "c", false, "use lshEnsemble instead of lshForest (allows for variable read length during alignment)")
maxK = indexCmd.Flags().IntP("maxK", "m", 4, "maxK in LSH Ensemble (only active with --containment)")
numPart = indexCmd.Flags().IntP("numPart", "n", 4, "num. partitions in LSH Ensemble (only active with --containment)")
indexCmd.MarkFlagRequired("msaDir")
RootCmd.AddCommand(indexCmd)
}
Expand Down Expand Up @@ -239,10 +243,10 @@ func runIndex() {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH ensemble (https://github.com/ekzhu/lshensemble)
log.Printf("running LSH Ensemble...\n")
database = lshIndex.BootstrapLshEnsemble(lshIndex.PARTITIONS, *sigSize, lshIndex.MAXK, numSigs, lshIndex.Windows2Chan(sigStore))
database = lshIndex.BootstrapLshEnsemble(*numPart, *sigSize, *maxK, numSigs, lshIndex.Windows2Chan(sigStore))
// print some stuff
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", lshIndex.PARTITIONS)
log.Printf("\tmax no. hash functions per bucket: %d\n", lshIndex.MAXK)
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", *numPart)
log.Printf("\tmax no. hash functions per bucket: %d\n", *maxK)
}
// attach the key lookup map to the index
database.KeyLookup = lookupMap
Expand Down
4 changes: 3 additions & 1 deletion src/graph/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/biogo/hts/sam"
"github.com/will-rowe/gfa"
"github.com/will-rowe/groot/src/seqio"
"github.com/will-rowe/groot/src/misc"
)

/*
Expand Down Expand Up @@ -250,7 +251,8 @@ func (graph *GrootGraph) WindowGraph(windowSize, kSize, sigSize int) <-chan *Win
for path := range sendPath {
// get a minhash signature
windowSeq := seqio.Sequence{Seq: path}
sig := windowSeq.RunMinHash(kSize, sigSize).Signature()
sig, err := windowSeq.RunMinHash(kSize, sigSize)
misc.ErrorCheck(err)
// check if we have seen this signature for this node and offset
hashedSig := sigHasher(sig)
if _, ok := sigChecker[hashedSig]; !ok {
Expand Down
44 changes: 7 additions & 37 deletions src/lshIndex/lshEnsemble.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package lshIndex

import (
"encoding/gob"
"fmt"
"os"
"io/ioutil"
"sync"

"gopkg.in/vmihailenco/msgpack.v2"
"github.com/orcaman/concurrent-map"
"github.com/will-rowe/groot/src/seqio"
)
Expand Down Expand Up @@ -40,10 +40,10 @@ type LshEnsemble struct {
Lshes []*LshForest
MaxK int
NumHash int
paramCache cmap.ConcurrentMap
Indexed bool
SingleForest bool
KeyLookup KeyLookupMap
paramCache cmap.ConcurrentMap
}

// Add a new domain to the index given its partition ID - the index of the partition.
Expand Down Expand Up @@ -137,50 +137,20 @@ func (LshEnsemble *LshEnsemble) Dump(path string) error {
if LshEnsemble.Indexed == true {
return fmt.Errorf("cannot dump the LSH Index after running the indexing method")
}
file, err := os.Create(path)
b, err := msgpack.Marshal(LshEnsemble)
if err != nil {
return err
}
defer file.Close()
encoder := gob.NewEncoder(file)
if err := encoder.Encode(LshEnsemble); err != nil {
return err
}
for _, lsh := range LshEnsemble.Lshes {
for _, bandContents := range lsh.initHashTables {
err := encoder.Encode(bandContents)
if err != nil {
return err
}
}
}
err = encoder.Encode(LshEnsemble.KeyLookup)
if err != nil {
return err
}
return nil
return ioutil.WriteFile(path, b, 0644)
}

// Load an LSH index from disk
func (LshEnsemble *LshEnsemble) Load(path string) error {
file, err := os.Open(path)
b, err := ioutil.ReadFile(path)
if err != nil {
return err
}
defer file.Close()
decoder := gob.NewDecoder(file)
if err := decoder.Decode(&LshEnsemble); err != nil {
return(err)
}
for _, lsh := range LshEnsemble.Lshes {
for _, bandContents := range lsh.initHashTables {
err := decoder.Decode(&bandContents)
if err != nil {
return err
}
}
}
err = decoder.Decode(&LshEnsemble.KeyLookup)
err = msgpack.Unmarshal(b, LshEnsemble)
if err != nil {
return err
}
Expand Down
56 changes: 26 additions & 30 deletions src/lshIndex/lshForest.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@ func hashKeyFuncGen(hashValueSize int) hashKeyFunc {
// L (number of bands) and
// K (number of hash functions per band).
type LshForest struct {
k int
l int
initHashTables []initHashTable
hashTables []hashTable
K int
L int
InitHashTables []initHashTable
HashTables []hashTable
hashKeyFunc hashKeyFunc
hashValueSize int
KeyLookup KeyLookupMap
}

//
Expand All @@ -68,32 +66,30 @@ func newLshForest(k, l int) *LshForest {
initHashTables[i] = make(initHashTable)
}
return &LshForest{
k: k,
l: l,
hashValueSize: HASH_SIZE,
initHashTables: initHashTables,
hashTables: hashTables,
K: k,
L: l,
InitHashTables: initHashTables,
HashTables: hashTables,
hashKeyFunc: hashKeyFuncGen(HASH_SIZE),
KeyLookup: make(KeyLookupMap),
}
}

// Returns the number of hash functions per band and the number of bands
func (f *LshForest) Settings() (int, int) {
return f.k, f.l
return f.K, f.L
}

// Add a key with MinHash signature into the index.
// The key won't be searchable until Index() is called.
func (f *LshForest) Add(key interface{}, sig []uint64) {
// Generate hash keys
Hs := make([]string, f.l)
for i := 0; i < f.l; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.k : (i+1)*f.k])
Hs := make([]string, f.L)
for i := 0; i < f.L; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.K : (i+1)*f.K])
}
// Insert keys into the bootstrapping tables
for i := range f.initHashTables {
ht := f.initHashTables[i]
for i := range f.InitHashTables {
ht := f.InitHashTables[i]
hk := Hs[i]
if _, exist := ht[hk]; exist {
ht[hk] = append(ht[hk], key)
Expand All @@ -106,39 +102,39 @@ func (f *LshForest) Add(key interface{}, sig []uint64) {

// Index makes all the keys added searchable.
func (f *LshForest) Index() {
for i := range f.hashTables {
ht := make(hashTable, 0, len(f.initHashTables[i]))
for i := range f.HashTables {
ht := make(hashTable, 0, len(f.InitHashTables[i]))
// Build sorted hash table using buckets from init hash tables
for hashKey, keys := range f.initHashTables[i] {
for hashKey, keys := range f.InitHashTables[i] {
ht = append(ht, bucket{
hashKey: hashKey,
keys: keys,
})
}
sort.Sort(ht)
f.hashTables[i] = ht
f.HashTables[i] = ht
// Reset the init hash tables
f.initHashTables[i] = make(initHashTable)
f.InitHashTables[i] = make(initHashTable)
}
}

// Query returns candidate keys given the query signature and parameters.
func (f *LshForest) Query(sig []uint64, K, L int, out chan<- interface{}, done <-chan struct{}) {
if K == -1 {
K = f.k
K = f.K
}
if L == -1 {
L = f.l
L = f.L
}
prefixSize := f.hashValueSize * K
prefixSize := HASH_SIZE * K
// Generate hash keys
Hs := make([]string, L)
for i := 0; i < L; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.k : i*f.k+K])
Hs[i] = f.hashKeyFunc(sig[i*f.K : i*f.K+K])
}
seens := make(map[interface{}]bool)
for i := 0; i < L; i++ {
ht := f.hashTables[i]
ht := f.HashTables[i]
hk := Hs[i]
k := sort.Search(len(ht), func(x int) bool {
return ht[x].hashKey[:prefixSize] >= hk
Expand Down Expand Up @@ -167,8 +163,8 @@ func (f *LshForest) Query(sig []uint64, K, L int, out chan<- interface{}, done <
// and t is the containment threshold.
func (f *LshForest) OptimalKL(x, q int, t float64) (optK, optL int, fp, fn float64) {
minError := math.MaxFloat64
for l := 1; l <= f.l; l++ {
for k := 1; k <= f.k; k++ {
for l := 1; l <= f.L; l++ {
for k := 1; k <= f.K; k++ {
currFp := probFalsePositiveC(x, q, l, k, t, PRECISION)
currFn := probFalseNegativeC(x, q, l, k, t, PRECISION)
currErr := currFn + currFp
Expand Down
2 changes: 1 addition & 1 deletion src/lshIndex/lshIndex.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const HASH_SIZE = 8
// integration precision for optimising number of bands + hash functions in LSH Forest
const PRECISION = 0.01
// number of partitions and maxK to use in LSH Ensemble (TODO: add these as customisable parameters for GROOT)
const PARTITIONS = 10
const PARTITIONS = 6
const MAXK = 4

// error messages
Expand Down
85 changes: 46 additions & 39 deletions src/minhash/minhash.go
Original file line number Diff line number Diff line change
@@ -1,68 +1,75 @@
// the minhash package contains a MinHash implementation that is adapted from go-minhash (https://godoc.org/github.com/dgryski/go-minhash)
// the minhash package contains a minHash implementation that uses the ntHash rolling hash function
package minhash

import (
"errors"
"math"

"github.com/will-rowe/ntHash"
)

// if set true, ntHash will return the canonical k-mer (inspects both strands for each k-mer and returns the lowest hash value)
const CANONICAL = false

/*
The MinHash struct contains all the minimum hash values for a sequence
The minHash struct contains all the minimum hash values for a sequence
*/
type MinHash struct {
type minHash struct {
kSize int
signature []uint64
hashFunc1 Hash64
hashFunc2 Hash64
}
type Hash64 func([]byte) uint64

/*
A method to hash a k-mer and add signature to the MinHash struct
*/
func (self *MinHash) Add(kmer []byte) {
hashValue1, hashValue2 := self.hashFunc1(kmer), self.hashFunc2(kmer)
for i, minVal := range self.signature {
newVal := hashValue1 + uint64(i)*hashValue2
if newVal < minVal {
self.signature[i] = newVal
// Add a sequence to the minHash
func (minHash *minHash) Add(sequence []byte) error {
// initiate the rolling hash
hasher, err := ntHash.New(&sequence, minHash.kSize)
if err != nil {
return err
}
}
// get hashed kmers from read
for hash := range hasher.Hash(CANONICAL) {
// for each hashed k-mer, try adding it to the sketch
for i, minVal := range minHash.signature {
// split the hashed k-mer (uint64) into two uint32
h1, h2 := uint32(hash), uint32(hash>>32)
// get the new hash value for this signature position
newVal := uint64(h1 + uint32(i)*h2)
// evaluate and add to the signature if it is a minimum
if newVal < minVal {
minHash.signature[i] = newVal
}
}
}
return nil
}

/*
A method to dump the MinHash signature
*/
func (self *MinHash) Signature() []uint64 {
return self.signature
// Signature returns the current minHash signature (set of minimums)
func (minHash *minHash) Signature() []uint64 {
return minHash.signature
}

/*
A method to estimate Jaccard Similarity between a MinHash struct and a query MH signature
*/
func (self *MinHash) Similarity(querySig []uint64) (float64, error) {
if len(self.signature) != len(querySig) {
// Similarity is a method to estimate the Jaccard Similarity using to minHash signatures
func (minHash *minHash) Similarity(querySig []uint64) (float64, error) {
if len(minHash.signature) != len(querySig) {
return 0, errors.New("length of minhash signatures do not match\n")
}
intersect := 0
for i := range self.signature {
if self.signature[i] == querySig[i] {
for i := range minHash.signature {
if minHash.signature[i] == querySig[i] {
intersect++
}
}
return float64(intersect) / float64(len(self.signature)), nil
return float64(intersect) / float64(len(minHash.signature)), nil
}

/*
A function to create a new MinHash struct
*/
func NewMinHash(h1, h2 Hash64, size int) *MinHash {
signature := make([]uint64, size)
// NewminHash initiates a minHash struct and populates the signature with max values
func NewMinHash(kSize, sigSize int) *minHash {
signature := make([]uint64, sigSize)
for i := range signature {
signature[i] = math.MaxUint64
}
newMinHash := new(MinHash)
newMinHash.hashFunc1 = h1
newMinHash.hashFunc2 = h2
newMinHash.signature = signature
return newMinHash
return &minHash{
kSize: kSize,
signature: signature,
}
}
Loading