Skip to content

Commit

Permalink
Merge pull request #12 from will-rowe/dev
Browse files Browse the repository at this point in the history
Merge dev into master
  • Loading branch information
Will Rowe authored Oct 16, 2018
2 parents 2f68050 + 4a10f3f commit 23f8d71
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 129 deletions.
4 changes: 2 additions & 2 deletions cmd/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ func alignParamCheck() error {
func runAlign() {
// set up profiling
if *profiling == true {
//defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
defer profile.Start(profile.ProfilePath("./")).Stop()
defer profile.Start(profile.MemProfile, profile.ProfilePath("./")).Stop()
//defer profile.Start(profile.ProfilePath("./")).Stop()
}
// start logging
logFH := misc.StartLogging(*logFile)
Expand Down
14 changes: 9 additions & 5 deletions cmd/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ var (
outDir *string // directory to save index files and log to
defaultOutDir = "./groot-index-" + string(time.Now().Format("20060102150405")) // a default dir to store the index files
containment *bool // use lshEnsemble instead of lshForest -- allows for variable read length
maxK *int // the maxK for LSH Ensemble (only active for --containment)
numPart *int // the number of partitions for LSH Ensemble (only active for --containment)
)

// the index command (used by cobra)
Expand All @@ -69,13 +71,15 @@ var indexCmd = &cobra.Command{

// a function to initialise the command line arguments
func init() {
kSize = indexCmd.Flags().IntP("kmerSize", "k", 7, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 128, "size of MinHash signature")
kSize = indexCmd.Flags().IntP("kmerSize", "k", 31, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 42, "size of MinHash signature")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "max length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded (note: this is used as a containment theshold when --containment set")
msaDir = indexCmd.Flags().StringP("msaDir", "i", "", "directory containing the clustered references (MSA files) - required")
outDir = indexCmd.PersistentFlags().StringP("outDir", "o", defaultOutDir, "directory to save index files to")
containment = indexCmd.Flags().BoolP("containment", "c", false, "use lshEnsemble instead of lshForest (allows for variable read length during alignment)")
maxK = indexCmd.Flags().IntP("maxK", "m", 4, "maxK in LSH Ensemble (only active with --containment)")
numPart = indexCmd.Flags().IntP("numPart", "n", 4, "num. partitions in LSH Ensemble (only active with --containment)")
indexCmd.MarkFlagRequired("msaDir")
RootCmd.AddCommand(indexCmd)
}
Expand Down Expand Up @@ -239,10 +243,10 @@ func runIndex() {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH ensemble (https://github.com/ekzhu/lshensemble)
log.Printf("running LSH Ensemble...\n")
database = lshIndex.BootstrapLshEnsemble(lshIndex.PARTITIONS, *sigSize, lshIndex.MAXK, numSigs, lshIndex.Windows2Chan(sigStore))
database = lshIndex.BootstrapLshEnsemble(*numPart, *sigSize, *maxK, numSigs, lshIndex.Windows2Chan(sigStore))
// print some stuff
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", lshIndex.PARTITIONS)
log.Printf("\tmax no. hash functions per bucket: %d\n", lshIndex.MAXK)
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", *numPart)
log.Printf("\tmax no. hash functions per bucket: %d\n", *maxK)
}
// attach the key lookup map to the index
database.KeyLookup = lookupMap
Expand Down
4 changes: 3 additions & 1 deletion src/graph/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/biogo/hts/sam"
"github.com/will-rowe/gfa"
"github.com/will-rowe/groot/src/seqio"
"github.com/will-rowe/groot/src/misc"
)

/*
Expand Down Expand Up @@ -250,7 +251,8 @@ func (graph *GrootGraph) WindowGraph(windowSize, kSize, sigSize int) <-chan *Win
for path := range sendPath {
// get a minhash signature
windowSeq := seqio.Sequence{Seq: path}
sig := windowSeq.RunMinHash(kSize, sigSize).Signature()
sig, err := windowSeq.RunMinHash(kSize, sigSize)
misc.ErrorCheck(err)
// check if we have seen this signature for this node and offset
hashedSig := sigHasher(sig)
if _, ok := sigChecker[hashedSig]; !ok {
Expand Down
44 changes: 7 additions & 37 deletions src/lshIndex/lshEnsemble.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package lshIndex

import (
"encoding/gob"
"fmt"
"os"
"io/ioutil"
"sync"

"gopkg.in/vmihailenco/msgpack.v2"
"github.com/orcaman/concurrent-map"
"github.com/will-rowe/groot/src/seqio"
)
Expand Down Expand Up @@ -40,10 +40,10 @@ type LshEnsemble struct {
Lshes []*LshForest
MaxK int
NumHash int
paramCache cmap.ConcurrentMap
Indexed bool
SingleForest bool
KeyLookup KeyLookupMap
paramCache cmap.ConcurrentMap
}

// Add a new domain to the index given its partition ID - the index of the partition.
Expand Down Expand Up @@ -137,50 +137,20 @@ func (LshEnsemble *LshEnsemble) Dump(path string) error {
if LshEnsemble.Indexed == true {
return fmt.Errorf("cannot dump the LSH Index after running the indexing method")
}
file, err := os.Create(path)
b, err := msgpack.Marshal(LshEnsemble)
if err != nil {
return err
}
defer file.Close()
encoder := gob.NewEncoder(file)
if err := encoder.Encode(LshEnsemble); err != nil {
return err
}
for _, lsh := range LshEnsemble.Lshes {
for _, bandContents := range lsh.initHashTables {
err := encoder.Encode(bandContents)
if err != nil {
return err
}
}
}
err = encoder.Encode(LshEnsemble.KeyLookup)
if err != nil {
return err
}
return nil
return ioutil.WriteFile(path, b, 0644)
}

// Load an LSH index from disk
func (LshEnsemble *LshEnsemble) Load(path string) error {
file, err := os.Open(path)
b, err := ioutil.ReadFile(path)
if err != nil {
return err
}
defer file.Close()
decoder := gob.NewDecoder(file)
if err := decoder.Decode(&LshEnsemble); err != nil {
return(err)
}
for _, lsh := range LshEnsemble.Lshes {
for _, bandContents := range lsh.initHashTables {
err := decoder.Decode(&bandContents)
if err != nil {
return err
}
}
}
err = decoder.Decode(&LshEnsemble.KeyLookup)
err = msgpack.Unmarshal(b, LshEnsemble)
if err != nil {
return err
}
Expand Down
56 changes: 26 additions & 30 deletions src/lshIndex/lshForest.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@ func hashKeyFuncGen(hashValueSize int) hashKeyFunc {
// L (number of bands) and
// K (number of hash functions per band).
type LshForest struct {
k int
l int
initHashTables []initHashTable
hashTables []hashTable
K int
L int
InitHashTables []initHashTable
HashTables []hashTable
hashKeyFunc hashKeyFunc
hashValueSize int
KeyLookup KeyLookupMap
}

//
Expand All @@ -68,32 +66,30 @@ func newLshForest(k, l int) *LshForest {
initHashTables[i] = make(initHashTable)
}
return &LshForest{
k: k,
l: l,
hashValueSize: HASH_SIZE,
initHashTables: initHashTables,
hashTables: hashTables,
K: k,
L: l,
InitHashTables: initHashTables,
HashTables: hashTables,
hashKeyFunc: hashKeyFuncGen(HASH_SIZE),
KeyLookup: make(KeyLookupMap),
}
}

// Returns the number of hash functions per band and the number of bands
func (f *LshForest) Settings() (int, int) {
return f.k, f.l
return f.K, f.L
}

// Add a key with MinHash signature into the index.
// The key won't be searchable until Index() is called.
func (f *LshForest) Add(key interface{}, sig []uint64) {
// Generate hash keys
Hs := make([]string, f.l)
for i := 0; i < f.l; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.k : (i+1)*f.k])
Hs := make([]string, f.L)
for i := 0; i < f.L; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.K : (i+1)*f.K])
}
// Insert keys into the bootstrapping tables
for i := range f.initHashTables {
ht := f.initHashTables[i]
for i := range f.InitHashTables {
ht := f.InitHashTables[i]
hk := Hs[i]
if _, exist := ht[hk]; exist {
ht[hk] = append(ht[hk], key)
Expand All @@ -106,39 +102,39 @@ func (f *LshForest) Add(key interface{}, sig []uint64) {

// Index makes all the keys added searchable.
func (f *LshForest) Index() {
for i := range f.hashTables {
ht := make(hashTable, 0, len(f.initHashTables[i]))
for i := range f.HashTables {
ht := make(hashTable, 0, len(f.InitHashTables[i]))
// Build sorted hash table using buckets from init hash tables
for hashKey, keys := range f.initHashTables[i] {
for hashKey, keys := range f.InitHashTables[i] {
ht = append(ht, bucket{
hashKey: hashKey,
keys: keys,
})
}
sort.Sort(ht)
f.hashTables[i] = ht
f.HashTables[i] = ht
// Reset the init hash tables
f.initHashTables[i] = make(initHashTable)
f.InitHashTables[i] = make(initHashTable)
}
}

// Query returns candidate keys given the query signature and parameters.
func (f *LshForest) Query(sig []uint64, K, L int, out chan<- interface{}, done <-chan struct{}) {
if K == -1 {
K = f.k
K = f.K
}
if L == -1 {
L = f.l
L = f.L
}
prefixSize := f.hashValueSize * K
prefixSize := HASH_SIZE * K
// Generate hash keys
Hs := make([]string, L)
for i := 0; i < L; i++ {
Hs[i] = f.hashKeyFunc(sig[i*f.k : i*f.k+K])
Hs[i] = f.hashKeyFunc(sig[i*f.K : i*f.K+K])
}
seens := make(map[interface{}]bool)
for i := 0; i < L; i++ {
ht := f.hashTables[i]
ht := f.HashTables[i]
hk := Hs[i]
k := sort.Search(len(ht), func(x int) bool {
return ht[x].hashKey[:prefixSize] >= hk
Expand Down Expand Up @@ -167,8 +163,8 @@ func (f *LshForest) Query(sig []uint64, K, L int, out chan<- interface{}, done <
// and t is the containment threshold.
func (f *LshForest) OptimalKL(x, q int, t float64) (optK, optL int, fp, fn float64) {
minError := math.MaxFloat64
for l := 1; l <= f.l; l++ {
for k := 1; k <= f.k; k++ {
for l := 1; l <= f.L; l++ {
for k := 1; k <= f.K; k++ {
currFp := probFalsePositiveC(x, q, l, k, t, PRECISION)
currFn := probFalseNegativeC(x, q, l, k, t, PRECISION)
currErr := currFn + currFp
Expand Down
2 changes: 1 addition & 1 deletion src/lshIndex/lshIndex.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const HASH_SIZE = 8
// integration precision for optimising number of bands + hash functions in LSH Forest
const PRECISION = 0.01
// number of partitions and maxK to use in LSH Ensemble (TODO: add these as customisable parameters for GROOT)
const PARTITIONS = 10
const PARTITIONS = 6
const MAXK = 4

// error messages
Expand Down
85 changes: 46 additions & 39 deletions src/minhash/minhash.go
Original file line number Diff line number Diff line change
@@ -1,68 +1,75 @@
// the minhash package contains a MinHash implementation that is adapted from go-minhash (https://godoc.org/github.com/dgryski/go-minhash)
// the minhash package contains a minHash implementation that uses the ntHash rolling hash function
package minhash

import (
"errors"
"math"

"github.com/will-rowe/ntHash"
)

// if set true, ntHash will return the canonical k-mer (inspects both strands for each k-mer and returns the lowest hash value)
const CANONICAL = false

/*
The MinHash struct contains all the minimum hash values for a sequence
The minHash struct contains all the minimum hash values for a sequence
*/
type MinHash struct {
type minHash struct {
kSize int
signature []uint64
hashFunc1 Hash64
hashFunc2 Hash64
}
type Hash64 func([]byte) uint64

/*
A method to hash a k-mer and add signature to the MinHash struct
*/
func (self *MinHash) Add(kmer []byte) {
hashValue1, hashValue2 := self.hashFunc1(kmer), self.hashFunc2(kmer)
for i, minVal := range self.signature {
newVal := hashValue1 + uint64(i)*hashValue2
if newVal < minVal {
self.signature[i] = newVal
// Add a sequence to the minHash
func (minHash *minHash) Add(sequence []byte) error {
// initiate the rolling hash
hasher, err := ntHash.New(&sequence, minHash.kSize)
if err != nil {
return err
}
}
// get hashed kmers from read
for hash := range hasher.Hash(CANONICAL) {
// for each hashed k-mer, try adding it to the sketch
for i, minVal := range minHash.signature {
// split the hashed k-mer (uint64) into two uint32
h1, h2 := uint32(hash), uint32(hash>>32)
// get the new hash value for this signature position
newVal := uint64(h1 + uint32(i)*h2)
// evaluate and add to the signature if it is a minimum
if newVal < minVal {
minHash.signature[i] = newVal
}
}
}
return nil
}

/*
A method to dump the MinHash signature
*/
func (self *MinHash) Signature() []uint64 {
return self.signature
// Signature returns the current minHash signature (set of minimums)
func (minHash *minHash) Signature() []uint64 {
return minHash.signature
}

/*
A method to estimate Jaccard Similarity between a MinHash struct and a query MH signature
*/
func (self *MinHash) Similarity(querySig []uint64) (float64, error) {
if len(self.signature) != len(querySig) {
// Similarity is a method to estimate the Jaccard Similarity using to minHash signatures
func (minHash *minHash) Similarity(querySig []uint64) (float64, error) {
if len(minHash.signature) != len(querySig) {
return 0, errors.New("length of minhash signatures do not match\n")
}
intersect := 0
for i := range self.signature {
if self.signature[i] == querySig[i] {
for i := range minHash.signature {
if minHash.signature[i] == querySig[i] {
intersect++
}
}
return float64(intersect) / float64(len(self.signature)), nil
return float64(intersect) / float64(len(minHash.signature)), nil
}

/*
A function to create a new MinHash struct
*/
func NewMinHash(h1, h2 Hash64, size int) *MinHash {
signature := make([]uint64, size)
// NewminHash initiates a minHash struct and populates the signature with max values
func NewMinHash(kSize, sigSize int) *minHash {
signature := make([]uint64, sigSize)
for i := range signature {
signature[i] = math.MaxUint64
}
newMinHash := new(MinHash)
newMinHash.hashFunc1 = h1
newMinHash.hashFunc2 = h2
newMinHash.signature = signature
return newMinHash
return &minHash{
kSize: kSize,
signature: signature,
}
}
Loading

0 comments on commit 23f8d71

Please sign in to comment.