diff --git a/tests/ucl.nim b/tests/ucl.nim new file mode 100644 index 0000000..5e4a55e --- /dev/null +++ b/tests/ucl.nim @@ -0,0 +1,56 @@ +when not declared(stdin): import std/[syncio, formatfloat] +import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats + +const bLen {.intdefine.} = 10 # <1024 long; RT limits nicer but harder +const bOff {.intdefine.} = 22 # <4MiB UNIQUE line data +type + Count {.packed.} = object # Dense-ish hash Count type + when defined hashCache: hc: uint32 # 4B|8B per cell + len {.bitsize: bLen.}: uint32 + off {.bitsize: bOff.}: uint32 + Counts = object + dat: seq[Count] + nUsed: int + +var a = " "; oatKStack a, Counts, Count, off,uint32, MSlice, MSlice +#proc key(c: var Counts, i: int, q: MSlice) = c.dat[i]=c.keyR(q) wrong&unneeded +proc key(c: Counts, i: int): MSlice = c.dat[i].key +proc used(c: Counts, i: int): bool = c.dat[i].off!=0 + +when defined hashCache: # def auto-triggers use + proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash + proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 + proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash + +oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable +when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} + +proc incFailed(h: var Counts, r: MSlice): bool = + if r.len + 1 > 1 shl bLen: # Careful to not overflow + erru "skipping too long(", $r.len, ") line: ",$r,"\n" + return # Cannot go on LOCALLY + h.upSert(r, i): discard # Found key @i: nothing to do + do: # Novel key->i: + h.dat[i].off = a.add(r, (1 shl bOff) - 1): + erru "unique word data overflow at:",$r,"\n" #XXX rate limit msgs + return true # Cannot go on GLOBALLY + h.dat[i].len = r.len.uint32 # Init + +proc ucl(size=9999, dSize=81920, tm=false) = + ## Count unique & total lines on `stdin`. Default limits: lines <1024B long; <4 MiB unique data. 
+ let t0 = if tm: epochTime() else: 0.0 + var h: Counts; h.setCap size # Pre-size table & data + a.setLen dSize; a.setLen 1 + var nTot = 0 + block IO: + for (line, nLine) in stdin.getDelims: + let ms = MSlice(mem: line, len: nLine - 1) + inc nTot # Always bump `nTot`, even for lines skipped as too long + if h.incFailed(ms): break IO + echo h.len," unique ",nTot," total ",a.len," B" + if tm: stderr.write epochTime() - t0, "\n" + +when isMainModule: dispatch ucl, help={ + "size" : "pre-size hash table for size slots", + "dSize": "pre-size str data area to this many bytes", + "tm" : "emit wall time of counting to stderr & quit"} diff --git a/tests/wfr.nim b/tests/wfr.nim index e9faea0..92c8f21 100644 --- a/tests/wfr.nim +++ b/tests/wfr.nim @@ -14,23 +14,23 @@ type dat: seq[Count] nUsed: int -var s: string; s.keyStack off,uint32, Count,MSlice +var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice proc key(c: Counts, i: int): MSlice = c.dat[i].key +proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v proc val(c: Counts, i: int): uint32 = c.dat[i].cnt proc used(c: Counts, i: int): bool = c.dat[i].len != 0 when defined hashCache: # 2nd def triggers saving lpt behavior + proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash + proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash - proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc -else: - proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash - proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard -Counts.useCountedCellSeq dat, nUsed +oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable +when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} proc incFailed(h: var Counts, ms: MSlice): bool = if ms.len > (1 shl bLen) - 1: # Careful to not overflow erru "skipping too long word: ",$ms,"\n" return # Cannot go on LOCALLY - h.getPut(i, ms, hc): # Found key @i: + 
h.upSert(ms, i): # Found key @i: if h.dat[i].cnt == (1 shl bCnt) - 1: erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit else: h.dat[i].cnt.inc # bump diff --git a/tests/wu.nim b/tests/wu.nim index 2822424..b7b1a28 100644 --- a/tests/wu.nim +++ b/tests/wu.nim @@ -12,23 +12,21 @@ type dat: seq[Count] nUsed: int -var s: string; s.keyStack off,uint32, Count,MSlice +var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice proc key(c: Counts, i: int): MSlice = c.dat[i].key -proc val(c: Counts, i: int): Void {.used.} = discard #NONE proc used(c: Counts, i: int): bool = c.dat[i].len != 0 when defined hashCache: # 2nd def triggers saving lpt behavior + proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash + proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash - proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc -else: - proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash - proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard -Counts.useCountedCellSeq dat, nUsed +oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable +when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} proc incFailed(h: var Counts, ms: MSlice): bool = if ms.len > (1 shl bLen) - 1: # Careful to not overflow erru "skipping too long word: ",$ms,"\n" return # Cannot go on LOCALLY - h.getPut(i, ms, hc): discard # Found key @i: + h.upSert(ms, i): discard # Found key @i: do: # Novel key->i: h.dat[i].off = s.add(ms, (1 shl bOff) - 1): erru "unique word data overflow at:",$ms,"\n" #XXX rate limit diff --git a/util/lfreq.nim b/util/lfreq.nim index 8f163f0..160d7fd 100644 --- a/util/lfreq.nim +++ b/util/lfreq.nim @@ -14,23 +14,23 @@ type dat: seq[Count] nUsed: int -var s: string; s.keyStack off,uint32, Count,MSlice +var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice proc key(c: Counts, i: int): MSlice = c.dat[i].key 
+proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v proc val(c: Counts, i: int): uint32 = c.dat[i].cnt proc used(c: Counts, i: int): bool = c.dat[i].cnt != 0 when defined hashCache: # 2nd def triggers saving lpt behavior + proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash + proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash - proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc -else: - proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash - proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard -Counts.useCountedCellSeq dat, nUsed +oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable +when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} proc incFailed(h: var Counts, ms: MSlice): bool = if ms.len > (1 shl bLen) - 1: # Careful to not overflow erru "skipping too long line: ", ($ms)[0..<128], "\n" - return # Cannot go on LOCALLY - h.getPut(i, ms, hc): # Found key @i: + return false # Cannot go on LOCALLY + h.upSert(ms, i): # Found key @i: if h.dat[i].cnt == (1 shl bCnt) - 1: erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit else: h.dat[i].cnt.inc # bump