Small changes to adapt old tests/util code to the new adix/oats. Also add `tests/ucl.nim`. Everything seems to work fine. A new example program is in the works to exercise an MFile-backed VOat.
c-blake · Dec 22, 2023 · fcdad32 · fcdad32
1 parent 70a2a3d
commit fcdad32
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 23 deletions.
diff --git a/tests/ucl.nim b/tests/ucl.nim
@@ -0,0 +1,56 @@
+when not declared(stdin): import std/[syncio, formatfloat]
+import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats
+
+const bLen {.intdefine.} = 10   # `len` bits => lines <1024B; run-time limits nicer but harder
+const bOff {.intdefine.} = 22   # `off` bits => <4MiB of UNIQUE line data
+type
+  Count {.packed.} = object     # Dense-ish hash table cell type
+    when defined hashCache: hc: uint32 # optional saved hash; 4B|8B per cell
+    len {.bitsize: bLen.}: uint32      # line length (`r.len` at insert time)
+    off {.bitsize: bOff.}: uint32      # result of `a.add` at insert; 0 => unused
+  Counts = object               # Table made of `Count` cells
+    dat: seq[Count]             # the cells; handed to `oatSeq`
+    nUsed: int                  # in-use count; handed to `oatCounted`
+
+var a = " "; oatKStack a, Counts, Count, off,uint32, MSlice, MSlice # a: key data
+#proc key(c: var Counts, i: int, q: MSlice) = c.dat[i]=c.keyR(q) wrong&unneeded
+proc key(c: Counts, i: int): MSlice = c.dat[i].key     # key held by cell i
+proc used(c: Counts, i: int): bool = c.dat[i].off!=0   # off 0 marks a free cell
+
+when defined hashCache:                           # defining these auto-triggers use
+  proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash # via uint32, presumably to match 32-bit `hc`
+  proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 # save hash in cell i
+  proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash # saved hash of cell i
+
+oatCounted c,Counts, c.nUsed; oatSeq Counts, dat  # make counted (nUsed) & resizable (dat)
+when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} # compile-time check
+
+proc incFailed(h: var Counts, r: MSlice): bool = # true only on data-area overflow
+  if r.len + 1 > 1 shl bLen:    # Careful to not overflow the `len` bit field
+    erru "skipping too long(", $r.len, ") line: ",$r,"\n"
+    return                      # Cannot go on LOCALLY; result stays false
+  h.upSert(r, i): discard       # Found key @i: nothing to do
+  do:                           # Novel key->i:
+    h.dat[i].off = a.add(r, (1 shl bOff) - 1): # 2nd arg: largest value `off` can hold
+      erru "unique word data overflow at:",$r,"\n" #XXX rate limit msgs
+      return true               # Cannot go on GLOBALLY
+    h.dat[i].len = r.len.uint32 # Init the cell's length
+
+proc ucl(size=9999, dSize=81920, tm=false) =
+  ## Count unique & total lines on `stdin`. Default build: <1024B long; <4 MiB unique data.
+  let t0 = if tm: epochTime() else: 0.0 # start time; only sampled when timing
+  var h: Counts; h.setCap size  # Pre-size table
+  a.setLen dSize; a.setLen 1    # Pre-size data; len 1 presumably keeps off 0 unused
+  var nTot = 0
+  block IO:
+    for (line, nLine) in stdin.getDelims:
+      let ms = MSlice(mem: line, len: nLine - 1) # -1 presumably drops delimiter; TODO confirm
+      inc nTot                  # Always bump `nTot`
+      if h.incFailed(ms): break IO # stop reading once the data area overflows
+  echo h.len," unique ",nTot," total ",a.len," B"
+  if tm: stderr.write epochTime() - t0, "\n" # elapsed seconds since t0
+
+when isMainModule: dispatch ucl, help={ # CLI entry point; per-parameter help text
+  "size" : "pre-size hash table for size slots",
+  "dSize": "pre-size str data area to this many bytes",
+  "tm"   : "emit wall time of counting to stderr & quit"}
diff --git a/tests/wfr.nim b/tests/wfr.nim
@@ -14,23 +14,23 @@ type
     dat: seq[Count]
     nUsed: int
 
-var s: string; s.keyStack off,uint32, Count,MSlice
+var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
 proc key(c: Counts, i: int): MSlice = c.dat[i].key
+proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v
 proc val(c: Counts, i: int): uint32 = c.dat[i].cnt
 proc used(c: Counts, i: int): bool = c.dat[i].len != 0
 when defined hashCache:         # 2nd def triggers saving lpt behavior
+  proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
+  proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
   proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
-  proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
-else:
-  proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
-  proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
-Counts.useCountedCellSeq dat, nUsed
+oatCounted c,Counts, c.nUsed; oatSeq Counts, dat  # make counted & resizable
+when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}
 
 proc incFailed(h: var Counts, ms: MSlice): bool =
   if ms.len > (1 shl bLen) - 1: # Careful to not overflow
     erru "skipping too long word: ",$ms,"\n"
     return                      # Cannot go on LOCALLY
-  h.getPut(i, ms, hc):          # Found key @i:
+  h.upSert(ms, i):              # Found key @i:
     if h.dat[i].cnt == (1 shl bCnt) - 1:
       erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit
     else: h.dat[i].cnt.inc      #   bump

diff --git a/tests/wu.nim b/tests/wu.nim
@@ -12,23 +12,21 @@ type
     dat: seq[Count]
     nUsed: int
 
-var s: string; s.keyStack off,uint32, Count,MSlice
+var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
 proc key(c: Counts, i: int): MSlice = c.dat[i].key
-proc val(c: Counts, i: int): Void {.used.} = discard #NONE
 proc used(c: Counts, i: int): bool = c.dat[i].len != 0
 when defined hashCache:         # 2nd def triggers saving lpt behavior
+  proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
+  proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
   proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
-  proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
-else:
-  proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
-  proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
-Counts.useCountedCellSeq dat, nUsed
+oatCounted c,Counts, c.nUsed; oatSeq Counts, dat  # make counted & resizable
+when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"}
 
 proc incFailed(h: var Counts, ms: MSlice): bool =
   if ms.len > (1 shl bLen) - 1: # Careful to not overflow
     erru "skipping too long word: ",$ms,"\n"
     return                      # Cannot go on LOCALLY
-  h.getPut(i, ms, hc): discard  # Found key @i:
+  h.upSert(ms, i): discard      # Found key @i:
   do:                           # Novel key->i:
     h.dat[i].off = s.add(ms, (1 shl bOff) - 1):
       erru "unique word data overflow at:",$ms,"\n" #XXX rate limit

diff --git a/util/lfreq.nim b/util/lfreq.nim
@@ -14,23 +14,23 @@ type
     dat: seq[Count]
     nUsed: int
 
-var s: string; s.keyStack off,uint32, Count,MSlice
+var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
 proc key(c: Counts, i: int): MSlice = c.dat[i].key
+proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v
 proc val(c: Counts, i: int): uint32 = c.dat[i].cnt
 proc used(c: Counts, i: int): bool = c.dat[i].cnt != 0
 when defined hashCache:         # 2nd def triggers saving lpt behavior
+  proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
+  proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
   proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
-  proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
-else:
-  proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
-  proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
-Counts.useCountedCellSeq dat, nUsed
+oatCounted c,Counts, c.nUsed; oatSeq Counts, dat  # make counted & resizable
+when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}
 
 proc incFailed(h: var Counts, ms: MSlice): bool =
   if ms.len > (1 shl bLen) - 1: # Careful to not overflow
     erru "skipping too long line: ", ($ms)[0..<128], "\n"
-    return                      # Cannot go on LOCALLY
-  h.getPut(i, ms, hc):          # Found key @i:
+    return false                # Cannot go on LOCALLY
+  h.upSert(ms, i):              # Found key @i:
     if h.dat[i].cnt == (1 shl bCnt) - 1:
       erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit
     else: h.dat[i].cnt.inc      #   bump