Merge pull request #156 from ashvardanian/main-dev

Bug fixes in C/C++ and new Rust functionality
ashvardanian · Aug 5, 2024 · 279106b · 279106b
2 parents 91d0a1a + 92d7ca4
commit 279106b
Show file tree

Hide file tree

Showing 11 changed files with 778 additions and 98 deletions.
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -18,6 +18,32 @@ permissions:
   contents: read
 
 jobs:
+  versioning:
+    name: Update Version
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+      - name: Run TinySemVer
+        uses: ashvardanian/tinysemver@v2.0.1
+        with:
+          verbose: "true"
+          version-file: "VERSION"
+          update-version-in: |
+            Cargo.toml:^version = "(\d+\.\d+\.\d+)"
+            package.json:"version": "(\d+\.\d+\.\d+)"
+            CMakeLists.txt:VERSION (\d+\.\d+\.\d+)
+          update-major-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+)
+          update-minor-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+)
+          update-patch-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+)
+          dry-run: "true"
+
   test_ubuntu_gcc:
     name: Ubuntu (GCC 12)
     runs-on: ubuntu-22.04
@@ -230,7 +256,7 @@ jobs:
           wget https://apt.llvm.org/llvm.sh
           chmod +x llvm.sh
           sudo ./llvm.sh 16
-        
+
       - name: Build C/C++
         run: |
           cmake -B build_artifacts \

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -18,22 +18,38 @@ permissions:
 
 jobs:
   versioning:
-    name: Semantic Release
-    runs-on: ubuntu-22.04
+    name: Update Version
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
         with:
+          fetch-depth: 0
           persist-credentials: false
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
       - name: Set up Cargo
         uses: actions-rs/toolchain@v1
         with:
           toolchain: stable
           override: true
-      - run: npm install --ignore-scripts --save-dev --prefix ./package-ci @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx --prefix ./package-ci semantic-release
+      - name: Run TinySemVer
+        uses: ashvardanian/tinysemver@v2.0.1
+        with:
+          verbose: "true"
+          version-file: "VERSION"
+          update-version-in: |
+            Cargo.toml:^version = "(\d+\.\d+\.\d+)"
+            package.json:"version": "(\d+\.\d+\.\d+)"
+            CMakeLists.txt:VERSION (\d+\.\d+\.\d+)
+          update-major-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+)
+          update-minor-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+)
+          update-patch-version-in: |
+            include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+)
+          dry-run: "false"
+          push: "true"
+          create-release: "true"
+          github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
 
   rebase:
     name: Rebase Dev. Branch

diff --git a/.github/workflows/update_version.sh b/.github/workflows/update_version.sh
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -56,6 +56,10 @@ unzip enwik9.zip && rm enwik9.zip && mv enwik9 enwik9.txt
 # 4.7 GB (1.7 GB compressed), 1'004'598 lines of UTF8, 268'435'456 tokens of mean length 8
 wget --no-clobber -O xlsum.csv.gz https://github.com/ashvardanian/xl-sum/releases/download/v1.0.0/xlsum.csv.gz
 gzip -d xlsum.csv.gz
+
+# Human chromosome dataset, generated with https://github.com/rghilduta/human-chromosome-data-generator/blob/main/generate_chromosome_data.sh
+# 1'200 rows, each 800 characters long (939 KB)
+wget --no-clobber -O human_protein_1200row_800len.txt https://media.githubusercontent.com/media/rghilduta/human-chromosome-data-generator/main/examples/human_protein_1200row_800len.txt
 ```
 
 ## IDE Integrations

diff --git a/cli/wc.py b/cli/wc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-import sys
+import sys, os
 import argparse
 import stringzilla
 from stringzilla import File, Str
@@ -30,6 +30,13 @@ def parse_arguments():
     parser.add_argument(
         "-w", "--words", action="store_true", help="print the word counts"
     )
+    parser.add_argument(
+        "--files0-from",
+        metavar="filename",
+        help="Read input from the files specified by NUL-terminated names in file F;"
+        " If F is - then read names from standard input",
+    )
+
     parser.add_argument("--version", action="version", version=stringzilla.__version__)
     return parser.parse_args()
 
@@ -45,27 +52,28 @@ def wc(file_path, args):
         except RuntimeError:  # File gives a RuntimeError if the file does not exist
             return f"No such file: {file_path}", False
 
-    line_count = mapped_bytes.count("\n")
-    word_count = mapped_bytes.count(" ") + 1
-    char_count = mapped_bytes.__len__()
-    counts = {
-        "line_count": line_count,
-        "word_count": word_count,
-        "char_count": char_count,
-    }
+    counts = {}
+    if args.lines:
+        counts["line_count"] = mapped_bytes.count("\n")
+    if args.words:
+        counts["word_count"] = mapped_bytes.count(" ") + 1
+    if args.chars:
+        counts["char_count"] = mapped_bytes.__len__()
 
     if args.max_line_length:
-        max_line_length = max(len(line) for line in str(mapped_bytes).split("\n"))
+        max_line_length = max(len(line) for line in mapped_bytes.split("\n"))
         counts["max_line_length"] = max_line_length
 
-    if args.bytes or args.chars:
-        byte_count = char_count  # assume 1 char = 1 byte
-        counts["byte_count"] = byte_count
+    if args.bytes:
+        if args.chars:
+            counts["byte_count"] = counts["char_count"]
+        else:
+            counts["byte_count"] = mapped_bytes.__len__()
 
     return counts, True
 
 
-def format_output(counts, args):
+def format_output(counts, args, just):
     selected_counts = []
     if args.lines:
         selected_counts.append(counts["line_count"])
@@ -74,18 +82,18 @@ def format_output(counts, args):
     if args.chars:
         selected_counts.append(counts["char_count"])
     if args.bytes:
-        selected_counts.append(counts.get("byte_count", counts["char_count"]))
+        selected_counts.append(counts["byte_count"])
     if args.max_line_length:
         selected_counts.append(counts.get("max_line_length", 0))
 
-    if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
-        selected_counts = [
-            counts["line_count"],
-            counts["word_count"],
-            counts["char_count"],
-        ]
+    return " ".join(str(count).rjust(just) for count in selected_counts)
+
 
-    return " ".join(str(count) for count in selected_counts)
+def get_files_from(fn):
+    f = open(fn, "r")
+    s = f.read()
+    f.close()
+    return [x for x in s.split("\0") if os.path.isfile(x)]
 
 
 def main():
@@ -97,19 +105,33 @@ def main():
         "max_line_length": 0,
         "byte_count": 0,
     }
+    if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
+        args.lines = 1
+        args.words = 1
+        args.bytes = 1
+
+    # Like wc, derive the column width from the file sizes: `just` (computed below) is the digit count of the largest size.
+    if args.files0_from:
+        if args.files[0] == "-":
+            args.files = get_files_from(args.files0_from)
+            if len(args.files) == 0:
+                # print("  No filenames found in ", args.files0_from)
+                exit(0)
+
+    just = max(len(str(os.stat(fn).st_size)) for fn in args.files)
 
     for file_path in args.files:
         counts, success = wc(file_path, args)
         if success:
             for key in total_counts.keys():
                 total_counts[key] += counts.get(key, 0)
-            output = format_output(counts, args) + f" {file_path}"
+            output = format_output(counts, args, just) + f" {file_path}"
             print(output)
         else:
             print(counts)
 
     if len(args.files) > 1:
-        total_output = format_output(total_counts, args) + " total"
+        total_output = format_output(total_counts, args, just) + " total"
         print(total_output)
 
 

diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
@@ -2194,7 +2194,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_c
 
         // Verify the remaining part of the needle
         sz_size_t remaining = h_length - (found - h);
-        if (remaining < suffix_length) return SZ_NULL_CHAR;
+        if (remaining < n_length) return SZ_NULL_CHAR;
         if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found;
 
         // Adjust the position.
@@ -2246,7 +2246,6 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_si
 }
 
 SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
-
     // This almost never fires, but it's better to be safe than sorry.
     if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
 
@@ -2620,11 +2619,9 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial(     //
     // Skip the matching prefixes and suffixes, they won't affect the distance.
     for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length;
          longer != a_end && shorter != b_end && *longer == *shorter;
-         ++longer, ++shorter, --longer_length, --shorter_length)
-        ;
+         ++longer, ++shorter, --longer_length, --shorter_length);
     for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1];
-         --longer_length, --shorter_length)
-        ;
+         --longer_length, --shorter_length);
 
     // Bounded computations may exit early.
     if (bound) {