Skip to content

Commit

Permalink
Merge pull request #156 from ashvardanian/main-dev
Browse files Browse the repository at this point in the history
Bug fixes in C/C++ and new Rust functionality
  • Loading branch information
ashvardanian authored Aug 5, 2024
2 parents 91d0a1a + 92d7ca4 commit 279106b
Show file tree
Hide file tree
Showing 11 changed files with 778 additions and 98 deletions.
28 changes: 27 additions & 1 deletion .github/workflows/prerelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,32 @@ permissions:
contents: read

jobs:
versioning:
name: Update Version
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Run TinySemVer
uses: ashvardanian/tinysemver@v2.0.1
with:
verbose: "true"
version-file: "VERSION"
update-version-in: |
Cargo.toml:^version = "(\d+\.\d+\.\d+)"
package.json:"version": "(\d+\.\d+\.\d+)"
CMakeLists.txt:VERSION (\d+\.\d+\.\d+)
update-major-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+)
update-minor-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+)
update-patch-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+)
dry-run: "true"

test_ubuntu_gcc:
name: Ubuntu (GCC 12)
runs-on: ubuntu-22.04
Expand Down Expand Up @@ -230,7 +256,7 @@ jobs:
wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh
sudo ./llvm.sh 16
- name: Build C/C++
run: |
cmake -B build_artifacts \
Expand Down
32 changes: 24 additions & 8 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,38 @@ permissions:

jobs:
versioning:
name: Semantic Release
runs-on: ubuntu-22.04
name: Update Version
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: 20
- name: Set up Cargo
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- run: npm install --ignore-scripts --save-dev --prefix ./package-ci @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx --prefix ./package-ci semantic-release
- name: Run TinySemVer
uses: ashvardanian/tinysemver@v2.0.1
with:
verbose: "true"
version-file: "VERSION"
update-version-in: |
Cargo.toml:^version = "(\d+\.\d+\.\d+)"
package.json:"version": "(\d+\.\d+\.\d+)"
CMakeLists.txt:VERSION (\d+\.\d+\.\d+)
update-major-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+)
update-minor-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+)
update-patch-version-in: |
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+)
dry-run: "false"
push: "true"
create-release: "true"
github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}

rebase:
name: Rebase Dev. Branch
Expand Down
13 changes: 0 additions & 13 deletions .github/workflows/update_version.sh

This file was deleted.

4 changes: 4 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ unzip enwik9.zip && rm enwik9.zip && mv enwik9 enwik9.txt
# 4.7 GB (1.7 GB compressed), 1'004'598 lines of UTF8, 268'435'456 tokens of mean length 8
wget --no-clobber -O xlsum.csv.gz https://github.com/ashvardanian/xl-sum/releases/download/v1.0.0/xlsum.csv.gz
gzip -d xlsum.csv.gz

# Human chromosome dataset, generated by https://github.com/rghilduta/human-chromosome-data-generator/blob/main/generate_chromosome_data.sh
# 1200 rows, each 800 characters long (939K)
wget --no-clobber -O human_protein_1200row_800len.txt https://media.githubusercontent.com/media/rghilduta/human-chromosome-data-generator/main/examples/human_protein_1200row_800len.txt
```

## IDE Integrations
Expand Down
70 changes: 46 additions & 24 deletions cli/wc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

import sys
import sys, os
import argparse
import stringzilla
from stringzilla import File, Str
Expand Down Expand Up @@ -30,6 +30,13 @@ def parse_arguments():
parser.add_argument(
"-w", "--words", action="store_true", help="print the word counts"
)
parser.add_argument(
"--files0-from",
metavar="filename",
help="Read input from the files specified by NUL-terminated names in file F;"
" If F is - then read names from standard input",
)

parser.add_argument("--version", action="version", version=stringzilla.__version__)
return parser.parse_args()

Expand All @@ -45,27 +52,28 @@ def wc(file_path, args):
except RuntimeError: # File gives a RuntimeError if the file does not exist
return f"No such file: {file_path}", False

line_count = mapped_bytes.count("\n")
word_count = mapped_bytes.count(" ") + 1
char_count = mapped_bytes.__len__()
counts = {
"line_count": line_count,
"word_count": word_count,
"char_count": char_count,
}
counts = {}
if args.lines:
counts["line_count"] = mapped_bytes.count("\n")
if args.words:
counts["word_count"] = mapped_bytes.count(" ") + 1
if args.chars:
counts["char_count"] = mapped_bytes.__len__()

if args.max_line_length:
max_line_length = max(len(line) for line in str(mapped_bytes).split("\n"))
max_line_length = max(len(line) for line in mapped_bytes.split("\n"))
counts["max_line_length"] = max_line_length

if args.bytes or args.chars:
byte_count = char_count # assume 1 char = 1 byte
counts["byte_count"] = byte_count
if args.bytes:
if args.chars:
counts["byte_count"] = counts["char_count"]
else:
counts["byte_count"] = mapped_bytes.__len__()

return counts, True


def format_output(counts, args):
def format_output(counts, args, just):
selected_counts = []
if args.lines:
selected_counts.append(counts["line_count"])
Expand All @@ -74,18 +82,18 @@ def format_output(counts, args):
if args.chars:
selected_counts.append(counts["char_count"])
if args.bytes:
selected_counts.append(counts.get("byte_count", counts["char_count"]))
selected_counts.append(counts["byte_count"])
if args.max_line_length:
selected_counts.append(counts.get("max_line_length", 0))

if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
selected_counts = [
counts["line_count"],
counts["word_count"],
counts["char_count"],
]
return " ".join(str(count).rjust(just) for count in selected_counts)


return " ".join(str(count) for count in selected_counts)
def get_files_from(fn):
    """Return the existing files listed, NUL-separated, in the file *fn*.

    Implements the input side of ``--files0-from``: *fn* holds file names
    separated (or terminated) by NUL characters. If *fn* is ``"-"`` the
    names are read from standard input instead, as the option's help text
    promises.

    Entries that are not existing regular files (including the empty string
    produced by a trailing NUL) are dropped from the result.
    """
    if fn == "-":
        # Local import keeps this function self-contained.
        import sys

        s = sys.stdin.read()
    else:
        # The context manager closes the handle even if read() raises.
        with open(fn, "r") as f:
            s = f.read()
    return [x for x in s.split("\0") if os.path.isfile(x)]


def main():
Expand All @@ -97,19 +105,33 @@ def main():
"max_line_length": 0,
"byte_count": 0,
}
if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]):
args.lines = 1
args.words = 1
args.bytes = 1

# wc uses the file size to determine column width when printing
if args.files0_from:
if args.files[0] == "-":
args.files = get_files_from(args.files0_from)
if len(args.files) == 0:
# print(" No filenames found in ", args.files0_from)
exit(0)

just = max(len(str(os.stat(fn).st_size)) for fn in args.files)

for file_path in args.files:
counts, success = wc(file_path, args)
if success:
for key in total_counts.keys():
total_counts[key] += counts.get(key, 0)
output = format_output(counts, args) + f" {file_path}"
output = format_output(counts, args, just) + f" {file_path}"
print(output)
else:
print(counts)

if len(args.files) > 1:
total_output = format_output(total_counts, args) + " total"
total_output = format_output(total_counts, args, just) + " total"
print(total_output)


Expand Down
9 changes: 3 additions & 6 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -2194,7 +2194,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_c

// Verify the remaining part of the needle
sz_size_t remaining = h_length - (found - h);
if (remaining < suffix_length) return SZ_NULL_CHAR;
if (remaining < n_length) return SZ_NULL_CHAR;
if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found;

// Adjust the position.
Expand Down Expand Up @@ -2246,7 +2246,6 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_si
}

SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

// This almost never fires, but it's better to be safe than sorry.
if (h_length < n_length || !n_length) return SZ_NULL_CHAR;

Expand Down Expand Up @@ -2620,11 +2619,9 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( //
// Skip the matching prefixes and suffixes, they won't affect the distance.
for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length;
longer != a_end && shorter != b_end && *longer == *shorter;
++longer, ++shorter, --longer_length, --shorter_length)
;
++longer, ++shorter, --longer_length, --shorter_length);
for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1];
--longer_length, --shorter_length)
;
--longer_length, --shorter_length);

// Bounded computations may exit early.
if (bound) {
Expand Down
Loading

0 comments on commit 279106b

Please sign in to comment.