std.mem: Split tokenize into 3 versions by delimiter type: full, any, and scalar

This allows users to choose the version that fits their particular use case, as the previous default (now the 'any' version) (1) did not always provide the desired kind of delimiter matching and (2) performed worse than the scalar version when the delimiter was a single item.
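To make the choice concrete, here is a sketch (not part of the commit) that exercises the three new public functions through std.mem; the inputs and expected tokens are taken from the doc comments in the diff below:

const std = @import("std");

test "choosing between tokenizeAny, tokenizeFull, and tokenizeScalar" {
    // 'any': each item of the second argument is its own single-item delimiter.
    var any = std.mem.tokenizeAny(u8, " abc|def || ghi ", " |");
    try std.testing.expectEqualStrings("abc", any.next().?);
    try std.testing.expectEqualStrings("def", any.next().?);
    try std.testing.expectEqualStrings("ghi", any.next().?);
    try std.testing.expect(any.next() == null);

    // 'full': the second argument is one multi-item delimiter sequence,
    // so a lone '<' or '>' is not a delimiter.
    var full = std.mem.tokenizeFull(u8, "<>abc><def<><>ghi", "<>");
    try std.testing.expectEqualStrings("abc><def", full.next().?);
    try std.testing.expectEqualStrings("ghi", full.next().?);
    try std.testing.expect(full.next() == null);

    // 'scalar': the delimiter is a single item; per the performance note
    // above, prefer this over 'any' when tokenizing on one item.
    var scalar = std.mem.tokenizeScalar(u8, " abc def ghi ", ' ');
    try std.testing.expectEqualStrings("abc", scalar.next().?);
    try std.testing.expectEqualStrings("def", scalar.next().?);
    try std.testing.expectEqualStrings("ghi", scalar.next().?);
    try std.testing.expect(scalar.next() == null);
}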
squeek502 committed May 6, 2023
1 parent 29c48ef commit 19aea1d
lib/std/mem.zig (223 changes: 169 additions & 54 deletions)
@@ -1862,72 +1862,117 @@ test "byteSwapAllFields" {
     }, s);
 }
 
+/// Deprecated: use `tokenizeAny`, `tokenizeFull`, or `tokenizeScalar`
+pub const tokenize = tokenizeAny;
+
 /// Returns an iterator that iterates over the slices of `buffer` that are not
-/// any of the bytes in `delimiter_bytes`.
+/// any of the items in `delimiters`.
 ///
-/// `tokenize(u8, " abc def ghi ", " ")` will return slices
+/// `tokenizeAny(u8, " abc|def || ghi ", " |")` will return slices
 /// for "abc", "def", "ghi", null, in that order.
 ///
 /// If `buffer` is empty, the iterator will return null.
-/// If `delimiter_bytes` does not exist in buffer,
+/// If none of `delimiters` exist in buffer,
 /// the iterator will return `buffer`, null, in that order.
 ///
+/// See also: `tokenizeFull`, `tokenizeScalar`,
+///           `splitFull`,`splitAny`, `splitScalar`,
+///           `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeAny(comptime T: type, buffer: []const T, delimiters: []const T) TokenIterator(T, .any) {
+    return .{
+        .index = 0,
+        .buffer = buffer,
+        .delimiter = delimiters,
+    };
+}
+
+/// Returns an iterator that iterates over the slices of `buffer` that are not
+/// the sequence in `delimiter`.
+///
+/// `tokenizeFull(u8, "<>abc><def<><>ghi", "<>")` will return slices
+/// for "abc><def", "ghi", null, in that order.
+///
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter` does not exist in buffer,
+/// the iterator will return `buffer`, null, in that order.
+/// The delimiter length must not be zero.
 ///
-/// See also: `split` and `splitBackwards`.
-pub fn tokenize(comptime T: type, buffer: []const T, delimiter_bytes: []const T) TokenIterator(T) {
+/// See also: `tokenizeAny`, `tokenizeScalar`,
+///           `splitFull`,`splitAny`, and `splitScalar`
+///           `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeFull(comptime T: type, buffer: []const T, delimiter: []const T) TokenIterator(T, .full) {
+    assert(delimiter.len != 0);
     return .{
         .index = 0,
         .buffer = buffer,
-        .delimiter_bytes = delimiter_bytes,
+        .delimiter = delimiter,
     };
 }
 
-test "tokenize" {
-    var it = tokenize(u8, " abc def ghi ", " ");
+/// Returns an iterator that iterates over the slices of `buffer` that are not
+/// `delimiter`.
+///
+/// `tokenizeScalar(u8, " abc def ghi ", ' ')` will return slices
+/// for "abc", "def", "ghi", null, in that order.
+///
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter` does not exist in buffer,
+/// the iterator will return `buffer`, null, in that order.
+///
+/// See also: `tokenizeAny`, `tokenizeFull`,
+///           `splitFull`,`splitAny`, and `splitScalar`
+///           `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeScalar(comptime T: type, buffer: []const T, delimiter: T) TokenIterator(T, .scalar) {
+    return .{
+        .index = 0,
+        .buffer = buffer,
+        .delimiter = delimiter,
+    };
+}
+
+test "tokenizeScalar" {
+    var it = tokenizeScalar(u8, " abc def ghi ", ' ');
     try testing.expect(eql(u8, it.next().?, "abc"));
     try testing.expect(eql(u8, it.peek().?, "def"));
     try testing.expect(eql(u8, it.next().?, "def"));
     try testing.expect(eql(u8, it.next().?, "ghi"));
     try testing.expect(it.next() == null);
 
-    it = tokenize(u8, "..\\bob", "\\");
+    it = tokenizeScalar(u8, "..\\bob", '\\');
     try testing.expect(eql(u8, it.next().?, ".."));
     try testing.expect(eql(u8, "..", "..\\bob"[0..it.index]));
     try testing.expect(eql(u8, it.next().?, "bob"));
     try testing.expect(it.next() == null);
 
-    it = tokenize(u8, "//a/b", "/");
+    it = tokenizeScalar(u8, "//a/b", '/');
     try testing.expect(eql(u8, it.next().?, "a"));
     try testing.expect(eql(u8, it.next().?, "b"));
     try testing.expect(eql(u8, "//a/b", "//a/b"[0..it.index]));
     try testing.expect(it.next() == null);
 
-    it = tokenize(u8, "|", "|");
+    it = tokenizeScalar(u8, "|", '|');
     try testing.expect(it.next() == null);
     try testing.expect(it.peek() == null);
 
-    it = tokenize(u8, "", "|");
+    it = tokenizeScalar(u8, "", '|');
     try testing.expect(it.next() == null);
     try testing.expect(it.peek() == null);
 
-    it = tokenize(u8, "hello", "");
-    try testing.expect(eql(u8, it.next().?, "hello"));
-    try testing.expect(it.next() == null);
 
-    it = tokenize(u8, "hello", " ");
+    it = tokenizeScalar(u8, "hello", ' ');
     try testing.expect(eql(u8, it.next().?, "hello"));
     try testing.expect(it.next() == null);
 
-    var it16 = tokenize(
+    var it16 = tokenizeScalar(
         u16,
         std.unicode.utf8ToUtf16LeStringLiteral("hello"),
-        std.unicode.utf8ToUtf16LeStringLiteral(" "),
+        ' ',
     );
     try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("hello")));
     try testing.expect(it16.next() == null);
 }
 
-test "tokenize (multibyte)" {
-    var it = tokenize(u8, "a|b,c/d e", " /,|");
+test "tokenizeAny (multibyte)" {
+    var it = tokenizeAny(u8, "a|b,c/d e", " /,|");
     try testing.expect(eql(u8, it.next().?, "a"));
     try testing.expect(eql(u8, it.peek().?, "b"));
     try testing.expect(eql(u8, it.next().?, "b"));
@@ -1937,7 +1982,11 @@ test "tokenize (multibyte)" {
     try testing.expect(it.next() == null);
     try testing.expect(it.peek() == null);
 
-    var it16 = tokenize(
+    it = tokenizeAny(u8, "hello", "");
+    try testing.expect(eql(u8, it.next().?, "hello"));
+    try testing.expect(it.next() == null);
+
+    var it16 = tokenizeAny(
         u16,
         std.unicode.utf8ToUtf16LeStringLiteral("a|b,c/d e"),
         std.unicode.utf8ToUtf16LeStringLiteral(" /,|"),
@@ -1950,18 +1999,68 @@
     try testing.expect(it16.next() == null);
 }
 
+test "tokenizeFull" {
+    var it = tokenizeFull(u8, "a<>b<><>c><>d><", "<>");
+    try testing.expectEqualStrings("a", it.next().?);
+    try testing.expectEqualStrings("b", it.peek().?);
+    try testing.expectEqualStrings("b", it.next().?);
+    try testing.expectEqualStrings("c>", it.next().?);
+    try testing.expectEqualStrings("d><", it.next().?);
+    try testing.expect(it.next() == null);
+    try testing.expect(it.peek() == null);
+
+    var it16 = tokenizeFull(
+        u16,
+        std.unicode.utf8ToUtf16LeStringLiteral("a<>b<><>c><>d><"),
+        std.unicode.utf8ToUtf16LeStringLiteral("<>"),
+    );
+    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
+    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b")));
+    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c>")));
+    try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d><")));
+    try testing.expect(it16.next() == null);
+}
+
 test "tokenize (reset)" {
-    var it = tokenize(u8, " abc def ghi ", " ");
-    try testing.expect(eql(u8, it.next().?, "abc"));
-    try testing.expect(eql(u8, it.next().?, "def"));
-    try testing.expect(eql(u8, it.next().?, "ghi"));
+    {
+        var it = tokenizeAny(u8, " abc def ghi ", " ");
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
 
-    it.reset();
+        it.reset();
 
-    try testing.expect(eql(u8, it.next().?, "abc"));
-    try testing.expect(eql(u8, it.next().?, "def"));
-    try testing.expect(eql(u8, it.next().?, "ghi"));
-    try testing.expect(it.next() == null);
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
+        try testing.expect(it.next() == null);
+    }
+    {
+        var it = tokenizeFull(u8, "<><>abc<>def<><>ghi<>", "<>");
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
+
+        it.reset();
+
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
+        try testing.expect(it.next() == null);
+    }
+    {
+        var it = tokenizeScalar(u8, " abc def ghi ", ' ');
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
+
+        it.reset();
+
+        try testing.expect(eql(u8, it.next().?, "abc"));
+        try testing.expect(eql(u8, it.next().?, "def"));
+        try testing.expect(eql(u8, it.next().?, "ghi"));
+        try testing.expect(it.next() == null);
+    }
 }
 
 /// Deprecated: use `splitFull`, `splitAny`, or `splitScalar`
@@ -1978,8 +2077,8 @@ pub const split = splitFull;
 /// The delimiter length must not be zero.
 ///
 /// See also: `splitAny`, `splitScalar`, `splitBackwardsFull`,
-///           `splitBackwardsAny`,`splitBackwardsScalar`, and
-///           `tokenize`.
+///           `splitBackwardsAny`,`splitBackwardsScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T, .full) {
     assert(delimiter.len != 0);
     return .{
@@ -1999,8 +2098,8 @@ pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T, .full) {
 /// the iterator will return `buffer`, null, in that order.
 ///
 /// See also: `splitFull`, `splitScalar`, `splitBackwardsFull`,
-///           `splitBackwardsAny`,`splitBackwardsScalar`, and
-///           `tokenize`.
+///           `splitBackwardsAny`,`splitBackwardsScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitIterator(T, .any) {
     return .{
         .index = 0,
@@ -2019,8 +2118,8 @@ pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitIterator(T, .any) {
 /// the iterator will return `buffer`, null, in that order.
 ///
 /// See also: `splitFull`, `splitAny`, `splitBackwardsFull`,
-///           `splitBackwardsAny`,`splitBackwardsScalar`, and
-///           `tokenize`.
+///           `splitBackwardsAny`,`splitBackwardsScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitScalar(comptime T: type, buffer: []const T, delimiter: T) SplitIterator(T, .scalar) {
     return .{
         .index = 0,
@@ -2176,8 +2275,8 @@ pub const splitBackwards = splitBackwardsFull;
 /// The delimiter length must not be zero.
 ///
 /// See also: `splitBackwardsAny`, `splitBackwardsScalar`,
-///           `splitFull`, `splitAny`,`splitScalar`, and
-///           `tokenize`.
+///           `splitFull`, `splitAny`,`splitScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitBackwardsIterator(T, .full) {
     assert(delimiter.len != 0);
     return .{
@@ -2197,8 +2296,8 @@ pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitBackwardsIterator(T, .full) {
 /// the iterator will return `buffer`, null, in that order.
 ///
 /// See also: `splitBackwardsFull`, `splitBackwardsScalar`,
-///           `splitFull`, `splitAny`,`splitScalar`, and
-///           `tokenize`.
+///           `splitFull`, `splitAny`,`splitScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitBackwardsIterator(T, .any) {
     return .{
         .index = buffer.len,
@@ -2217,8 +2316,8 @@ pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitBackwardsIterator(T, .any) {
 /// the iterator will return `buffer`, null, in that order.
 ///
 /// See also: `splitBackwardsFull`, `splitBackwardsAny`,
-///           `splitFull`, `splitAny`,`splitScalar`, and
-///           `tokenize`.
+///           `splitFull`, `splitAny`,`splitScalar`,
+///           `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
 pub fn splitBackwardsScalar(comptime T: type, buffer: []const T, delimiter: T) SplitBackwardsIterator(T, .scalar) {
     return .{
         .index = buffer.len,
@@ -2548,10 +2647,13 @@ test "endsWith" {
 
 pub const DelimiterType = enum { full, any, scalar };
 
-pub fn TokenIterator(comptime T: type) type {
+pub fn TokenIterator(comptime T: type, comptime delimiter_type: DelimiterType) type {
     return struct {
         buffer: []const T,
-        delimiter_bytes: []const T,
+        delimiter: switch (delimiter_type) {
+            .full, .any => []const T,
+            .scalar => T,
+        },
         index: usize,
 
         const Self = @This();
@@ -2568,15 +2670,18 @@ pub fn TokenIterator(comptime T: type) type {
         /// complete. Does not advance to the next token.
        pub fn peek(self: *Self) ?[]const T {
            // move to beginning of token
-            while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+            while (self.index < self.buffer.len and self.isDelimiter(self.index)) : (self.index += switch (delimiter_type) {
+                .full => self.delimiter.len,
+                .any, .scalar => 1,
+            }) {}
             const start = self.index;
             if (start == self.buffer.len) {
                 return null;
             }
 
             // move to end of token
             var end = start;
-            while (end < self.buffer.len and !self.isSplitByte(self.buffer[end])) : (end += 1) {}
+            while (end < self.buffer.len and !self.isDelimiter(end)) : (end += 1) {}
 
             return self.buffer[start..end];
         }
}
@@ -2585,7 +2690,10 @@ pub fn TokenIterator(comptime T: type) type {
         pub fn rest(self: Self) []const T {
             // move to beginning of token
             var index: usize = self.index;
-            while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
+            while (index < self.buffer.len and self.isDelimiter(index)) : (index += switch (delimiter_type) {
+                .full => self.delimiter.len,
+                .any, .scalar => 1,
+            }) {}
             return self.buffer[index..];
         }

@@ -2594,13 +2702,20 @@ pub fn TokenIterator(comptime T: type) type {
             self.index = 0;
         }
 
-        fn isSplitByte(self: Self, byte: T) bool {
-            for (self.delimiter_bytes) |delimiter_byte| {
-                if (byte == delimiter_byte) {
-                    return true;
-                }
+        fn isDelimiter(self: Self, index: usize) bool {
+            switch (delimiter_type) {
+                .full => return startsWith(T, self.buffer[index..], self.delimiter),
+                .any => {
+                    const item = self.buffer[index];
+                    for (self.delimiter) |delimiter_item| {
+                        if (item == delimiter_item) {
+                            return true;
+                        }
+                    }
+                    return false;
+                },
+                .scalar => return self.buffer[index] == self.delimiter,
             }
-            return false;
         }
     };
 }
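The new TokenIterator is parameterized over a comptime DelimiterType, so each variant stores exactly the delimiter representation it needs and isDelimiter reduces to that variant's matching strategy at compile time. For reference, a minimal standalone sketch of the same pattern; the Matcher and matchesAt names here are hypothetical and not part of std.mem:

const std = @import("std");

const DelimiterType = enum { full, any, scalar };

// Reduced illustration of the comptime dispatch used by TokenIterator above:
// the enum parameter selects both the stored delimiter type and the match logic.
fn Matcher(comptime T: type, comptime delimiter_type: DelimiterType) type {
    return struct {
        delimiter: switch (delimiter_type) {
            .full, .any => []const T,
            .scalar => T,
        },

        const Self = @This();

        fn matchesAt(self: Self, buffer: []const T, index: usize) bool {
            switch (delimiter_type) {
                // 'full': the whole delimiter sequence must start at `index`.
                .full => return std.mem.startsWith(T, buffer[index..], self.delimiter),
                // 'any': the item at `index` must equal one of the delimiter items.
                .any => return std.mem.indexOfScalar(T, self.delimiter, buffer[index]) != null,
                // 'scalar': a direct single-item comparison, no inner loop.
                .scalar => return buffer[index] == self.delimiter,
            }
        }
    };
}

test "comptime delimiter dispatch" {
    const scalar = Matcher(u8, .scalar){ .delimiter = ',' };
    try std.testing.expect(scalar.matchesAt("a,b", 1));

    const full = Matcher(u8, .full){ .delimiter = "<>" };
    try std.testing.expect(full.matchesAt("a<>b", 1));
    try std.testing.expect(!full.matchesAt("a<>b", 2));

    const any = Matcher(u8, .any){ .delimiter = ";|" };
    try std.testing.expect(any.matchesAt("a|b", 1));
}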
