Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bitap_string_match algo #11060

Merged
merged 5 commits into from
Oct 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -135,5 +135,5 @@ omit = [
sort = "Cover"

[tool.codespell]
ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
ignore-words-list = "3rt,ans,bitap,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
skip = "./.*,*.json,ciphers/prehistoric_men.txt,project_euler/problem_022/p022_names.txt,pyproject.toml,strings/dictionary.txt,strings/words.txt"
79 changes: 79 additions & 0 deletions strings/bitap_string_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Bitap exact string matching
https://en.wikipedia.org/wiki/Bitap_algorithm

Searches for a pattern inside text, and returns the index of the first occurrence
of the pattern. Both text and pattern consist of lowercase alphabetical characters only.

Complexity: O(m*n)
n = length of text
m = length of pattern

Python doctests can be run using this command:
python3 -m doctest -v bitap_string_match.py
"""


def bitap_string_match(text: str, pattern: str) -> int:
    """
    Retrieves the index of the first occurrence of pattern in text.

    Implements the exact-match bitap (shift-or) algorithm. A bit vector
    ``state`` tracks, for every prefix length of ``pattern``, whether that
    prefix currently ends at the text position being examined (0 = matches,
    1 = does not match).

    Args:
        text: The string to search in. Any characters are accepted.
        pattern: The string to search for. Any characters are accepted.

    Returns:
        int: The index where pattern first occurs. Return -1 if not found.
        An empty pattern matches at index 0.

    >>> bitap_string_match('abdabababc', 'ababc')
    5
    >>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a')
    0
    >>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso')
    0
    >>> bitap_string_match('abdabababc', '')
    0
    >>> bitap_string_match('abdabababc', 'c')
    9
    >>> bitap_string_match('abdabababc', 'fofosdfo')
    -1
    >>> bitap_string_match('abdab', 'fofosdfo')
    -1
    >>> bitap_string_match('Hello, World!', 'World')
    7
    >>> bitap_string_match('F', 'a')
    -1
    """
    if not pattern:
        return 0
    m = len(pattern)
    if m > len(text):
        return -1

    # Mask used for any character that never occurs in pattern: all bits 1.
    no_match_mask = ~0

    # Per-character masks: bit i is 0 if pattern[i] is that character, and 1
    # otherwise. Keyed by the character itself (rather than an offset from
    # "a") so that characters outside a-z can neither raise IndexError nor
    # alias onto another letter's mask through negative list indexing.
    pattern_mask: dict[str, int] = {}
    for i, char in enumerate(pattern):
        pattern_mask[char] = pattern_mask.get(char, no_match_mask) & ~(1 << i)

    # After the shift below, a 0 in bit m means the whole pattern has matched.
    found_bit = 1 << m

    # Initial state of bit string 1110: only the empty prefix matches so far.
    state = ~1

    for i, char in enumerate(text):
        # If this character does not appear in pattern, its pattern mask is
        # 1111. OR-ing the state with 1111 resets the state to 1111, so the
        # search starts again from the beginning of pattern.
        state |= pattern_mask.get(char, no_match_mask)
        state <<= 1

        # If the mth bit (counting right to left) of the state is 0, then we
        # have found pattern in text, ending at index i.
        if (state & found_bit) == 0:
            return i - m + 1

    return -1


if __name__ == "__main__":
    # Executed as a script: run every doctest embedded in this module.
    from doctest import testmod

    testmod()