Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pydatastructs/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
__all__ = []

from . import trie
from . import (
trie,
string_matching_algorithms
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rename the file from string_matching_algorithms.py to algorithms.py. We will keep all the string-related algorithms in that file.

)

from .trie import (
Trie
)

__all__.extend(trie.__all__)

from .string_matching_algorithms import (
find_string
)

__all__.extend(string_matching_algorithms.__all__)
136 changes: 136 additions & 0 deletions pydatastructs/strings/string_matching_algorithms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from pydatastructs.linear_data_structures.arrays import (
OneDimensionalArray)

__all__ = [
'find_string'
]

def find_string(text: str, pattern: str, algorithm: str) -> bool:
    """
    Finds whether a pattern string occurs within another string
    or body of text.

    Parameters
    ----------
    text: str
        The text to search in. It may contain alphabets, numbers,
        special characters and blank spaces.
    pattern: str
        The pattern to search for. It may contain alphabets, numbers,
        special characters and blank spaces.
    algorithm: str
        The algorithm which should be used for searching
        the pattern. Currently the following algorithms are
        supported,
        'kmp' -> Knuth–Morris–Pratt algorithm.

    Returns
    -------
    bool
        True if pattern occurs in the text, else False.

    Raises
    ------
    NotImplementedError
        If the requested algorithm is not one of the supported ones.

    Examples
    --------
    >>> from pydatastructs.strings.string_matching_algorithms import find_string
    >>> find_string("aefoaefcdaefcdaed", "aefcdaed", algorithm="kmp")
    True
    >>> find_string("aefoaefcdaefcdaed", "aefcdaedz", algorithm="kmp")
    False
    """
    # Dispatch explicitly instead of building a source string for eval():
    # with eval, any quote or backslash in ``text``/``pattern`` corrupted the
    # generated expression, and ``algorithm`` could inject arbitrary code.
    if algorithm == 'kmp':
        return kmp(text, pattern)
    raise NotImplementedError(
        "Currently %s algorithm for finding a pattern in a text "
        "isn't implemented." % (algorithm,))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid using eval. Please use a pattern similar to the one shown below,

import pydatastructs.graphs.algorithms as algorithms
func = "_minimum_spanning_tree_" + algorithm + "_" + graph._impl
if not hasattr(algorithms, func):
raise NotImplementedError(
"Currently %s algoithm for %s implementation of graphs "
"isn't implemented for finding minimum spanning trees."
%(algorithm, graph._impl))
return getattr(algorithms, func)(graph)



def kmp(string: str, substring: str) -> bool:
    """
    Determines whether the substring appears somewhere in the string
    using the Knuth–Morris–Pratt algorithm.

    Parameters
    ----------
    string: str
        The text to search in.
    substring: str
        The pattern that is searched for in the string.

    Returns
    -------
    bool
        Whether substring exists in the string or not.

    Examples
    --------
    >>> from pydatastructs.strings.string_matching_algorithms import kmp
    >>> kmp("aefoaefcdaefcdaed", "aefcdaed")
    True
    >>> kmp("aefoaefcdaefcdaed", "aefcdaedz")
    False

    References
    ----------
    .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
    .. [2] https://towardsdatascience.com/pattern-search-with-the-knuth-morris-pratt-kmp-algorithm-8562407dba5b
    .. [3] https://iopscience.iop.org/article/10.1088/1742-6596/1345/4/042005/pdf
    """
    # NOTE(review): a reviewer suggested renaming this to
    # ``_knuth_morris_pratt`` and treating it as non-public. The name is kept
    # because ``find_string`` selects it via the 'kmp' algorithm name.
    # First pre-compute the prefix/suffix table, then scan the text with it.
    return _doMatch(string, substring, _buildPattern(substring))


def _buildPattern(substring: str) -> OneDimensionalArray:
    """
    Builds the prefix/suffix table of the substring used by the
    Knuth–Morris–Pratt search.

    Parameters
    ----------
    substring: str
        The pattern whose repeated prefixes are to be located.

    Returns
    -------
    patterns: OneDimensionalArray
        An array of indices with one entry per character of substring.
        A value > -1 at a given index means that the suffix ending at
        that index is also the prefix ending at the value index.
        A value of -1 means that no prefix is also a suffix there.
    """
    # TODO(review): rename to snake case (e.g. ``_build_pattern``) together
    # with its caller, as requested in review.
    length = len(substring)
    patterns = OneDimensionalArray(int, length)
    patterns.fill(-1)
    j = 0
    i = 1
    while i < length:
        # Compare by value: ``is`` tests object identity, which only happens
        # to hold for characters CPython caches and is wrong in general.
        if substring[i] == substring[j]:
            # A prefix that is also a suffix
            patterns[i] = j
            i += 1
            j += 1
        elif j > 0:
            # Fall back to the previous (shorter) matching prefix
            j = patterns[j - 1] + 1
        else:
            i += 1
    return patterns


def _doMatch(string: str, substring: str, patterns: OneDimensionalArray) -> bool:
    """
    Checks if the substring exists in the string, using a
    pre-computed prefix/suffix table of the substring.

    Parameters
    ----------
    string: str
        The text to search in.
    substring: str
        The pattern that is searched for in the string.
    patterns: OneDimensionalArray
        An array of integers, each value < len(patterns), holding for
        every index of substring the fallback index to resume from
        after a mismatch (-1 when there is none).

    Returns
    -------
    bool
        Whether substring exists in the string or not.
    """
    # TODO(review): rename to ``_do_match`` (snake case) and consider nesting
    # it inside the KMP entry point, which is its only caller.
    pattern_length = len(substring)
    if pattern_length == 0:
        # The empty pattern occurs in every text; returning early also
        # avoids indexing ``substring[0]`` below.
        return True
    text_length = len(string)
    i = 0
    j = 0
    while i < text_length:
        # Compare by value, not identity: ``is`` is only accidentally true
        # for characters and small integers that CPython caches.
        if string[i] == substring[j]:
            i += 1
            j += 1
            if j == pattern_length:
                return True
        elif j > 0:
            # Resume from the longest prefix that is still matched
            j = patterns[j - 1] + 1
        else:
            i += 1
    return False
26 changes: 26 additions & 0 deletions pydatastructs/strings/tests/test_string_matching_algorithms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pydatastructs.strings.string_matching_algorithms import find_string

def test_kms():
    """Run the shared string-matching checks with the 'kmp' algorithm."""
    # NOTE: the function name says "kms" but the algorithm exercised is 'kmp'.
    _test_common_string_matching(algorithm='kmp')


def _test_common_string_matching(algorithm):
    """
    Shared assertions for any algorithm accepted by ``find_string``.

    Each case is a (text, pattern) pair; the first group must be reported
    as a match and the second group as a non-match.
    """
    matching_cases = (
        ("Knuth-Morris-Pratt", "-Morris-"),
        ("abcabcabcabdabcabdabcabca", "abcabdabcabca"),
        ("aefcdfaecdaefaefcdaefeaefcdcdeae", "aefcdaefeaefcd"),
        ("aaaaaaaa", "aaa"),
        ("fullstringmatch", "fullstringmatch"),
    )
    for text, pattern in matching_cases:
        assert find_string(text, pattern, algorithm) is True

    non_matching_cases = (
        ("Knuth-Morris-Pratt", "-Pratt-"),
        ("abcabcabcabdabcabdabcabca", "qwertyuiopzxcvbnm"),
        ("aefcdfaecdaefaefcdaefeaefcdcdeae", "cdaefaefe"),
        ("fullstringmatch", "fullstrinmatch"),
    )
    for text, pattern in non_matching_cases:
        assert find_string(text, pattern, algorithm) is False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice work on test cases.