-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtopsim-cli
executable file
·77 lines (54 loc) · 1.92 KB
/
topsim-cli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#! /usr/bin/env python3
"""
Usage:
topsim-cli <query> [options] [<file>]
topsim-cli --help
Options:
-I Case-sensitive matching.
-k <k> Maximum number of search results. [default: 1]
--tie Include all the results with the same similarity of the "k"-th result. May return more than "k" results.
-s, --search Search the query within each line rather than against the whole line, by preferring partial matching of the line.
Tversky similarity is used instead of Jaccard similarity.
-e <e> Parameter for Tversky similarity. [default: 0.001]
--mapping=<mapping> Map each string to a set of either "gram"s or "word"s. [default: gram]
--numgrams=<numgrams> Number of characters for each gram when mapping by "gram". [default: 2]
--quiet Do not print additional information to standard error.
"""
import sys
from functools import partial
import os
from docopt import docopt
from extratools.debugtools import stopwatch, peakmem
from topsim import TopSim
# Parse the command line according to the usage text in the module docstring.
argv = docopt(__doc__)

# Diagnostic printer: works like print(), but writes to standard error, or to
# the null device when --quiet was given, so that standard output carries
# nothing but the search results.
print2 = partial(
    print,
    file=sys.stderr if not argv["--quiet"] else open(os.devnull, 'w')
)
def printResourceUsage():
    """Print a one-line time / memory report through ``print2``.

    The line has the form ``"<t> sec | <m> MB"`` where ``<t>`` is the second
    element of ``stopwatch()`` and ``<m>`` is ``peakmem()`` divided by
    1024 twice. Both numbers are shown with two decimal places.

    NOTE(review): presumably ``stopwatch()[1]`` is the time elapsed since the
    previous call and ``peakmem()`` is in bytes -- confirm against
    ``extratools.debugtools``.
    """
    # Use the fixed-point "f" presentation. A bare ".2" precision selects the
    # general format, which keeps only two significant digits and switches to
    # exponent notation for larger values (150.0 would print as "1.5e+02").
    print2("{:.2f} sec | {:.2f} MB".format(
        stopwatch()[1],
        peakmem() / 1024 / 1024
    ))
# Load the candidate strings: one entry per input line, with trailing
# carriage-return / newline characters removed. Input comes from <file> when
# it was given on the command line and from standard input otherwise.
inFile = open(argv["<file>"]) if argv["<file>"] else sys.stdin
try:
    sRawStrs = [line.rstrip('\r\n') for line in inFile]
finally:
    # Release the handle as soon as reading is done, but only if it was
    # opened here; standard input is left open.
    if inFile is not sys.stdin:
        inFile.close()
# --- Indexing phase ---------------------------------------------------------
# flush=True: the progress message ends with a space rather than a newline, so
# a line-buffered standard error would otherwise hold it back until the
# resource report completes the line -- i.e. until the step is already over.
print2("Indexing...", end=" ", flush=True)
ts = TopSim(
    sRawStrs,
    # Value of the -I (case-sensitive matching) switch, passed positionally.
    argv["-I"],
    mapping=argv["--mapping"],
    numGrams=int(argv["--numgrams"])
)
printResourceUsage()

# --- Search phase -----------------------------------------------------------
print2("Searching...", end=" ", flush=True)
rBest = ts.search(
    argv["<query>"],
    int(argv["-k"]),
    argv["--tie"],
    # --search selects Tversky similarity; the default is Jaccard.
    "tversky" if argv["--search"] else "jaccard",
    float(argv["-e"])
)
printResourceUsage()

# Blank diagnostic line after the progress reports.
print2()
# Emit the hits on standard output, one per line: the original input string,
# a tab, then the similarity formatted with the ".4" precision spec.
# Each entry of rBest pairs a similarity with the indices (into sRawStrs) of
# the input lines that scored it.
for score, lineNos in rBest:
    for lineNo in lineNos:
        print("{}\t{:.4}".format(sRawStrs[lineNo], score))