-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
29 lines (22 loc) · 975 Bytes
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# https://nlpforhackers.io/splitting-text-into-sentences/
# https://en.wikipedia.org/wiki/Search_engine_indexing
import sys
import argparse
import string
from myindex.indexer import Indexer
from myindex.content import get_content
from myindex.tokenize import Tokenizer
def main(argv=None):
    """Index the content of a file and save the resulting index.

    Parses the command line, builds a ``Tokenizer`` for the requested
    language, obtains the content for ``filename`` via ``get_content``,
    feeds it to an ``Indexer`` and saves the index.

    The index is written to the path given with ``-o/--output``; when that
    option is omitted it is written next to the input as ``<filename>.idx``.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse reads ``sys.argv[1:]``.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0 on success.
    """
    parser = argparse.ArgumentParser(description='Load file and tokenize tags')
    parser.add_argument('filename', type=str, help='filename to process')
    parser.add_argument('-f', '--filter', type=str, default=string.punctuation)
    parser.add_argument('-l', '--language', type=str, default='english')
    parser.add_argument('-o', '--output', type=str,
                        help='output filename (default: <filename>.idx)')
    args = parser.parse_args(argv)

    tokenizer = Tokenizer(language=args.language)
    content = get_content(args.filename)

    indexer = Indexer(args.filename)
    indexer.index(content, tokenizer, skip=args.filter)

    # Honour --output when supplied; previously the option was parsed but
    # ignored and the index always went to '<filename>.idx'.
    output_path = args.output or (args.filename + '.idx')
    indexer.save(output_path)
if __name__ == '__main__':
    # Run the CLI only when executed as a script, and hand main()'s return
    # value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)