forked from PLN-FaMAF/PLN-2015
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
109 lines (92 loc) · 3.74 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Corpus preprocessing script
Usage:
preprocess.py [options]
preprocess.py --split-in=<num-splits> --run-part=<num-part>
preprocess.py --increment-ner
preprocess.py -h | --help | --version
Options:
-h --help Show this screen
--multiple-cores=<num-cores> Number of cores (use all to use every processor)
--increment-ner              Re-run NER and Gazetteer for every document. If a document lacked any of the previous steps, it will be preprocessed entirely.
--version Version number
"""
import logging
from docopt import docopt
import iepy
import multiprocessing
iepy.setup(__file__)
from iepy.data.db import DocumentManager
# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
from iepy.preprocess.segmenter import SyntacticSegmenterRunner
from iepy.preprocess.tokenizer import TokenizeSentencerRunner
from iepy.preprocess.ner.combiner import CombinedNERRunner, KindPreferenceCombinedNERRunner
from process.position import PositionNERRunner
from process.date import DateNERRunner
from process.person import PersonNERRunner
from process.dedication import DedicationNERRunner
from process.designation import DesignationNERRunner
from process.designation_type import DesignationTypeNERRunner
from process.clear import ClearEntities
class ParallelDocManager(DocumentManager):
    """DocumentManager that can hand out disjoint shards of a queryset."""

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the shard of *qset* belonging to worker *my_id*.

        Rows are partitioned by ``id % number_of_processors``; worker
        ``my_id`` receives exactly the rows whose remainder equals its id,
        so the shards of all workers are disjoint and cover *qset*.
        """
        # '%%%%' collapses to '%%' after this formatting pass; Django's
        # .extra() applies one more '%'-interpolation when building the
        # query, leaving a single SQL modulo operator in the final WHERE.
        shard_clause = 'id %%%% %s = %s' % (number_of_processors, my_id)
        return qset.extra(where=[shard_clause])
def start_preprocess(docs, increment_ner):
    """Run the full preprocessing pipeline over *docs*.

    Steps, in order: sentence tokenization, syntactic segmentation, then a
    combined NER pass built from the project-specific runners below.

    NOTE(review): ``increment_ner`` is accepted but never read — both the
    tokenizer and the NER runner hard-code ``override=True``, so every run
    redoes those steps regardless of the --increment-ner flag. Confirm
    whether ``override`` was meant to depend on this argument.
    """
    pipeline = PreProcessPipeline([
        # override=True: always re-tokenize, even already-processed docs.
        TokenizeSentencerRunner(override=True),
        SyntacticSegmenterRunner(),
        # ClearEntities('DESIGNATION'),
        # Presumably kinds earlier in `rank` take precedence on overlapping
        # spans — confirm against KindPreferenceCombinedNERRunner.
        KindPreferenceCombinedNERRunner([
            PositionNERRunner(),
            DateNERRunner(),
            PersonNERRunner(),
            DesignationNERRunner(),
            DedicationNERRunner(),
            DesignationTypeNERRunner()
        ], override=True,
            rank=('DATE', 'POSITION', 'DEDICATION', 'DESIGNATION', 'PERSON', 'DESIGNATION_TYPE'))
    ], docs)
    pipeline.process_everything()
if __name__ == '__main__':
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    # Only documents still missing one of these steps are selected.
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)
        # One worker process per core; each receives a disjoint shard of
        # the documents (sharded by id modulo the number of cores).
        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess,
                args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        try:
            split_in = int(split_in)
            # CLI parts are 1-based; shard ids used by mines_of are 0-based.
            # TypeError guards against --run-part being absent (None).
            run_part = int(run_part) - 1
        except (ValueError, TypeError):
            logger.error("Invalid split")
            exit(1)
        # BUG FIX: the original check was `run_part > split_in`, which
        # accepted 0-based shard id == split_in (CLI part split_in + 1) — a
        # shard matching no document, since id % K == K never holds. Valid
        # 0-based shard ids are 0 .. split_in - 1.
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)
        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        # No parallelism requested: process everything in this process.
        start_preprocess(all_docs, increment_ner)