// parser.q
\d .nlp
.p.import[`sys;:;`:argv;()]; / spacy expects python to be the main process
// Python function for running spacy over a single document
p)def get_doc_info(parser,tokenAttrs,opts,text):
  doc=parser(text)
  res=[[getattr(w,a)for w in doc]for a in tokenAttrs]
  if('sentChars' in opts): # indices of first+last char per sentence
    res.append([(s.start_char,s.end_char)for s in doc.sents])
  if('sentIndices' in opts): # index of first token per sentence
    res.append([s.start for s in doc.sents])
  res.append([w.is_punct or w.is_bracket or w.is_space for w in doc])
  return res
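// get_doc_info returns, for one text: one list per requested token attribute,
// then (if requested) sentence character spans and sentence start-token indices,
// and finally a per-token flag marking punctuation, brackets and whitespace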
parser.i.parseText:.p.get[`get_doc_info;<];
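// Decode a string as UTF-8 via Python with errors ignored, dropping invalid byte
// sequences (e.g. a stray "\351" byte in "caf\351" would be removed)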
parser.i.cleanUTF8:.p.import[`builtins;`:bytes.decode;<][;`errors pykw`ignore]$["x"]@;
// Dependent options
parser.i.depOpts:(!). flip(
(`keywords; `tokens`isStop);
(`sentChars; `sbd`sentIndices);
(`sentIndices;`sbd);
(`uniPOS; `tagger);
(`pennPOS; `tagger))
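// e.g. requesting `keywords`sentChars also pulls in `tokens`isStop`sbd`sentIndices,
// since raze parser.i.depOpts`keywords`sentChars gives those four symbols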
// Map from q-style attribute names to spacy
parser.i.q2spacy:(!). flip(
(`likeEmail; `like_email);
(`likeNumber; `like_num);
(`likeURL; `like_url);
(`isStop; `is_stop);
(`tokens; `lower_);
(`lemmas; `lemma_);
(`uniPOS; `pos_);
(`pennPOS; `tag_);
(`starts; `idx))
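// e.g. requesting `tokens`lemmas`starts reads spacy's lower_, lemma_ and idx token attributes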
// Create new parser
// Valid opts : text keywords likeEmail likeNumber likeURL isStop tokens lemmas uniPOS pennPOS starts sentChars sentIndices
parser.i.newParser:{[lang;opts]
  opts:distinct opts,raze parser.i.depOpts colnames:opts;
  disabled:`ner`tagger`parser except opts;
  model:.p.import[`spacy;`:load][lang;`disable pykw disabled];
  if[(`sbd in opts)&`parser in disabled;model[`:add_pipe]model[`:create_pipe;`sbd]];
  tokenAttrs:parser.i.q2spacy key[parser.i.q2spacy]inter opts;
  pyParser:parser.i.parseText[model;tokenAttrs;opts;];
  parser.i.runParser[pyParser;colnames;opts]}
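// Illustrative usage (a sketch, assuming an English spacy model is installed and loads as `en):
//   ep:parser.i.newParser[`en;`text`tokens`isStop`sentChars`sentIndices]
//   ep enlist"The quick brown fox jumps over the lazy dog. It barked."
// should give one row per input document with the requested option names as columns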
// Operations that must be done in q, or give better performance in q
parser.i.runParser:{[pyParser;colnames;opts;docs]
  parsed:parser.i.unpack[pyParser;opts]each t:parser.i.cleanUTF8 each docs;
  if[`keywords in opts;parsed[`keywords]:TFIDF parsed];
  colnames#@[parsed;`text;:;t]}
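// Note: keyword scores are filled in here in q (TFIDF across all parsed documents),
// which is why `keywords pulls in `tokens`isStop via parser.i.depOpts above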
// Unpack the lists returned by the Python parser into a q dictionary of document attributes
parser.i.unpack:{[pyParser;opts;text]
  names:inter[key[parser.i.q2spacy],`sentChars`sentIndices;opts],`isPunct;
  doc:names!pyParser text;
  doc:@[doc;names inter`tokens`lemmas`uniPOS`pennPOS;`$];
  if[`entities in names;doc:.[doc;(`entities;::;0 1);`$]];
  if[`isStop in names;
    if[`tokens in names;doc[`isStop]|:doc[`tokens ]in i.stopwords ];
    if[`uniPOS in names;doc[`isStop]|:doc[`uniPOS ]in i.stopUniPOS ];
    if[`pennPOS in names;doc[`isStop]|:doc[`pennPOS]in i.stopPennPOS];
  ];
  doc:parser.i.removePunct parser.i.adjustIndices[text]doc;
  if[`sentIndices in opts;
    doc[`sentIndices]@:unique:value last each group doc`sentIndices;
    if[`sentChars in opts;doc[`sentChars]@:unique]
  ];
  @[doc;`;:;::]}  / add a null key mapped to generic null, keeping the values a general list
// Python indexes into strings by char instead of byte, so must be modified to index a q string
parser.i.adjustIndices:{[text;doc]
  adj:cont-til count cont:where text within"\200\277";
  if[`starts in cols doc;doc[`starts ]+:adj binr 1+doc`starts ];
  if[`sentChars in cols doc;doc[`sentChars]+:adj binr 1+doc`sentChars];
  doc}
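// e.g. in "caf\303\251 bar" ("café bar" in UTF-8) spacy reports the token "bar" at char index 5,
// which is byte index 6 in the q string; the adjustment adds 1 for the two-byte "é"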
// Removes punctuation and space tokens and updates indices
parser.i.removePunct:{[doc]
  doc:@[doc;key[parser.i.q2spacy]inter k:cols doc;@[;where not doc`isPunct]];
  idx:sums 0,not doc`isPunct;
  if[`sentIndices in k;doc:@[doc;`sentIndices;idx]];
  doc _`isPunct}
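// e.g. with isPunct 0100b tokens 0 2 3 are kept, and a sentence that began at old token
// index 2 is remapped (via idx) to begin at new token index 1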