#!/usr/bin/env python
# coding=utf-8
import os
import json
from pycorenlp import StanfordCoreNLP
from nltk.parse.stanford import StanfordDependencyParser

# Assumes that the CoreNLP server is already running.
# To run the CoreNLP server, open cmd and start it from the CoreNLP directory:
# >D:
# >cd "D:\Ph.D Work\tools\NERs\stanford tools\stanford-corenlp-full-2018-02-27"
# >java -mx7g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000 100000000000000000000000
nlp = StanfordCoreNLP('http://localhost:9000')
# import nltk
# from nltk.tag import StanfordPOSTagger
# from nltk import word_tokenize, sent_tokenize
# from nltk.tree import Tree
# from nltk.parse.stanford import StanfordParser
# from nltk.tag import StanfordNERTagger
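# A small connectivity check, sketched here as an assumption (it is not part of
# the original workflow): pycorenlp raises a requests exception when the server
# is down, so failing fast gives a clearer error than a mid-pipeline crash.
def checkServer(nlp_client):
    """Return True if the CoreNLP server answers a trivial request."""
    try:
        nlp_client.annotate("ping", properties={'annotators': 'tokenize', 'outputFormat': 'json'})
        return True
    except Exception as e:
        print("CoreNLP server not reachable on localhost:9000:", e)
        return False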
def annotateKbpRelations(doc, reOutFile):
    # Assumes that the CoreNLP server is already running (see the startup
    # instructions at the top of this file).
    nlp = StanfordCoreNLP('http://localhost:9000')
    text = doc
    # FOR STANFORD KBP RELATION ANNOTATION
    properties = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,coref,kbp',  # 'coref.md.type': 'RULE',
                  'outputFormat': 'json',
                  'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'}
    output = nlp.annotate(text, properties)  # output is a dictionary
    relations = output['sentences'][0]['kbp']  # note: KBP relations of the first sentence only
    with open(reOutFile, 'w', encoding='utf-8') as outfile:
        json.dump(relations, outfile, sort_keys=True, indent=4, ensure_ascii=False)
    print(relations)
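# For reference, each entry in the 'kbp' list is a relation triple; in the
# CoreNLP JSON output these are dicts with 'subject', 'relation' and 'object'
# strings plus token-span fields (field names may vary slightly across CoreNLP
# versions). A hypothetical call, with an illustrative sentence and file name:
# annotateKbpRelations("Barack Obama was born in Honolulu.", "kbpOut.json")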
def annotateNER(doc, nerOutFile):
    text = doc
    # FOR STANFORD NER ANNOTATION
    # note: 'parse' in the annotators was changed to 'depparse' at one point
    properties = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse',
                  'outputFormat': 'conll',  # changed outputFormat from json to conll
                  'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'}
    # with 'conll' as the outputFormat the server returns a plain string, not a dictionary
    output = nlp.annotate(text, properties)
    with open(nerOutFile, 'w', encoding='utf-8') as outfile:
        outfile.write(output.replace('\r', ''))
    print(output)
def generateREdatasetConllFormat():
    # Input is a file of sentences; output is annotated data in CoNLL format.
    # SUBJECT, OBJECT and RELATION are then tagged manually in the generated data.
    # Assumes that the CoreNLP server is already running (see the startup
    # instructions at the top of this file).
    nlp = StanfordCoreNLP('http://localhost:9000')
    inFile = os.path.normpath("inputOKE/okeEvalInputTexts.txt")  # inputOurFREdatasetNew2/sentences.txt
    outFile = os.path.normpath("inputOKE/okeEval.conll")  # inputOurFREdatasetNew2/dataset.conll
    # outFile = os.path.normpath("inputOurFREdataset/dataset2.conll")
    with open(inFile, 'r', encoding='utf-8') as infile:
        text = infile.read()
    # text = "\"I'm so glad you come before we began,\" said Nan, cheerfully."
    # FOR STANFORD POS, NER, DEP ANNOTATION
    # remove the ner annotator to get dashes (-) in one column
    properties = {'annotators': 'ssplit,tokenize,pos,parse',
                  # 'annotators': 'tokenize,pos,lemma,ner,depparse',
                  # 'output.columns': 'idx,word,role,role,pos,ner,deprel,headidx',  # this line does not work
                  'outputFormat': 'conll',
                  'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'}
    output = nlp.annotate(text, properties)
    data = output.replace('\r', '')
    with open(outFile, 'w', encoding='utf-8') as outfile:
        outfile.write(data)
    print(output)
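# For reference, the CoNLL output has one tab-separated line per token; columns
# that no annotator filled in come out as placeholder characters (the dashes
# mentioned in the comment above). The exact column set (roughly idx, word,
# lemma, pos, ner, headidx, deprel) depends on the CoreNLP version, so treat
# this as an assumption rather than verified output.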
def annotateStanfordCoref(doc):
    # Assumes that the CoreNLP server is already running (see the startup
    # instructions at the top of this file).
    nlp = StanfordCoreNLP('http://localhost:9000')
    text = doc
    # text = "Tom and Jane are good friends. They are cool. He knows a lot of things and so does she. His car is red, but hers is blue. It is older than hers. The big cat ate its dinner."
    # For coreference annotation
    # my_proxies = {'http': 'wireless.cust', 'https': 'wireless.cust'}
    properties = {'annotators': 'dcoref', 'outputFormat': 'json'}  # , 'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'
    output = nlp.annotate(text, properties)
    # x = output['corefs']
    return output
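# For reference, output['corefs'] maps a coreference-chain id to a list of
# mention dicts; each mention carries (among other fields) 'text', 'sentNum',
# 'startIndex' and 'endIndex', where the token indices are 1-based and
# 'endIndex' points one past the last token of the mention.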
def areCorefs(subjStartIndex, subjEndIndex, objStartIndex, objEndIndex, sentence):
    print("inside areCorefs of ClientNER...........")
    corenlp_output = annotateStanfordCoref(sentence)
    bothInCorefs = 0  # initialized up front so the function is safe when there are no coref chains
    for coref in corenlp_output['corefs']:
        subjInCorefs = 0
        objInCorefs = 0
        # One chain of co-referents (a list of mentions). Check whether both the
        # subject and the object appear in this chain.
        mentions = corenlp_output['corefs'][coref]
        if len(mentions) >= 2:
            # JSON files added for both datasets; subj_start is now read from the JSON for both
            for mention in mentions:
                # coref indices are 1-based while TACRED example indices are 0-based,
                # and coref end indices point one token past the TACRED end index
                startIndex = mention['startIndex'] - 1
                endIndex = mention['endIndex'] - 2
                if subjStartIndex == startIndex and subjEndIndex == endIndex:
                    subjInCorefs = 1
                if objStartIndex == startIndex and objEndIndex == endIndex:
                    objInCorefs = 1
                if subjInCorefs == 1 and objInCorefs == 1:
                    bothInCorefs = 1
                    break
        if bothInCorefs == 1:
            break
    return bothInCorefs
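# A worked example of the index arithmetic above (illustrative, not from the
# original file): for "Obama said he would win.", dcoref links "Obama"
# (startIndex=1, endIndex=2) with "he" (startIndex=3, endIndex=4). After the
# -1 / -2 shifts these become the 0-based inclusive spans (0, 0) and (2, 2),
# matching TACRED-style token offsets, so
# areCorefs(0, 0, 2, 2, "Obama said he would win.") should return 1.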
def getDotDepParse(text):
    # make sure nltk can find the Stanford parser jars
    path_to_jar = 'D:/phd work/tools/NERs/stanford tools/stanford-parser-full-2018-02-27/stanford-parser.jar'
    path_to_models_jar = 'D:/phd work/tools/NERs/stanford tools/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
    dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    p = next(dep_parser.raw_parse(text))  # take the first (best) parse
    print(p.to_dot())
    # Copy-paste the output to http://graphs.grevian.org/graph and press the
    # Generate button. You should see the desired graph.
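# A minimal driver, sketched under the assumption that the CoreNLP server is up
# on localhost:9000 and the parser jars sit at the paths above; the sentences
# and output file names below are illustrative only.
if __name__ == '__main__':
    if checkServer(nlp):
        sample = "Barack Obama was born in Honolulu."
        annotateKbpRelations(sample, "sampleKbp.json")   # KBP triples as JSON
        annotateNER(sample, "sampleNer.conll")           # token-level CoNLL output
        print(areCorefs(0, 0, 2, 2, "Obama said he would win."))
    getDotDepParse("The quick brown fox jumps over the lazy dog.")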