forked from rahulguptakota/paper-To-Reviewer-Matching-System
-
Notifications
You must be signed in to change notification settings - Fork 0
/
features_k9.py
57 lines (52 loc) · 1.75 KB
/
features_k9.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sys
from os import listdir
from os.path import isfile, join
from nltk.tokenize import sent_tokenize
import nltk
import xml.etree.ElementTree as ET
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice
# Map each second-column id of index.csv to the distinct first-column ids
# that appear alongside it.
# NOTE(review): the first row is not skipped here, although the feature loop
# further down treats it as a header -- confirm whether it belongs in graph.
graph = {}
with open('index.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if len(row) < 2:
            # Blank or truncated line: there is no pair to record.
            continue
        linked = graph.setdefault(row[1], [])
        # Append only unseen ids.  A repeated pair must leave the list
        # intact rather than reset it to a single entry.
        if row[0] not in linked:
            linked.append(row[0])
# for a,b in graph.items():
# print(a,b)
#f9 starts from here
# For every data row of index.csv, write "<id1> <id2> <score>" to
# features_k9.txt, where the score is the TF-IDF similarity between the
# abstracts (or, failing that, the body text) of the two papers.


def _load_root(paper_id):
    """Parse data/<paper_id>/parsedData.xml and return its root element."""
    return ET.parse(join("data", paper_id, "parsedData.xml")).getroot()


def _pick_sections(root1, root2):
    """Return the pair of XML elements whose text should be compared.

    The <abstract> under root[1][0] is preferred.  If either paper has no
    abstract there, BOTH papers fall back to the <bodyText> under
    root[0][0] (presumably so the two sides stay comparable -- confirm).
    Either returned element may be None.
    """
    section1 = root1[1][0].find('abstract')
    section2 = root2[1][0].find('abstract')
    if section1 is None or section2 is None:
        section1 = root1[0][0].find('bodyText')
        section2 = root2[0][0].find('bodyText')
    return section1, section2


def _pair_score(vect, section1, section2):
    """Return the similarity of two XML elements as the text to write out.

    Yields "0" whenever the pair has no comparable text: an element is
    missing, its text is None, or neither text produces a single token.
    """
    if section1 is None or section2 is None:
        return "0"
    if section1.text is None or section2.text is None:
        return "0"
    try:
        tfidf = vect.fit_transform([section2.text, section1.text])
    except ValueError:
        # scikit-learn raises ValueError on an empty vocabulary, e.g. when
        # both texts are whitespace only.
        return "0"
    # TfidfVectorizer L2-normalises rows by default, so the product of the
    # matrix with its transpose holds cosine similarities; [0][1] is the
    # entry for the two documents.  toarray() is used for the dense
    # conversion because the `.A` shorthand is deprecated/removed in newer
    # SciPy releases.
    return str((tfidf * tfidf.T).toarray()[0][1])


# fit_transform re-learns the vocabulary on every call, so one vectorizer
# can be shared by all pairs.
vect = TfidfVectorizer(min_df=1)

# Both files are closed on exit, including when a pair fails to parse.
with open('index.csv', newline='') as csvfile, \
        open('features_k9.txt', 'w') as out_file:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # the first row is skipped (treated as a header)
    for row in spamreader:
        if len(row) < 2:
            # Blank or truncated line: there is no pair to score.
            continue
        root1 = _load_root(row[0])
        root2 = _load_root(row[1])
        section1, section2 = _pick_sections(root1, root2)
        # Every data row produces exactly one output line; pairs without
        # comparable text are scored 0.
        score = _pair_score(vect, section1, section2)
        out_file.write(row[0] + " " + row[1] + " " + score + "\n")