-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateNodeTable.py
86 lines (65 loc) · 2.55 KB
/
createNodeTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import xml.etree.ElementTree as ET
import pandas as pd
import threading
from tqdm import tqdm
# Module-level state shared by the save* functions and createNodes().
# ``count`` is the id given to the next node that gets appended to ``df``.
count = 0
# Passed to the saver threads; they hold it while appending to ``df``.
lock = threading.Lock()
# Node table: one row per node with its numeric id, label and role.
df = pd.DataFrame(columns=["id", "label", "role"])
def savePatent(patent, lock):
    """Add *patent* to the global node table unless its label is already there.

    The label is compared against every existing row, whatever its role.
    A new row receives the current value of the global ``count`` as its id
    and the role ``"patent"``; ``count`` is then incremented.

    Args:
        patent: Label of the patent node.
        lock: Lock guarding the global ``df`` and ``count``.
    """
    global df, count
    # Do the membership test and the append under a single lock hold so that
    # another saver thread cannot rebind ``df`` in between; ``with`` also
    # guarantees the lock is released if pandas raises.
    with lock:
        if df['label'].isin([patent]).any():
            return
        row = pd.DataFrame([{'id': count, 'label': patent, 'role': "patent"}])
        # pd.concat is the public replacement for the private DataFrame._append.
        df = pd.concat([df, row], ignore_index=True)
        count += 1
def saveApplicants(applicants, lock):
    """Add each applicant to the global node table once.

    An applicant is skipped when a row with the same label and the role
    ``"applicant"`` already exists. Each new row receives the current value
    of the global ``count`` as its id; ``count`` is then incremented.

    Args:
        applicants: Iterable of applicant labels.
        lock: Lock guarding the global ``df`` and ``count``.
    """
    global df, count
    for applicant in applicants:
        # Test and append under one lock hold so another saver thread cannot
        # rebind ``df`` in between; ``with`` releases the lock on any error.
        with lock:
            same_label = df['label'] == applicant
            same_role = df['role'] == "applicant"
            if (same_label & same_role).any():
                continue
            row = pd.DataFrame(
                [{'id': count, 'label': applicant, 'role': "applicant"}]
            )
            # pd.concat is the public replacement for the private
            # DataFrame._append.
            df = pd.concat([df, row], ignore_index=True)
            count += 1
def saveInventors(inventors, lock):
    """Add each inventor to the global node table once.

    An inventor is skipped when a row with the same label and the role
    ``"inventor"`` already exists. Each new row receives the current value
    of the global ``count`` as its id; ``count`` is then incremented.

    Args:
        inventors: Iterable of inventor labels.
        lock: Lock guarding the global ``df`` and ``count``.
    """
    global df, count
    for inventor in inventors:
        # Test and append under one lock hold so another saver thread cannot
        # rebind ``df`` in between; ``with`` releases the lock on any error.
        with lock:
            same_label = df['label'] == inventor
            same_role = df['role'] == "inventor"
            if (same_label & same_role).any():
                continue
            row = pd.DataFrame(
                [{'id': count, 'label': inventor, 'role': "inventor"}]
            )
            # pd.concat is the public replacement for the private
            # DataFrame._append.
            df = pd.concat([df, row], ignore_index=True)
            count += 1
def createNodes():
    """Build the node table from the XML files in ``./patents`` and save it.

    For every ``*.xml`` file the title (``title.lattes`` field, upper-cased),
    the ``inventor`` fields and the ``applicant`` fields are read from the
    ``value`` attribute of the matching ``<field>`` elements. The three
    groups are handed to ``savePatent``, ``saveApplicants`` and
    ``saveInventors``, each in its own thread, and the threads are joined
    before the next file is processed. Finally the global ``df`` is written
    to ``./tables/nodeTable.csv`` without the index column.
    """
    patents_dir = "./patents"
    # List the directory once and keep only XML files so the progress bar
    # total matches the number of files actually processed.
    xml_files = [name for name in os.listdir(patents_dir) if name.endswith(".xml")]

    with tqdm(total=len(xml_files), desc="Creating node table") as progress_bar:
        for file in xml_files:
            root = ET.parse(os.path.join(patents_dir, file)).getroot()

            # If several title fields exist the last one wins; a file with
            # no title keeps the empty label.
            patent = ""
            for field in root.findall('.//field[@name="title.lattes"]'):
                title = field.get('value')
                # A title field without a ``value`` attribute is ignored
                # instead of raising AttributeError on ``None.upper()``.
                if title is not None:
                    patent = title.upper()

            inventors = [
                field.get('value')
                for field in root.findall('.//field[@name="inventor"]')
            ]
            applicants = [
                field.get('value')
                for field in root.findall('.//field[@name="applicant"]')
            ]

            workers = [
                threading.Thread(target=savePatent, args=(patent, lock)),
                threading.Thread(target=saveApplicants, args=(applicants, lock)),
                threading.Thread(target=saveInventors, args=(inventors, lock)),
            ]
            for worker in workers:
                worker.start()
            # Wait for all three so the table is complete for this file
            # before moving on.
            for worker in workers:
                worker.join()

            progress_bar.update(1)

    # Save a CSV file
    print("Saving CSV file...")
    # Make sure the output directory exists; to_csv does not create it.
    os.makedirs('./tables', exist_ok=True)
    df.to_csv('./tables/nodeTable.csv', index=False)
    print("Completed!")