-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgoogle-sames-diffs.py
146 lines (128 loc) · 4.43 KB
/
google-sames-diffs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import sys
import csv
import json
import time
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv
from pipeline.config import Config
# Bootstrap the LUX pipeline configuration from the environment.
load_dotenv()
basepath = os.getenv("LUX_BASEPATH", "")
cfgs = Config(basepath=basepath)
idmap = cfgs.get_idmap()
cfgs.cache_globals()
cfgs.instantiate_all()

# If modifying these scopes, delete the file token.json.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]

# The target spreadsheet id lives in a one-line text file in the data dir.
sheet_id_path = os.path.join(cfgs.data_dir, "google_sheet_id.txt")
with open(sheet_id_path) as sid_fh:
    SPREADSHEET_ID = sid_fh.read().strip()

# Accumulators for the two relationship sheets, paired with their tab names.
diffs = []
sames = []
SHEET_NAMES = [["Different From", diffs], ["Same As", sames]]
RANGE_START = 2  # row 1 is the header row
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time. Both files live in the pipeline data directory.
tokfn = os.path.join(cfgs.data_dir, "token.json")
credfn = os.path.join(cfgs.data_dir, "credentials.json")
if os.path.exists(tokfn):
    creds = Credentials.from_authorized_user_file(tokfn, SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        # Refresh the expired access token using the stored refresh token.
        creds.refresh(Request())
    else:
        # No usable token at all: run the full interactive OAuth flow.
        flow = InstalledAppFlow.from_client_secrets_file(credfn, SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run -- deliberately outside the
    # if/else above so a refreshed token is persisted too, not just a
    # freshly authorized one.
    with open(tokfn, "w") as token:
        token.write(creds.to_json())
try:
    service = build("sheets", "v4", credentials=creds)
    sheet = service.spreadsheets()
    # Page through each relationship tab 500 rows at a time, canonicalizing
    # both URIs of every row into the tab's accumulator list.
    for sn, my_map in SHEET_NAMES:
        page = RANGE_START
        while True:
            # A1 ranges are inclusive at both ends, so A{page}:B{page+499}
            # is exactly 500 rows; the original page+500 span overlapped the
            # next page by one row, duplicating every boundary row.
            rng = f"{sn}!A{page}:B{page + 499}"
            result = sheet.values().get(spreadsheetId=SPREADSHEET_ID, range=rng).execute()
            values = result.get("values", [])
            if not values:
                break
            for row in values:
                # The Sheets API omits trailing empty cells, so a row with a
                # blank column B arrives as a 1-element list; skip it rather
                # than crash on unpacking.
                if len(row) < 2:
                    print(f"Skipping incomplete row: {row}")
                    continue
                uria, urib = row[0], row[1]
                # Canonicalize URIs
                uriaf = cfgs.canonicalize(uria)
                uribf = cfgs.canonicalize(urib)
                if not uriaf:
                    print(f"Failed to canonicalize {uria}")
                elif not uribf:
                    print(f"Failed to canonicalize {urib}")
                else:
                    my_map.append([uriaf, uribf])
            page += 500
except HttpError as err:
    print(f"Trapped error: {err}")
# Now write the accumulated URI pairs to CSVs.
# newline="" is required by the csv module when handing it a file object;
# without it the writer emits blank interleaved rows on Windows.
for relpath, rows in (("differentFrom/google.csv", diffs), ("sameAs/google.csv", sames)):
    outfn = os.path.join(cfgs.data_dir, relpath)
    with open(outfn, "w", newline="") as fh:
        writer = csv.writer(fh)
        writer.writerows(rows)
# Later we'll call load-csv-map2.py on them
### Now fetch the Fixes sheet: 6 required columns plus an optional argument.
fixes = []
page = RANGE_START
try:
    while True:
        # Inclusive A1 range of exactly 500 rows (see the pagination note in
        # the sames/diffs loop above: page+500 would re-fetch the boundary row).
        rng = f"Fixes!A{page}:G{page + 499}"
        result = sheet.values().get(spreadsheetId=SPREADSHEET_ID, range=rng).execute()
        values = result.get("values", [])
        if not values:
            break
        for row in values:
            # Trailing empty cells are omitted by the API; a row with fewer
            # than the 6 required columns cannot be split, so report and skip
            # it -- the original fell through and appended stale/undefined
            # values after the failed unpack.
            if len(row) < 6:
                print("FAILED to split row from Google Sheet")
                print(row)
                continue
            src, ident, clss, equiv, path, op = row[:6]
            arg = row[6] if len(row) > 6 else ""
            fixes.append(
                {
                    "source": src.strip(),
                    "identifier": ident.strip(),
                    "class": clss.strip(),
                    "equivalent": equiv.strip(),
                    "path": path.strip(),
                    "operation": op.strip(),
                    "argument": arg.strip(),
                }
            )
        page += 500
except HttpError as err:
    # Mirror the error handling of the sames/diffs fetch above.
    print(f"Trapped error: {err}")
# Persist the collected fixes for downstream consumption.
dfn = os.path.join(cfgs.data_dir, "xpath_fixes.json")
with open(dfn, "w") as ofh:
    json.dump(fixes, ofh)