-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathQueryGraph.py
133 lines (107 loc) · 4.47 KB
/
QueryGraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import DataLoader
import pandas as pd
import Select
from collections import defaultdict
class QueryGraph:
    """Join graph of a query: relations are the vertices, join predicates the edges.

    After construction ``self.V`` maps every table alias to its ``Relation``.
    Edges are not stored on the graph itself; each ``Relation`` keeps them in
    its ``neighbors`` dict as ``{other_relation: {(own_column, other_column)}}``.
    """

    def __init__(self, tables, joins, selects, test=False):
        """Load the relations, apply their selections and wire up the join edges.

        Args:
            tables: dict mapping a table alias (str) to a table name (str).
            joins: list of equi-join predicates written as
                ``"alias1.column1 = alias2.column2"``.
            selects: list of selection predicate strings. Each one is assigned
                to the alias found in front of its first ``'.'`` once the outer
                parentheses have been removed.
            test: when true no data is loaded; every relation starts from an
                empty ``DataFrame``.

        Raises:
            AssertionError: if an argument has the wrong type or a join refers
                to an alias that is not a key of ``tables``.
            ValueError: if a join predicate is not of the form
                ``"alias1.column1 = alias2.column2"``.
        """
        assert isinstance(tables, dict) and all(
            isinstance(alias, str) and isinstance(name, str)
            for alias, name in tables.items()
        )
        assert isinstance(joins, list) and all(isinstance(j, str) for j in joins)
        assert isinstance(selects, list) and all(isinstance(s, str) for s in selects)

        selects_for_relation = self._group_selects(selects)

        print('\nRelations:')
        self.V = dict()
        for alias, table_name in tables.items():
            self.V[alias] = self._build_relation(
                alias, table_name, selects_for_relation[alias], test
            )

        # Parse every predicate before touching the graph so that a malformed
        # join is reported before any edge is added.
        parsed_joins = [self._parse_join(j) for j in joins]
        for (alias1, column1), (alias2, column2) in parsed_joins:
            assert alias1 in self.V and alias2 in self.V
            r1, r2 = self.V[alias1], self.V[alias2]
            assert isinstance(r1, Relation) and isinstance(r2, Relation)
            self._add_edge(r1, column1, r2, column2)

        print('\nNeighbors:')
        for alias, relation in self.V.items():
            print(alias, ':', relation.neighbors)

    @staticmethod
    def _group_selects(selects):
        """Group the selection predicates by the alias of the table they filter."""
        grouped = defaultdict(list)
        for s in selects:
            print('Building Select:', s)
            stripped = Select.remove_outer_parentheses(str(s))
            grouped[stripped.split('.')[0]].append(stripped)
        return grouped

    @staticmethod
    def _build_relation(alias, table_name, selections, test):
        """Load one table, apply its selections and wrap it in a ``Relation``."""
        print('Loading', (alias, table_name))
        if test:
            print('THIS IS A TEST!!!')
            df = pd.DataFrame()
        else:
            # NOTE(review): load_pickle presumably unpickles a local file;
            # only point it at trusted data -- confirm against DataLoader.
            df = DataLoader.load_pickle(table_name)
            # Prefix the columns with the alias so that they stay unique when
            # relations are combined later on.
            df.columns = [alias + '_' + c for c in DataLoader.columns[table_name]]
        for selection in selections:
            df = Select.perform_selection(df, selection)
        # Set the name last: every selection hands back a new frame, so an
        # attribute set earlier is not guaranteed to survive.
        df.relation_name = alias
        return Relation(df)

    @staticmethod
    def _parse_join(join):
        """Split ``"a.x = b.y"`` into ``(('a', 'x'), ('b', 'y'))``.

        All whitespace is ignored, not only blanks, so tabs and newlines
        around the operands cannot leak into the column names.
        """
        sides = ''.join(join.split()).split('=')
        if len(sides) != 2:
            raise ValueError(
                'Malformed join predicate {!r}: expected '
                '"alias1.column1 = alias2.column2"'.format(join)
            )
        operands = []
        for side in sides:
            parts = side.split('.')
            if len(parts) < 2 or not parts[0] or not parts[1]:
                raise ValueError(
                    'Malformed join predicate {!r}: {!r} is not of the form '
                    '"alias.column"'.format(join, side)
                )
            # Components after the column are ignored, as they always were.
            operands.append((parts[0], parts[1]))
        return operands[0], operands[1]

    @staticmethod
    def _add_edge(r1, column1, r2, column2):
        """Record the join predicate on both relations (the graph is undirected)."""
        # setdefault hashes the relation once per side instead of twice.
        r1.neighbors.setdefault(r2, set()).add((column1, column2))
        r2.neighbors.setdefault(r1, set()).add((column2, column1))

    def get_relations(self):
        """Return the dict that maps each table alias to its ``Relation``."""
        return self.V

    def get_neighbors(self, R_set):
        """Return the relations adjacent to ``R_set`` that are not part of it.

        Args:
            R_set: frozenset of ``Relation`` objects.

        Returns:
            A ``set`` with every relation that shares a join predicate with at
            least one member of ``R_set``, minus the members themselves.
        """
        assert isinstance(R_set, frozenset) and all(isinstance(r, Relation) for r in R_set)
        return {n for r in R_set for n in r.neighbors} - R_set
class Relation:
    """Vertex of the query graph: a ``DataFrame`` together with its join edges.

    Attributes are write-once (see ``__setattr__``): ``df`` holds the data and
    ``neighbors`` maps an adjacent ``Relation`` to the set of
    ``(own_column, other_column)`` pairs it is joined on. The ``neighbors``
    dict itself stays mutable so that edges can be added after construction.

    Relations are hashable and used as dict keys and set members, so the
    wrapped ``DataFrame`` must not be modified in place after construction.
    """

    def __init__(self, df):
        """Wrap ``df``, which must already carry a ``relation_name`` attribute."""
        assert isinstance(df, pd.DataFrame) and hasattr(df, 'relation_name')
        self.df = df
        self.neighbors = dict()

    def _has_index(self, others):
        """Return ``(found, edges)`` for the members of ``others`` joined to this one.

        ``edges`` maps each neighbour that is also in ``others`` to its set of
        join-column pairs; ``found`` tells whether that mapping is non-empty.
        """
        assert isinstance(others, frozenset) and all(isinstance(r, Relation) for r in others)
        # Iterate over our own neighbours so the keys of the result are always
        # the key objects stored in self.neighbors.
        shared = {r: pairs for r, pairs in self.neighbors.items() if r in others}
        return bool(shared), shared

    def get_index(self, others):
        """Return ``{neighbour: join-column pairs}`` restricted to ``others``."""
        _, shared = self._has_index(others)
        return shared

    def has_index(self, others):
        """Return True if at least one member of ``others`` is a neighbour."""
        found, _ = self._has_index(others)
        return found

    def sample_table(self, n):
        """Return ``n`` randomly sampled rows as a ``DataFrame``.

        When ``n`` exceeds the number of rows the wrapped frame itself is
        returned (not a copy). Otherwise the sample is a new frame that
        carries the same ``relation_name``.
        """
        assert isinstance(n, int)
        if n > self.df.shape[0]:
            return self.df
        sample = self.df.sample(n)
        # DataFrame.sample returns a new object without our custom attribute.
        sample.relation_name = self.df.relation_name
        return sample

    def __setattr__(self, key, value):
        """Allow each attribute to be assigned exactly once."""
        if key in self.__dict__:
            raise AttributeError('Cannot change constant attribute')
        self.__dict__[key] = value

    def __len__(self):
        """Return the number of rows of the wrapped frame."""
        return len(self.df.index)

    def __hash__(self):
        """Hash on the relation name and the set of index labels.

        Building the frozenset is linear in the number of rows and hashing
        happens on every dict/set lookup, so the value is computed once and
        cached.
        """
        try:
            return self.__dict__['_hash']
        except KeyError:
            value = hash(self.df.relation_name) ^ hash(frozenset(self.df.index))
            self.__dict__['_hash'] = value
            return value

    def _edge_signature(self):
        """Describe the edges by neighbour name instead of neighbour object."""
        # Relation names are the keys of QueryGraph.V, hence unique per graph.
        return {r.df.relation_name: pairs for r, pairs in self.neighbors.items()}

    def __eq__(self, other):
        """Two relations are equal if name, data and join edges all match.

        Edges are compared by neighbour *name* and join columns. Comparing the
        ``neighbors`` dicts directly would call ``__eq__`` on the neighbours,
        whose own ``neighbors`` point back here, and never terminate.
        """
        if self is other:
            return True
        if not isinstance(other, Relation):
            return NotImplemented
        # The name is part of the hash, so it has to be part of equality too;
        # otherwise equal objects could hash differently.
        if self.df.relation_name != other.df.relation_name:
            return False
        if not self.df.equals(other.df):
            return False
        return self._edge_signature() == other._edge_signature()

    def __ne__(self, other):
        """Inverse of ``__eq__``, so that ``==`` and ``!=`` can never both hold."""
        return not (self == other)

    def __str__(self):
        return self.df.relation_name

    def __repr__(self):
        return self.df.relation_name