-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata_set.py
234 lines (193 loc) · 10.9 KB
/
data_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import re
import typing
from .attribute import Attribute
from .entity import Entity, EntityLink
from .metric import NumberFormat, Aggregation, SimpleMetric, ComposedMetric
class DataSet():
def __init__(self, entity: Entity, name: str):
"""
An entity with its metrics and recursively linked entities.
Args:
entity: The underlying entity with its attributes and linked other entities
name: The name of the data set.
"""
self.entity = entity
self.name = name
self.entity.data_set = self
self.metrics = {}
self.excluded_paths = set()
self.included_attributes = {}
self.excluded_attributes = {}
def __repr__(self) -> str:
return f'<DataSet "{self.entity.name}">'
def add_simple_metric(self, name: str, description: str, column_name: str, aggregation: Aggregation,
important_field: bool = False,
number_format: NumberFormat = NumberFormat.STANDARD,
more_url: typing.Optional[str] = None,
):
"""
Add a metric that is computed as a direct aggregation on a entity table column
Args:
name: How the metric is displayed in front-ends, e.g. "Revenue after cancellations"
description: A meaningful business definition of the metric
column_name: The column that the aggregation is based on
aggregation: The aggregation method to use
important_field: It refers to key business metrics.
number_format: The way to format a string. Defaults to NumberFormat.STANDARD.
more_url: URL (as string) which should be appended as a `more...` link in the UI.
"""
if name in self.metrics:
raise ValueError(f'Metric "{name}" already exists in data set "{self.name}"')
self.metrics[name] = SimpleMetric(
name=name,
description=description,
data_set=self,
column_name=column_name,
aggregation=aggregation,
important_field=important_field,
number_format=number_format,
more_url=more_url,
)
def add_composed_metric(self, name: str, description: str, formula: str, important_field: bool = False,
number_format: NumberFormat = NumberFormat.STANDARD,
more_url: typing.Optional[str] = None,
):
"""
Add a metric that is based on a list of simple metrics.
Args:
name: How the metric is displayed in front-ends, e.g. "Revenue after cancellations"
description: A meaningful business definition of the metric
formula: How to compute the metric. Examples: [Metric A] + [Metric B], [Metric A] / ([Metric B] + [Metric C])
important_field: It refers to key business metrics.
number_format: The way to format a string. Defaults to NumberFormat.STANDARD.
more_url: URL (as string) which should be appended as a `more...` link in the UI.
"""
if name in self.metrics:
raise ValueError(f'Metric "{name}" already exists in data set "{self.name}"')
# ' [a] \n + [b]' -> '[a] + [b]'
formula_cleaned = re.sub("\s\s+", " ", formula.strip().replace('\n', ''))
# split '[a] + [b]' -> ['', 'a', ' + ', 'b', '']
formula_split = re.split(r'\[(.*?)\]', formula_cleaned)
parent_metrics = []
for metric_name in formula_split[1::2]: # 1::2 start at second, take every 2nd,
if metric_name not in self.metrics:
raise ValueError(f'Could not find metric "{metric_name}" in data set "{self.name}"')
parent_metrics.append(self.metrics[metric_name])
self.metrics[name] = ComposedMetric(name=name,
description=description,
data_set=self,
parent_metrics=parent_metrics,
formula_template='{}'.join(formula_split[0::2]),
important_field=important_field,
number_format=number_format,
more_url=more_url,
)
_PathSpec = typing.TypeVar('_PathSpec', typing.Sequence[typing.Union[str, typing.Tuple[str, str]]], bytes)
def _parse_path(self, entity: Entity, path: _PathSpec) -> typing.Union[
typing.Tuple, typing.Tuple[EntityLink, EntityLink]]:
"""
Helper function for parsing path specifications into a tuple of entity link instances
Args:
entity: the entity for which to resolve the entity links
path: How to get to the entity from the entity of the data set.
A list of either strings (target entity names) or tuples of strings (target entity name + prefix).
Example: ['Entity 1', ('Entity 2', 'Prefix'), 'Entity 3']
"""
if not path:
return ()
if not (isinstance(path[0], str) or (isinstance(path[0], tuple) and len(path[0]) == 2)):
raise TypeError(f'Expecting a string or a tuple of two strings, got: {path[0]}')
target_entity_name, prefix = (path[0], None) if isinstance(path[0], str) else path[0]
entity_link = entity.find_entity_link(target_entity_name, prefix)
return (entity_link,) + self._parse_path(entity_link.target_entity, path[1::])
def exclude_path(self, path: _PathSpec):
"""
Exclude a connected entity from generated data set tables by specifying the entity links to that entity
Args:
path: How to get to the entity from the data set entity.
A list of either strings (target entity names) or tuples of strings (target entity name + prefix).
Example: ['Entity 1', ('Entity 2', 'Prefix'), 'Entity 3']
"""
self.excluded_paths.add(self._parse_path(self.entity, path))
def exclude_attributes(self, path: _PathSpec, attribute_names: [str] = None):
"""
Exclude attributes of a connected entity in generated data set tables.
Args:
path: How to get to the entity from the data set entity.
A list of either strings (target entity names) or tuples of strings (target entity name + prefix).
Example: ['Entity 1', ('Entity 2', 'Prefix'), 'Entity 3']
attribute_names: A list of name of attributes to be excluded. If not provided, then exclude all attributes
"""
entity_links = self._parse_path(self.entity, path)
entity = entity_links[-1].target_entity
if not attribute_names:
self.excluded_attributes[entity_links] = entity.attributes
else:
self.excluded_attributes[entity_links] = [entity.find_attribute(attribute_name) for attribute_name in
attribute_names]
def include_attributes(self, path: _PathSpec, attribute_names: [str]):
"""
Exclude all attributes except the explicitly included ones of a connected entity in generated data set tables.
Args:
path: How to get to the entity from the data set entity.
A list of either strings (target entity names) or tuples of strings (target entity name + prefix).
Example: ['Entity 1', ('Entity 2', 'Prefix'), 'Entity 3']
attribute_names: A list of name of attributes to be included.
"""
entity_links = self._parse_path(self.entity, path)
self.included_attributes[entity_links] = [entity_links[-1].target_entity.find_attribute(attribute_name) for
attribute_name in attribute_names]
def paths_to_connected_entities(self) -> [(EntityLink,)]:
"""
Get all possible paths to connected entities (tuples of entity links)
- that are not explicitly excluded
- that are are not beyond the max link depth or that are explicitly included
"""
paths = []
def _append_path_including_subpaths(paths, path) -> typing.List[typing.Tuple[EntityLink]]:
"""Append a path and its subpaths to the list of paths, if they do not already exist. A subpath always starts
at the beginning of the path: (1,2,3) -> (1,), (1,2), (1,2,3)
"""
for i in range(len(path)):
if path[:i + 1] not in paths:
paths.append(path[:i + 1])
return paths
def traverse_graph(entity: Entity, current_path: tuple):
for entity_link in entity.entity_links:
path = current_path + (entity_link,)
if (entity_link not in current_path # check for circles in path
and (path not in self.excluded_paths) # explicitly excluded paths
):
_append_path_including_subpaths(paths, path)
traverse_graph(entity_link.target_entity, path)
traverse_graph(self.entity, ())
return paths
def connected_attributes(self, include_personal_data: bool = True) -> {(EntityLink,): {str: Attribute}}:
"""
Returns all attributes with their prefixed name from all connected entities.
Args:
include_personal_data: If False, then exclude fields that are marked as personal data
Returns:
A dictionary with the paths as keys and dictionaries of prefixed attribute names and
attributes as values. Example:
{(<EntityLink 1>, <EntityLink 2): {'Prefixed attribute 1 name': <Attribute 1>,
'Prefixed attribute 2 name': <Attribute 2>},
..}
"""
result = {(): {attribute.prefixed_name(): attribute for attribute in self.entity.attributes}}
for path in self.paths_to_connected_entities():
result[path] = {}
entity = path[-1].target_entity
for attribute in entity.attributes:
if ((path in self.included_attributes and attribute in self.included_attributes[path])
or (path not in self.included_attributes)) \
and ((path in self.excluded_attributes and attribute not in self.excluded_attributes[path])
or (path not in self.excluded_attributes)) \
and attribute.accessible_via_entity_link and (
include_personal_data or not attribute.personal_data):
result[path][attribute.prefixed_name(path)] = attribute
return result
def id(self):
"""Returns a representation that can be used in urls"""
from html import escape
return escape(self.name.replace(' ', '_').lower())