-
Notifications
You must be signed in to change notification settings - Fork 38
/
release_table.py
212 lines (174 loc) · 10.5 KB
/
release_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import logging
import re
import sys
from datetime import datetime
from re import Match
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
from liquid import Template
"""Fetch release-level data from an HTML table in a web page.
This script works based on a definition provided in the product's frontmatter to locate the table and extract the
necessary information. Available configuration options are:
- selector (mandatory, no default): A CSS selector used to locate one or more tables in the page.
- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
- render_javascript_click_selector (optional, default = None): A playwright selector used to click on an element after
the JavaScript rendering. Only use when render_javascript is true.
- render_javascript_wait_until (optional, default = None): Argument to pass to Playwright, one of "commit",
"domcontentloaded", "load", or "networkidle". Only use when render_javascript is true and if the script fails without it.
- ignore_empty_releases (optional, default = false): A boolean value indicating whether to ignore releases with no
fields except the name.
- fields: A dictionary that maps release fields to the table's columns. Field definitions include:
- column (mandatory): The name or index (starts at 1) of the column in the table.
- type (optional, default = string): The type of the field. Supported types are:
- string: The raw string value.
- date : A full or year-month date (supported patterns available in common.dates).
- range : Convert a comma-separated list of values into a range, only keeping the first and last value.
For example, "1.0, 1.1, 1.2" becomes "1.0 - 1.2".
If the field is one of the known date fields, the type is always set to 'date', even if another type is provided.
- regex (optional, default = [DEFAULT_REGEX]): A regular expression, or a list of regular expressions, used to
validate allowed values. Note that the default value for the releaseCycle field is not DEFAULT_REGEX, but
DEFAULT_RELEASE_REGEX.
- regex_exclude (optional, default = []): A regular expression, or a list of regular expressions, used to exclude
values even if they match any regular expression in 'regex'.
- template (optional, default = DEFAULT_TEMPLATE): A liquid template used to clean up the value using the matched
groups from a 'regex'.
Note that defining the column attribute directly instead of its full definition is allowed when
the column name or index is the only attribute. For example, this:
```
fields:
releaseCycle:
column: "End of life"
```
can be replaced with this:
```
fields:
releaseCycle: "End of life"
```
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
# Method name passed to endoflife.list_configs() to list the configurations handled by this script.
METHOD = "release_table"
# Field types accepted in a field definition; any other type is rejected by Field.
SUPPORTED_TYPES = ["date", "string", "range"]
# Types whose values are parsed as dates.
DATE_TYPES = ["date"]
# Fields whose type is forced to "date" by Field, whatever type is configured.
DATE_FIELDS = ["releaseDate", "lts", "eoas", "eol", "eoes"]
# Default include regex: captures the value (at least one character) in the 'value' group.
DEFAULT_REGEX = r"^(?P<value>.+)$"
# Default liquid template: renders the 'value' group as is.
DEFAULT_TEMPLATE = "{{value}}"
# Default include regex for the releaseCycle field: dot-separated numbers with an optional 'v' prefix.
DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)*)$"
# Separator used to split 'range' values: a comma surrounded by optional whitespace.
RANGE_LIST_SEPARATOR_PATTERN = re.compile(r"\s*,\s*")
# Reference date, computed once when the script starts, used to drop releases dated after it.
TODAY = dates.today()
class Field:
    """Describe how one release field is read from a table column.

    A field knows which column to read (by lower-cased header name or by 0-based index), the type of its
    value, the regular expressions used to accept or reject raw cell values, and the liquid template used to
    clean up accepted values.
    """

    def __init__(self, name: str, definition: str | int | dict) -> None:
        """Build a field from its name and its definition.

        Args:
            name: The release field name (for example "releaseCycle" or "eol").
            definition: Either the full definition (a dictionary with at least a "column" key), or directly
                the column name (str) or 1-based column index (int).

        Raises:
            KeyError: If the definition has no "column" key.
            ValueError: If the type is not one of SUPPORTED_TYPES (and the field is not a known date field).
        """
        # Directly specifying the column name or index instead of its full definition is allowed.
        # In this case we must convert it to a full definition.
        if isinstance(definition, (str, int)):
            definition = {"column": definition}
        else:
            # Work on a shallow copy: the defaults injected below must not leak into the caller's configuration.
            definition = dict(definition)

        self.name = name
        if self.name == "releaseCycle":
            # The release cycle is always a string, and has its own default regex.
            definition["type"] = "string"
            definition.setdefault("regex", [DEFAULT_RELEASE_REGEX])
            definition.setdefault("template", DEFAULT_TEMPLATE)

        self.is_index = isinstance(definition["column"], int)
        if self.is_index:
            self.column = definition["column"] - 1  # convert to 0-based index
        else:
            # Lower-cased so that it can be compared with lower-cased table headers.
            self.column = definition["column"].lower()

        self.type = definition.get("type", "string")
        if self.name in DATE_FIELDS and self.type not in DATE_TYPES:
            self.type = "date"  # override type for known date fields
        elif self.type not in SUPPORTED_TYPES:
            msg = f"unsupported type: {self.type} for field {self.name}"
            raise ValueError(msg)

        # Both 'regex' and 'regex_exclude' accept a single pattern or a list of patterns.
        regex = definition.get("regex", [DEFAULT_REGEX])
        regex = regex if isinstance(regex, list) else [regex]
        self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]

        exclude_regex = definition.get("regex_exclude", [])
        exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
        self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]

        # No template is used only when 'regex' is an empty list and no template was configured.
        self.template = Template(definition.get("template", DEFAULT_TEMPLATE)) \
            if "template" in definition or regex else None

    def extract_from(self, raw_value: str) -> str | datetime | None:
        """Extract the field's value from a raw cell value.

        Patterns are applied with `match`, so they are tested against the beginning of the raw value.

        Args:
            raw_value: The text of the table cell.

        Returns:
            The processed value for the first matching include pattern, or None when the raw value matches
            an exclude pattern, or when this is the releaseCycle field and no include pattern matches.

        Raises:
            ValueError: If no include pattern matches and the field is not releaseCycle. A ValueError raised
                while processing the matched value (for example when parsing a date) is not caught here.
        """
        for exclude_pattern in self.exclude_version_patterns:
            if exclude_pattern.match(raw_value):
                return None

        for include_pattern in self.include_version_patterns:
            match = include_pattern.match(raw_value)
            if match:
                return self.__process_value(match, raw_value)

        if self.name == "releaseCycle":
            return None  # skipping entire rows is allowed

        msg = f"field {self}'s value '{raw_value}' does not match any regex in {self.include_version_patterns}"
        raise ValueError(msg)

    def __process_value(self, match: Match[str], raw_value: str) -> str | datetime:
        """Render the template with the matched named groups, then convert the result according to the type.

        Args:
            match: The match of an include pattern against the raw value.
            raw_value: The raw cell value, used as is when the field has no template.

        Returns:
            The parsed date for 'date' fields, "<first> - <last>" for 'range' fields holding more than one
            item, and the rendered string otherwise.
        """
        str_value = self.template.render(**match.groupdict()) if self.template else raw_value

        if self.type == "date":
            try:
                return dates.parse_date(str_value)
            except ValueError:
                # Fall back to the year-month parser when the full date parser rejects the value.
                return dates.parse_month_year_date(str_value)

        if self.type == "range":
            items = RANGE_LIST_SEPARATOR_PATTERN.split(str_value)
            # Only the first and last items are kept; a single item is returned unchanged.
            return f"{items[0]} - {items[-1]}" if len(items) > 1 else str_value

        return str_value

    def __repr__(self) -> str:
        """Return the field as "name(column)", as used in log and error messages."""
        return f"{self.name}({self.column})"
def _column_index(field: Field, headers: list[str]) -> int:
    """Return the 0-based position of the field's column.

    Index-based fields already hold their position; name-based fields are looked up in the lower-cased
    headers, which raises ValueError when the column is missing.
    """
    return field.column if field.is_index else headers.index(field.column)


# Optional command-line arguments: a product filter, then a second filter, both passed to list_configs().
p_filter = sys.argv[1] if len(sys.argv) > 1 else None
m_filter = sys.argv[2] if len(sys.argv) > 2 else None
for config in endoflife.list_configs(p_filter, METHOD, m_filter):
    with releasedata.ProductData(config.product) as product_data:
        render_javascript = config.data.get("render_javascript", False)
        render_javascript_click_selector = config.data.get("render_javascript_click_selector", None)
        render_javascript_wait_until = config.data.get("render_javascript_wait_until", None)
        ignore_empty_releases = config.data.get("ignore_empty_releases", False)
        header_row_selector = config.data.get("header_selector", "thead tr")
        rows_selector = config.data.get("rows_selector", "tbody tr")
        cells_selector = "td, th"
        # releaseCycle is mandatory and handled separately, so it is removed from the remaining fields.
        release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
        fields = [Field(name, definition) for name, definition in config.data["fields"].items()]

        if render_javascript:
            response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector,
                                                      wait_until=render_javascript_wait_until)
        else:
            response_text = http.fetch_url(config.url).text
        soup = BeautifulSoup(response_text, features="html5lib")

        for table in soup.select(config.data["selector"]):
            header_row = table.select_one(header_row_selector)
            if not header_row:
                logging.info(f"skipping table with attributes {table.attrs}: no header row found")
                continue

            headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)]
            logging.info(f"processing table with headers {headers}")

            try:
                # Resolve every field (releaseCycle included) the same way, so that releaseCycle can also be
                # declared by column index. A missing named column raises ValueError and skips the table.
                fields_index = {field.name: _column_index(field, headers) for field in [release_cycle_field, *fields]}
                min_column_count = max(fields_index.values()) + 1

                for row in table.select(rows_selector):
                    cells = [cell.get_text().strip() for cell in row.select(cells_selector)]
                    if len(cells) < min_column_count:
                        logging.info(f"skipping row {cells}: not enough columns")
                        continue

                    raw_release_name = cells[fields_index[release_cycle_field.name]]
                    release_name = release_cycle_field.extract_from(raw_release_name)
                    if not release_name:
                        logging.info(f"skipping row {cells}: invalid release cycle '{raw_release_name}', "
                                     f"should match one of {release_cycle_field.include_version_patterns} "
                                     f"and not match all of {release_cycle_field.exclude_version_patterns}")
                        continue

                    release = product_data.get_release(release_name)
                    for field in fields:
                        raw_field = cells[fields_index[field.name]]
                        try:
                            release.set_field(field.name, field.extract_from(raw_field))
                        except ValueError as e:
                            # An invalid cell only skips that field; the rest of the row is still processed.
                            logging.info(f"skipping cell {raw_field} for {release}: {e}")

                    if ignore_empty_releases and release.is_empty():
                        logging.info(f"removing empty release '{release}'")
                        product_data.remove_release(release_name)

                    if release.is_released_after(TODAY):
                        logging.info(f"removing future release '{release}'")
                        product_data.remove_release(release_name)

            except ValueError as e:
                logging.info(f"skipping table with headers {headers}: {e}")