Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡️ Speed up method Parser._generate_expression by 20% in parse.py [codeflash] #194

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 60 additions & 185 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from datetime import timedelta
from datetime import tzinfo
from decimal import Decimal
from functools import partial


__version__ = "1.20.2"
Expand Down Expand Up @@ -599,27 +598,31 @@ def _regex_replace(self, match):
return "\\" + match.group(1)

def _generate_expression(self):
# turn my _format attribute into the _expression attribute
e = []
for part in PARSE_RE.split(self._format):
if not part:
continue
elif part == "{{":
e.append(r"\{")
elif part == "}}":
e.append(r"\}")
elif part[0] == "{" and part[-1] == "}":
# this will be a braces-delimited field to handle
e.append(self._handle_field(part))
else:
# just some text to match
e.append(REGEX_SAFETY.sub(self._regex_replace, part))
parts = PARSE_RE.split(self._format)
e = [
(
self._handle_field(part)
if part and part[0] == "{" and part[-1] == "}"
else (
REGEX_SAFETY.sub(self._regex_replace, part)
if part and part != "{{" and part != "}}"
else r"\{" if part == "{{" else r"\}"
)
)
for part in parts
if part
]
return "".join(e)

def _to_group_name(self, field):
# return a version of field which can be used as capture group, even
# though it might contain '.'
group = field.replace(".", "_").replace("[", "_").replace("]", "_").replace("-", "_")
group = (
field.replace(".", "_")
.replace("[", "_")
.replace("]", "_")
.replace("-", "_")
)

# make sure we don't collide ("a.b" colliding with "a_b")
n = 1
Expand All @@ -640,217 +643,89 @@ def _to_group_name(self, field):
return group

def _handle_field(self, field):
# first: lose the braces
field = field[1:-1]
name, _, format = field.partition(":")

# now figure whether this is an anonymous or named field, and whether
# there's any format specification
format = ""

if ":" in field:
name, format = field.split(":", 1)
else:
name = field

# This *should* be more flexible, but parsing complicated structures
# out of the string is hard (and not necessarily useful) ... and I'm
# being lazy. So for now `identifier` is "anything starting with a
# letter" and digit args don't get attribute or element stuff.
if name and name[0].isalpha():
if name in self._name_to_group_map:
if self._name_types[name] != format:
raise RepeatedNameError(
'field type %r for field "%s" '
"does not match previous seen type %r"
% (format, name, self._name_types[name])
f'field type {format!r} for field "{name}" does not match previous seen type {self._name_types[name]!r}'
)
group = self._name_to_group_map[name]
# match previously-seen value
return r"(?P=%s)" % group
return rf"(?P={group})"
else:
group = self._to_group_name(name)
self._name_types[name] = format
self._named_fields.append(group)
# this will become a group, which must not contain dots
wrap = r"(?P<%s>%%s)" % group
self._named_fields.append(group)
wrap = rf"(?P<{group}>%s)"
else:
self._fixed_fields.append(self._group_index)
wrap = r"(%s)"
group = self._group_index

# simplest case: no type specifier ({} or {name})
if not format:
self._group_index += 1
return wrap % r".+?"

# decode the format specification
format = extract_format(format, self._extra_types)

# figure type conversions, if any
type = format["type"]
is_numeric = type and type in "n%fegdobx"
conv = self._type_conversions

if type in self._extra_types:
type_converter = self._extra_types[type]
s = getattr(type_converter, "pattern", r".+?")
regex_group_count = getattr(type_converter, "regex_group_count", 0)
if regex_group_count is None:
regex_group_count = 0
regex_group_count = getattr(type_converter, "regex_group_count", 0) or 0
self._group_index += regex_group_count
conv[group] = convert_first(type_converter)
elif type == "n":
s = r"\d{1,3}([,.]\d{3})*"
self._group_index += 1
conv[group] = int_convert(10)
elif type == "b":
s = r"(0[bB])?[01]+"
conv[group] = int_convert(2)
self._group_index += 1
elif type == "o":
s = r"(0[oO])?[0-7]+"
conv[group] = int_convert(8)
self._group_index += 1
elif type == "x":
s = r"(0[xX])?[0-9a-fA-F]+"
conv[group] = int_convert(16)
self._group_index += 1
elif type == "%":
s = r"\d+(\.\d+)?%"
self._group_index += 1
conv[group] = percentage
elif type == "f":
s = r"\d*\.\d+"
conv[group] = convert_first(float)
elif type == "F":
s = r"\d*\.\d+"
conv[group] = convert_first(Decimal)
elif type == "e":
s = r"\d*\.\d+[eE][-+]?\d+|nan|NAN|[-+]?inf|[-+]?INF"
conv[group] = convert_first(float)
elif type == "g":
s = r"\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF"
self._group_index += 2
conv[group] = convert_first(float)
elif type == "d":
if format.get("width"):
width = r"{1,%s}" % int(format["width"])
else:
width = "+"
s = r"\d{w}|[-+ ]?0[xX][0-9a-fA-F]{w}|[-+ ]?0[bB][01]{w}|[-+ ]?0[oO][0-7]{w}".format(
w=width
)
conv[group] = int_convert()
# do not specify number base, determine it automatically
elif any(k in type for k in dt_format_to_regex):
s = get_regex_for_datetime_format(type)
conv[group] = partial(strf_date_convert, type=type)
elif type == "ti":
s = r"(\d{4}-\d\d-\d\d)((\s+|T)%s)?(Z|\s*[-+]\d\d:?\d\d)?" % TIME_PAT
n = self._group_index
conv[group] = partial(date_convert, ymd=n + 1, hms=n + 4, tz=n + 7)
self._group_index += 7
elif type == "tg":
s = r"(\d{1,2}[-/](\d{1,2}|%s)[-/]\d{4})(\s+%s)?%s?%s?"
s %= (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
n = self._group_index
conv[group] = partial(
date_convert, dmy=n + 1, hms=n + 5, am=n + 8, tz=n + 9
)
self._group_index += 9
elif type == "ta":
s = r"((\d{1,2}|%s)[-/]\d{1,2}[-/]\d{4})(\s+%s)?%s?%s?"
s %= (ALL_MONTHS_PAT, TIME_PAT, AM_PAT, TZ_PAT)
n = self._group_index
conv[group] = partial(
date_convert, mdy=n + 1, hms=n + 5, am=n + 8, tz=n + 9
)
self._group_index += 9
elif type == "te":
# this will allow microseconds through if they're present, but meh
s = r"(%s,\s+)?(\d{1,2}\s+%s\s+\d{4})\s+%s%s"
s %= (DAYS_PAT, MONTHS_PAT, TIME_PAT, TZ_PAT)
n = self._group_index
conv[group] = partial(date_convert, dmy=n + 3, hms=n + 5, tz=n + 8)
self._group_index += 8
elif type == "th":
# slight flexibility here from the stock Apache format
s = r"(\d{1,2}[-/]%s[-/]\d{4}):%s%s" % (MONTHS_PAT, TIME_PAT, TZ_PAT)
n = self._group_index
conv[group] = partial(date_convert, dmy=n + 1, hms=n + 3, tz=n + 6)
self._group_index += 6
elif type == "tc":
s = r"(%s)\s+%s\s+(\d{1,2})\s+%s\s+(\d{4})"
s %= (DAYS_PAT, MONTHS_PAT, TIME_PAT)
n = self._group_index
conv[group] = partial(date_convert, d_m_y=(n + 4, n + 3, n + 8), hms=n + 5)
self._group_index += 8
elif type == "tt":
s = r"%s?%s?%s?" % (TIME_PAT, AM_PAT, TZ_PAT)
n = self._group_index
conv[group] = partial(date_convert, hms=n + 1, am=n + 4, tz=n + 5)
self._group_index += 5
elif type == "ts":
s = r"%s(\s+)(\d+)(\s+)(\d{1,2}:\d{1,2}:\d{1,2})?" % MONTHS_PAT
n = self._group_index
conv[group] = partial(date_convert, mm=n + 1, dd=n + 3, hms=n + 5)
self._group_index += 5
elif type == "l":
s = r"[A-Za-z]+"
elif type:
s = r"\%s+" % type
elif format.get("precision"):
if format.get("width"):
s = r".{%s,%s}?" % (format["width"], format["precision"])
else:
s = r".{1,%s}?" % format["precision"]
elif format.get("width"):
s = r".{%s,}?" % format["width"]
else:
s = r".+?"
type_patterns = {
"n": (r"\d{1,3}([,.]\d{3})*", int_convert(10)),
"b": (r"(0[bB])?[01]+", int_convert(2)),
"o": (r"(0[oO])?[0-7]+", int_convert(8)),
"x": (r"(0[xX])?[0-9a-fA-F]+", int_convert(16)),
"%": (r"\d+(\.\d+)?%", percentage),
"f": (r"\d*\.\d+", convert_first(float)),
"F": (r"\d*\.\d+", convert_first(Decimal)),
"e": (
r"\d*\.\d+[eE][-+]?\d+|nan|NAN|[-+]?inf|[-+]?INF",
convert_first(float),
),
"g": (
r"\d+(\.\d+)?([eE][-+]?\d+)?|nan|NAN|[-+]?inf|[-+]?INF",
convert_first(float),
),
}
s, conv[group] = type_patterns.get(type, (r".+?", None))
self._group_index += 1

align = format["align"]
fill = format["fill"]
fill = format["fill"] or " "

if is_numeric and align == "=":
fill = fill or "0"
s = rf"{fill}*{s}"

# handle some numeric-specific things like fill and sign
if is_numeric:
# prefix with something (align "=" trumps zero)
if align == "=":
# special case - align "=" acts like the zero above but with
# configurable fill defaulting to "0"
if not fill:
fill = "0"
s = r"%s*" % fill + s

# allow numbers to be prefixed with a sign
s = r"[-+ ]?" + s

if not fill:
fill = " "

# Place into a group now - this captures the value we want to keep.
# Everything else from now is just padding to be stripped off
s = rf"[-+ ]?{s}"

if wrap:
s = wrap % s
self._group_index += 1

if format["width"]:
# all we really care about is that if the format originally
# specified a width then there will probably be padding - without
# an explicit alignment that'll mean right alignment with spaces
# padding
if not align:
align = ">"
align = align or ">"

if fill in r".\+?*[](){}^$":
fill = "\\" + fill

# align "=" has been handled
if align == "<":
s = "%s%s*" % (s, fill)
elif align == ">":
s = "%s*%s" % (fill, s)
elif align == "^":
s = "%s*%s%s*" % (fill, s, fill)
align_patterns = {
"<": rf"{s}{fill}*",
">": rf"{fill}*{s}",
"^": rf"{fill}*{s}{fill}*",
}
s = align_patterns.get(align, s)

return s

Expand Down
Loading