0.0.161
joocer committed Jun 16, 2024
1 parent 74ab612 commit fab84bf
Showing 14 changed files with 211 additions and 45 deletions.
7 changes: 4 additions & 3 deletions Makefile
@@ -1,9 +1,10 @@
lint:
python -m pip install --quiet --upgrade pycln isort black yamllint
# python -m yamllint .
python -m pip install --quiet --upgrade pycln isort ruff yamllint
# python -m yamllint .
python -m ruff check --fix --exit-zero
python -m pycln .
python -m isort .
python -m black .
python -m ruff format orso

update:
python -m pip install --quiet --upgrade -r requirements.txt
6 changes: 4 additions & 2 deletions orso/dataframe.py
@@ -75,7 +75,7 @@ def __init__(
first_dict = next(dicts)

# if we have an explicit schema, use that, otherwise guess from the first entry
self._schema = [str(k) for k in first_dict.keys()]
self._schema = [str(k) for k in first_dict]
self._row_factory = Row.create_class(self._schema)
keys = list(first_dict.keys())

@@ -189,7 +189,9 @@ def materialize(self):
def distinct(self) -> "DataFrame":
seen = set()
unique_rows = [
x for x in self._rows if hash(x) not in seen and not seen.add(hash(x)) # type:ignore
x
for x in self._rows
if hash(x) not in seen and not seen.add(hash(x)) # type:ignore
]
return DataFrame(rows=unique_rows, schema=self._schema)
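The reshaped comprehension leans on set.add() returning None: "not seen.add(...)" is always truthy, so the call only records the hash as a side effect and the first occurrence of each row survives. A standalone sketch with made-up rows:

    seen = set()
    rows = [("a", 1), ("b", 2), ("a", 1)]
    unique = [r for r in rows if hash(r) not in seen and not seen.add(hash(r))]
    # unique == [("a", 1), ("b", 2)]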

11 changes: 2 additions & 9 deletions orso/display.py
@@ -206,7 +206,6 @@ def numpy_type_mapper(value):
return str(value)

def type_formatter(value, width, type_):

if isinstance(value, (numpy.generic, numpy.ndarray)):
value = numpy_type_mapper(value)

@@ -323,10 +322,7 @@ def _inner():
col_types = [column.type for column in t.schema.columns]
else:
col_types = [OrsoTypes._MISSING_TYPE] * len(t.schema)
if show_types:
col_type_width = list(map(len, col_types))
else:
col_type_width = [0] * len(col_types)
col_type_width = list(map(len, col_types)) if show_types else [0] * len(col_types)
col_width = [
min(max(cw, ctw, dw), max_column_width)
for cw, ctw, dw in zip(col_width, col_type_width, data_width)
@@ -382,10 +378,7 @@ def markdown(
max_column_width: int = 30,
): # pragma: no cover
# Extract head data
if limit > 0:
t = table.slice(length=limit)
else:
t = table
t = table.slice(length=limit) if limit > 0 else table

# width of index column
index_width = len(str(len(table)))
4 changes: 1 addition & 3 deletions orso/logging/create_logger.py
@@ -9,9 +9,7 @@
from orso.logging.log_formatter import LogFormatter

LOG_NAME: str = "DEFAULT"
LOG_FORMAT: str = (
"\001BOLD_CYANm%(name)s\001OFFm | %(levelname)-8s | %(asctime)s | \001PINKm%(funcName)s()\001OFFm | \001YELLOWm%(filename)s\001OFFm:\001PURPLEm%(lineno)s\001OFFm | %(message)s"
)
LOG_FORMAT: str = "\001BOLD_CYANm%(name)s\001OFFm | %(levelname)-8s | %(asctime)s | \001PINKm%(funcName)s()\001OFFm | \001YELLOWm%(filename)s\001OFFm:\001PURPLEm%(lineno)s\001OFFm | %(message)s"


def set_log_name(log_name: str):
9 changes: 5 additions & 4 deletions orso/profiler/distogram/__init__.py
@@ -127,7 +127,6 @@ def bin_count(self):

# added for opteryx
def load(bins: list, minimum, maximum): # pragma: no cover

dgram = Distogram()
dgram.bins = bins
dgram.min = minimum
@@ -229,9 +228,11 @@ def _trim_in_place(
) -> Distogram:
current_value, current_frequency = distogram.bins[bin_index]
current_value = _caster(current_value)
distogram.bins[bin_index] = (current_value * current_frequency + new_value * new_count) / (
current_frequency + new_count
), current_frequency + new_count
distogram.bins[bin_index] = (
(current_value * current_frequency + new_value * new_count)
/ (current_frequency + new_count),
current_frequency + new_count,
)
_update_diffs(distogram, bin_index)
return distogram
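The restructured assignment is behaviour-preserving: the merged bin keeps the frequency-weighted mean of the existing bin value and the incoming value, alongside the combined count. A plain-Python sketch of the same arithmetic (merge_bin is a hypothetical helper, not part of the commit):

    def merge_bin(value, frequency, new_value, new_count):
        # centroid is the weighted mean of the existing bin and the incoming values
        centroid = (value * frequency + new_value * new_count) / (frequency + new_count)
        return centroid, frequency + new_count

    merge_bin(10.0, 3, 20.0, 1)  # -> (12.5, 4)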

17 changes: 2 additions & 15 deletions orso/profiler/profiler.py
@@ -109,9 +109,7 @@ def get_ordered_and_transitions(data) -> tuple:
transitions += 1
if ordered is None:
ordered = -1 if value < last_value else 1
elif value > last_value and ordered == -1:
ordered = 0
elif value < last_value and ordered == 1:
elif value > last_value and ordered == -1 or value < last_value and ordered == 1:
ordered = 0
last_value = value
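# note: `and` binds tighter than `or`, so the collapsed condition reads as
# (value > last_value and ordered == -1) or (value < last_value and ordered == 1),
# which matches the two elif branches it replaces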

@@ -150,7 +148,7 @@ def estimate_cardinality(self) -> int:
kth_min_value = self.kmv_hashes[-1]

# Cardinality estimation formula
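# KMV intuition: assuming kmv_hashes holds the KVM_SIZE smallest (presumably 32-bit)
# hash values seen, the k-th smallest normalised to [0, 1) by the 2**32 divisor is
# roughly k / cardinality, so the estimate below inverts that ratio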
return int((KVM_SIZE - 1) / ((kth_min_value / 2**32)))
return int((KVM_SIZE - 1) / (kth_min_value / 2**32))

def estimate_values_at(self, point) -> int:
if not hasattr(self, "distogram"):
@@ -168,7 +166,6 @@ def estimate_values_above(self, point) -> int:
return (self.count - self.missing) - distogram.count_at(self.distogram, point)

def __add__(self, profile: "ColumnProfile") -> "ColumnProfile":

new_profile = self.deep_copy()
new_profile.count += profile.count
new_profile.missing += profile.missing
@@ -217,7 +214,6 @@ def __add__(self, profile: "ColumnProfile") -> "ColumnProfile":


class TableProfile:

def __init__(self):
self._columns: List[ColumnProfile] = []
self._column_names: List[str] = []
@@ -290,7 +286,6 @@ def from_dataframe(cls, table) -> "TableProfile":
profiles = {}

for morsel in table.to_batches(25000):

if not isinstance(morsel.schema, RelationSchema):
morsel._schema = RelationSchema(
name="morsel", columns=[FlatColumn(name=c) for c in morsel.schema]
@@ -326,14 +321,12 @@ def __call__(self, column_data: List[Any]):

class ListStructProfiler(BaseProfiler):
def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
self.profile.missing = sum(1 for val in column_data if val is None)


class DefaultProfiler(BaseProfiler):
def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
self.profile.missing = sum(1 for val in column_data if val != val)

@@ -351,9 +344,7 @@ def __call__(self, column_data: List[Any]):


class NumericProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
column_data = numpy.array(column_data, copy=False) # Ensure column_data is a NumPy array
if column_data.dtype.name == "object":
@@ -385,9 +376,7 @@ def __call__(self, column_data: List[Any]):


class VarcharProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
column_data = [col for col in column_data if col is not None]
if len(column_data) > 0:
@@ -405,9 +394,7 @@ def __call__(self, column_data: List[Any]):


class DateProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
if hasattr(column_data[0], "value"):
column_data = numpy.array(
2 changes: 1 addition & 1 deletion orso/row.py
@@ -160,7 +160,7 @@ def serialize(value):
if isinstance(value, datetime.datetime):
return ("__datetime__", value.timestamp())
if isinstance(value, numpy.ndarray):
return list(value)
return value.tolist()
return str(value)

record_bytes = packb(tuple(self), option=OPT_SERIALIZE_NUMPY, default=serialize)
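Worth noting: ndarray.tolist() converts elements (recursively) to native Python scalars, whereas list(value) would keep numpy scalar objects, so the new form hands the serializer plain Python values.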
2 changes: 1 addition & 1 deletion orso/schema.py
@@ -27,7 +27,7 @@
- Each column type provides a 'materialize' method to expand the compressed data
into its uncompressed form, facilitating query operations that require a full
column of data.
Column Types:
- SparseColumn: Handles sparse data by only storing non-default values.
- DictionaryColumn: Uses a dictionary to encode a finite set of string or
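To make the dictionary-encoding idea described above concrete, a minimal hypothetical sketch (illustrative class, not one of the columns defined in orso/schema.py):

    class DictionaryColumnSketch:
        # store each distinct value once, plus an integer code per row
        def __init__(self, values):
            self.dictionary = list(dict.fromkeys(values))
            index = {v: i for i, v in enumerate(self.dictionary)}
            self.encoding = [index[v] for v in values]

        def materialize(self):
            # expand the codes back into a full, uncompressed column
            return [self.dictionary[code] for code in self.encoding]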
97 changes: 95 additions & 2 deletions orso/tools.py
Expand Up @@ -24,6 +24,8 @@
from typing import Type
from typing import Union

import numpy

from orso.exceptions import MissingDependencyError


@@ -219,8 +221,7 @@ def report(self: Callable) -> str:
str: The formatted report string.
"""
stats = (
f"\nExecution Statistics for `{self.__name__}`\n "
f"Count : {self.count}\n" # type:ignore
f"\nExecution Statistics for `{self.__name__}`\n " f"Count : {self.count}\n" # type:ignore
)
if self.count > 0: # type:ignore
stats += f" Average : {sum(self._run_times) / self.count} seconds\n" # type:ignore
@@ -595,3 +596,95 @@ def random_string(width: int = 16) -> str:
rand_bytes = getrandbits(num_chars) # Generate random bits
# Convert to hex string, clip '0x' prefix, and zero-fill as needed
return ("000000" + hex(rand_bytes)[2:])[-width:]


def parse_iso(value):
# Date validation at speed is hard: dateutil is great but really slow, while this is fast
# but error-prone. It assumes the input either is a date or is nothing like a date.
# Making that assumption - and accepting the consequences - we can convert up to
# three times faster than dateutil.
#
# valid formats (not exhaustive):
#
# YYYY-MM-DD <- date
# YYYY-MM-DD HH:MM <- date and time, no seconds
# YYYY-MM-DDTHH:MM <- date and time, T separator
# YYYY-MM-DD HH:MM:SS <- date and time with seconds
# YYYY-MM-DD HH:MM:SS.mmmm <- date and time with milliseconds
#
# If the last character is a Z, we ignore it.
# If we can't parse it as a date, we return None rather than raising an error
try:
input_type = type(value)
if input_type == str and value.isdigit():
value = int(value)
input_type = int

if input_type == numpy.datetime64:
# this can create dates rather than datetimes, so don't return yet
value = value.astype(datetime.datetime)
input_type = type(value)
if input_type is int:
value /= 1000000000

if input_type in (int, numpy.int64, float, numpy.float64):
return datetime.datetime.fromtimestamp(int(value), tz=datetime.timezone.utc).replace(
tzinfo=None
)

if input_type == datetime.datetime:
return value.replace(microsecond=0)
if input_type == datetime.date:
return datetime.datetime.combine(value, datetime.time.min)

# if we're here, we're doing string parsing
if input_type == str and 10 <= len(value) <= 33:
if value[-1] == "Z":
value = value[:-1]
if "+" in value:
value = value.split("+")[0]
if not 10 <= len(value) <= 28:
return None
val_len = len(value)
if value[4] != "-" or value[7] != "-":
return None
if val_len == 10:
# YYYY-MM-DD
return datetime.datetime(
*map(int, [value[:4], value[5:7], value[8:10]]) # type:ignore
)
if val_len >= 16:
if value[10] not in ("T", " ") and value[13] != ":":
return None
if val_len >= 19 and value[16] == ":":
# YYYY-MM-DD HH:MM:SS
return datetime.datetime(
*map( # type:ignore
int,
[
value[:4], # YYYY
value[5:7], # MM
value[8:10], # DD
value[11:13], # HH
value[14:16], # MM
value[17:19], # SS
],
)
)
if val_len == 16:
# YYYY-MM-DD HH:MM
return datetime.datetime(
*map( # type:ignore
int,
[
value[:4],
value[5:7],
value[8:10],
value[11:13],
value[14:16],
],
)
)
return None
except (ValueError, TypeError):
return None
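A quick usage sketch of the new helper (illustrative inputs, not taken from the commit):

    from orso.tools import parse_iso

    parse_iso("2024-06-16")            # -> datetime.datetime(2024, 6, 16, 0, 0)
    parse_iso("2024-06-16 12:30:45")   # -> datetime.datetime(2024, 6, 16, 12, 30, 45)
    parse_iso("2024-06-16T12:30:45Z")  # trailing 'Z' is ignored
    parse_iso("16/06/2024")            # unrecognised layout -> None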
23 changes: 23 additions & 0 deletions orso/types.py
@@ -13,6 +13,9 @@
import datetime
import decimal
from enum import Enum
from typing import Any

from orso.tools import parse_iso


class OrsoTypes(str, Enum):
@@ -54,6 +57,9 @@ def is_complex(self):
def __str__(self):
return self.value

def parse(self, value: Any) -> Any:
return ORSO_TO_PYTHON_PARSER[self.value](value)


ORSO_TO_PYTHON_MAP: dict = {
OrsoTypes.BOOLEAN: bool,
@@ -76,3 +82,20 @@ def __str__(self):
value: key for key, value in ORSO_TO_PYTHON_MAP.items() if key != OrsoTypes.BSON
}
PYTHON_TO_ORSO_MAP.update({tuple: OrsoTypes.ARRAY, set: OrsoTypes.ARRAY}) # map other python types

ORSO_TO_PYTHON_PARSER: dict = {
OrsoTypes.BOOLEAN: bool,
OrsoTypes.BLOB: bytes,
OrsoTypes.DATE: lambda x: parse_iso(x).date(),
OrsoTypes.TIMESTAMP: parse_iso,
OrsoTypes.TIME: lambda x: parse_iso(x).time(),
OrsoTypes.INTERVAL: datetime.timedelta,
OrsoTypes.STRUCT: dict,
OrsoTypes.DECIMAL: decimal.Decimal,
OrsoTypes.DOUBLE: float,
OrsoTypes.INTEGER: int,
OrsoTypes.ARRAY: list,
OrsoTypes.VARCHAR: str,
OrsoTypes.BSON: bytes,
OrsoTypes.NULL: lambda x: None,
}
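Illustrative use of the new parse() hook and parser table (example values, not from the commit):

    from orso.types import OrsoTypes

    OrsoTypes.INTEGER.parse("42")                  # -> 42
    OrsoTypes.DATE.parse("2024-06-16")             # -> datetime.date(2024, 6, 16)
    OrsoTypes.TIMESTAMP.parse("2024-06-16 12:30")  # -> datetime.datetime(2024, 6, 16, 12, 30)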
2 changes: 1 addition & 1 deletion orso/version.py
@@ -10,5 +10,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__: str = "0.0.160"
__version__: str = "0.0.161"
__author__: str = "@joocer"
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -14,3 +14,16 @@ float_to_top = true
[build-system]
requires = ["setuptools>=42", "wheel", "Cython", "numpy"]
build-backend = "setuptools.build_meta"

[tool.ruff]
line-length = 100
indent-width = 4
target-version = 'py310'

[tool.ruff.format]
docstring-code-format = true
docstring-code-line-length = 100

[tool.ruff.lint]
select = ["SIM"]
ignore = []
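For context, the Makefile change earlier in this commit exercises this configuration via python -m ruff check --fix --exit-zero followed by python -m ruff format orso.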
5 changes: 1 addition & 4 deletions setup.py
@@ -13,10 +13,7 @@ def is_mac(): # pragma: no cover
return platform.system().lower() == "darwin"


if is_mac():
COMPILE_FLAGS = ["-O2"]
else:
COMPILE_FLAGS = ["-O2", "-march=native"]
COMPILE_FLAGS = ["-O2"] if is_mac() else ["-O2", "-march=native"]

__version__ = "notset"
with open(f"{LIBRARY}/version.py", mode="r") as v: