0.0.161
joocer committed Jun 16, 2024
1 parent 74ab612 commit fab84bf
Showing 14 changed files with 211 additions and 45 deletions.
7 changes: 4 additions & 3 deletions Makefile
@@ -1,9 +1,10 @@
lint:
python -m pip install --quiet --upgrade pycln isort black yamllint
# python -m yamllint .
python -m pip install --quiet --upgrade pycln isort ruff yamllint
# python -m yamllint .
python -m ruff check --fix --exit-zero
python -m pycln .
python -m isort .
python -m black .
python -m ruff format orso

update:
python -m pip install --quiet --upgrade -r requirements.txt
6 changes: 4 additions & 2 deletions orso/dataframe.py
@@ -75,7 +75,7 @@ def __init__(
first_dict = next(dicts)

# if we have an explicit schema, use that, otherwise guess from the first entry
self._schema = [str(k) for k in first_dict.keys()]
self._schema = [str(k) for k in first_dict]
self._row_factory = Row.create_class(self._schema)
keys = list(first_dict.keys())

@@ -189,7 +189,9 @@ def materialize(self):
def distinct(self) -> "DataFrame":
seen = set()
unique_rows = [
x for x in self._rows if hash(x) not in seen and not seen.add(hash(x)) # type:ignore
x
for x in self._rows
if hash(x) not in seen and not seen.add(hash(x)) # type:ignore
]
return DataFrame(rows=unique_rows, schema=self._schema)
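The reshaped comprehension leans on set.add() returning None: "not seen.add(...)" is always truthy, so the call only records the hash as a side effect and the first occurrence of each row survives. A standalone sketch with made-up rows:

    seen = set()
    rows = [("a", 1), ("b", 2), ("a", 1)]
    unique = [r for r in rows if hash(r) not in seen and not seen.add(hash(r))]
    # unique == [("a", 1), ("b", 2)]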

11 changes: 2 additions & 9 deletions orso/display.py
@@ -206,7 +206,6 @@ def numpy_type_mapper(value):
return str(value)

def type_formatter(value, width, type_):

if isinstance(value, (numpy.generic, numpy.ndarray)):
value = numpy_type_mapper(value)

@@ -323,10 +322,7 @@ def _inner():
col_types = [column.type for column in t.schema.columns]
else:
col_types = [OrsoTypes._MISSING_TYPE] * len(t.schema)
if show_types:
col_type_width = list(map(len, col_types))
else:
col_type_width = [0] * len(col_types)
col_type_width = list(map(len, col_types)) if show_types else [0] * len(col_types)
col_width = [
min(max(cw, ctw, dw), max_column_width)
for cw, ctw, dw in zip(col_width, col_type_width, data_width)
@@ -382,10 +378,7 @@ def markdown(
max_column_width: int = 30,
): # pragma: no cover
# Extract head data
if limit > 0:
t = table.slice(length=limit)
else:
t = table
t = table.slice(length=limit) if limit > 0 else table

# width of index column
index_width = len(str(len(table)))
4 changes: 1 addition & 3 deletions orso/logging/create_logger.py
@@ -9,9 +9,7 @@
from orso.logging.log_formatter import LogFormatter

LOG_NAME: str = "DEFAULT"
LOG_FORMAT: str = (
"\001BOLD_CYANm%(name)s\001OFFm | %(levelname)-8s | %(asctime)s | \001PINKm%(funcName)s()\001OFFm | \001YELLOWm%(filename)s\001OFFm:\001PURPLEm%(lineno)s\001OFFm | %(message)s"
)
LOG_FORMAT: str = "\001BOLD_CYANm%(name)s\001OFFm | %(levelname)-8s | %(asctime)s | \001PINKm%(funcName)s()\001OFFm | \001YELLOWm%(filename)s\001OFFm:\001PURPLEm%(lineno)s\001OFFm | %(message)s"


def set_log_name(log_name: str):
9 changes: 5 additions & 4 deletions orso/profiler/distogram/__init__.py
@@ -127,7 +127,6 @@ def bin_count(self):

# added for opteryx
def load(bins: list, minimum, maximum): # pragma: no cover

dgram = Distogram()
dgram.bins = bins
dgram.min = minimum
@@ -229,9 +228,11 @@ def _trim_in_place(
) -> Distogram:
current_value, current_frequency = distogram.bins[bin_index]
current_value = _caster(current_value)
distogram.bins[bin_index] = (current_value * current_frequency + new_value * new_count) / (
current_frequency + new_count
), current_frequency + new_count
distogram.bins[bin_index] = (
(current_value * current_frequency + new_value * new_count)
/ (current_frequency + new_count),
current_frequency + new_count,
)
_update_diffs(distogram, bin_index)
return distogram
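The restructured assignment is behaviour-preserving: the merged bin keeps the frequency-weighted mean of the existing bin value and the incoming value, alongside the combined count. A plain-Python sketch of the same arithmetic (merge_bin is a hypothetical helper, not part of the commit):

    def merge_bin(value, frequency, new_value, new_count):
        # centroid is the weighted mean of the existing bin and the incoming values
        centroid = (value * frequency + new_value * new_count) / (frequency + new_count)
        return centroid, frequency + new_count

    merge_bin(10.0, 3, 20.0, 1)  # -> (12.5, 4)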

17 changes: 2 additions & 15 deletions orso/profiler/profiler.py
@@ -109,9 +109,7 @@ def get_ordered_and_transitions(data) -> tuple:
transitions += 1
if ordered is None:
ordered = -1 if value < last_value else 1
elif value > last_value and ordered == -1:
ordered = 0
elif value < last_value and ordered == 1:
elif value > last_value and ordered == -1 or value < last_value and ordered == 1:
ordered = 0
last_value = value
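# note: `and` binds tighter than `or`, so the collapsed condition reads as
# (value > last_value and ordered == -1) or (value < last_value and ordered == 1),
# which matches the two elif branches it replaces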

@@ -150,7 +148,7 @@ def estimate_cardinality(self) -> int:
kth_min_value = self.kmv_hashes[-1]

# Cardinality estimation formula
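# KMV intuition: assuming kmv_hashes holds the KVM_SIZE smallest (presumably 32-bit)
# hash values seen, the k-th smallest normalised to [0, 1) by the 2**32 divisor is
# roughly k / cardinality, so the estimate below inverts that ratio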
return int((KVM_SIZE - 1) / ((kth_min_value / 2**32)))
return int((KVM_SIZE - 1) / (kth_min_value / 2**32))

def estimate_values_at(self, point) -> int:
if not hasattr(self, "distogram"):
@@ -168,7 +166,6 @@ def estimate_values_above(self, point) -> int:
return (self.count - self.missing) - distogram.count_at(self.distogram, point)

def __add__(self, profile: "ColumnProfile") -> "ColumnProfile":

new_profile = self.deep_copy()
new_profile.count += profile.count
new_profile.missing += profile.missing
@@ -217,7 +214,6 @@ def __add__(self, profile: "ColumnProfile") -> "ColumnProfile":


class TableProfile:

def __init__(self):
self._columns: List[ColumnProfile] = []
self._column_names: List[str] = []
@@ -290,7 +286,6 @@ def from_dataframe(cls, table) -> "TableProfile":
profiles = {}

for morsel in table.to_batches(25000):

if not isinstance(morsel.schema, RelationSchema):
morsel._schema = RelationSchema(
name="morsel", columns=[FlatColumn(name=c) for c in morsel.schema]
@@ -326,14 +321,12 @@ def __call__(self, column_data: List[Any]):

class ListStructProfiler(BaseProfiler):
def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
self.profile.missing = sum(1 for val in column_data if val is None)


class DefaultProfiler(BaseProfiler):
def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
self.profile.missing = sum(1 for val in column_data if val != val)

@@ -351,9 +344,7 @@ def __call__(self, column_data: List[Any]):


class NumericProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
column_data = numpy.array(column_data, copy=False) # Ensure column_data is a NumPy array
if column_data.dtype.name == "object":
@@ -385,9 +376,7 @@ def __call__(self, column_data: List[Any]):


class VarcharProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
column_data = [col for col in column_data if col is not None]
if len(column_data) > 0:
@@ -405,9 +394,7 @@ def __call__(self, column_data: List[Any]):


class DateProfiler(BaseProfiler):

def __call__(self, column_data: List[Any]):

self.profile.count = len(column_data)
if hasattr(column_data[0], "value"):
column_data = numpy.array(
2 changes: 1 addition & 1 deletion orso/row.py
@@ -160,7 +160,7 @@ def serialize(value):
if isinstance(value, datetime.datetime):
return ("__datetime__", value.timestamp())
if isinstance(value, numpy.ndarray):
return list(value)
return value.tolist()
return str(value)

record_bytes = packb(tuple(self), option=OPT_SERIALIZE_NUMPY, default=serialize)
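Worth noting: ndarray.tolist() converts elements (recursively) to native Python scalars, whereas list(value) would keep numpy scalar objects, so the new form hands the serializer plain Python values.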
2 changes: 1 addition & 1 deletion orso/schema.py
@@ -27,7 +27,7 @@
- Each column type provides a 'materialize' method to expand the compressed data
into its uncompressed form, facilitating query operations that require a full
column of data.
Column Types:
- SparseColumn: Handles sparse data by only storing non-default values.
- DictionaryColumn: Uses a dictionary to encode a finite set of string or
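To make the dictionary-encoding idea described above concrete, a minimal hypothetical sketch (illustrative class, not one of the columns defined in orso/schema.py):

    class DictionaryColumnSketch:
        # store each distinct value once, plus an integer code per row
        def __init__(self, values):
            self.dictionary = list(dict.fromkeys(values))
            index = {v: i for i, v in enumerate(self.dictionary)}
            self.encoding = [index[v] for v in values]

        def materialize(self):
            # expand the codes back into a full, uncompressed column
            return [self.dictionary[code] for code in self.encoding]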
97 changes: 95 additions & 2 deletions orso/tools.py
Expand Up @@ -24,6 +24,8 @@
from typing import Type
from typing import Union

import numpy

from orso.exceptions import MissingDependencyError


@@ -219,8 +221,7 @@ def report(self: Callable) -> str:
str: The formatted report string.
"""
stats = (
f"\nExecution Statistics for `{self.__name__}`\n "
f"Count : {self.count}\n" # type:ignore
f"\nExecution Statistics for `{self.__name__}`\n " f"Count : {self.count}\n" # type:ignore
)
if self.count > 0: # type:ignore
stats += f" Average : {sum(self._run_times) / self.count} seconds\n" # type:ignore
@@ -595,3 +596,95 @@ def random_string(width: int = 16) -> str:
rand_bytes = getrandbits(num_chars) # Generate random bits
# Convert to hex string, clip '0x' prefix, and zero-fill as needed
return ("000000" + hex(rand_bytes)[2:])[-width:]


def parse_iso(value):
# Date validation at speed is hard: dateutil is great but really slow, while this is fast
# but error-prone. It assumes the input either is a date or is nothing like a date.
# Making that assumption - and accepting the consequences - we can convert up to
# three times faster than dateutil.
#
# valid formats (not exhaustive):
#
# YYYY-MM-DD <- date
# YYYY-MM-DD HH:MM <- date and time, no seconds
# YYYY-MM-DDTHH:MM <- date and time, T separator
# YYYY-MM-DD HH:MM:SS <- date and time with seconds
# YYYY-MM-DD HH:MM:SS.mmmm <- date and time with milliseconds
#
# If the last character is a Z, we ignore it.
# If we can't parse it as a date, we return None rather than raising an error
try:
input_type = type(value)
if input_type == str and value.isdigit():
value = int(value)
input_type = int

if input_type == numpy.datetime64:
# this can create dates rather than datetimes, so don't return yet
value = value.astype(datetime.datetime)
input_type = type(value)
if input_type is int:
value /= 1000000000

if input_type in (int, numpy.int64, float, numpy.float64):
return datetime.datetime.fromtimestamp(int(value), tz=datetime.timezone.utc).replace(
tzinfo=None
)

if input_type == datetime.datetime:
return value.replace(microsecond=0)
if input_type == datetime.date:
return datetime.datetime.combine(value, datetime.time.min)

# if we're here, we're doing string parsing
if input_type == str and 10 <= len(value) <= 33:
if value[-1] == "Z":
value = value[:-1]
if "+" in value:
value = value.split("+")[0]
if not 10 <= len(value) <= 28:
return None
val_len = len(value)
if value[4] != "-" or value[7] != "-":
return None
if val_len == 10:
# YYYY-MM-DD
return datetime.datetime(
*map(int, [value[:4], value[5:7], value[8:10]]) # type:ignore
)
if val_len >= 16:
if value[10] not in ("T", " ") and value[13] != ":":
return None
if val_len >= 19 and value[16] == ":":
# YYYY-MM-DD HH:MM:SS
return datetime.datetime(
*map( # type:ignore
int,
[
value[:4], # YYYY
value[5:7], # MM
value[8:10], # DD
value[11:13], # HH
value[14:16], # MM
value[17:19], # SS
],
)
)
if val_len == 16:
# YYYY-MM-DD HH:MM
return datetime.datetime(
*map( # type:ignore
int,
[
value[:4],
value[5:7],
value[8:10],
value[11:13],
value[14:16],
],
)
)
return None
except (ValueError, TypeError):
return None
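A quick usage sketch of the new helper (illustrative inputs, not taken from the commit):

    from orso.tools import parse_iso

    parse_iso("2024-06-16")            # -> datetime.datetime(2024, 6, 16, 0, 0)
    parse_iso("2024-06-16 12:30:45")   # -> datetime.datetime(2024, 6, 16, 12, 30, 45)
    parse_iso("2024-06-16T12:30:45Z")  # trailing 'Z' is ignored
    parse_iso("16/06/2024")            # unrecognised layout -> None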
23 changes: 23 additions & 0 deletions orso/types.py
@@ -13,6 +13,9 @@
import datetime
import decimal
from enum import Enum
from typing import Any

from orso.tools import parse_iso


class OrsoTypes(str, Enum):
@@ -54,6 +57,9 @@ def is_complex(self):
def __str__(self):
return self.value

def parse(self, value: Any) -> Any:
return ORSO_TO_PYTHON_PARSER[self.value](value)


ORSO_TO_PYTHON_MAP: dict = {
OrsoTypes.BOOLEAN: bool,
@@ -76,3 +82,20 @@ def __str__(self):
value: key for key, value in ORSO_TO_PYTHON_MAP.items() if key != OrsoTypes.BSON
}
PYTHON_TO_ORSO_MAP.update({tuple: OrsoTypes.ARRAY, set: OrsoTypes.ARRAY}) # map other python types

ORSO_TO_PYTHON_PARSER: dict = {
OrsoTypes.BOOLEAN: bool,
OrsoTypes.BLOB: bytes,
OrsoTypes.DATE: lambda x: parse_iso(x).date(),
OrsoTypes.TIMESTAMP: parse_iso,
OrsoTypes.TIME: lambda x: parse_iso(x).time(),
OrsoTypes.INTERVAL: datetime.timedelta,
OrsoTypes.STRUCT: dict,
OrsoTypes.DECIMAL: decimal.Decimal,
OrsoTypes.DOUBLE: float,
OrsoTypes.INTEGER: int,
OrsoTypes.ARRAY: list,
OrsoTypes.VARCHAR: str,
OrsoTypes.BSON: bytes,
OrsoTypes.NULL: lambda x: None,
}
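Illustrative use of the new parse() hook and parser table (example values, not from the commit):

    from orso.types import OrsoTypes

    OrsoTypes.INTEGER.parse("42")                  # -> 42
    OrsoTypes.DATE.parse("2024-06-16")             # -> datetime.date(2024, 6, 16)
    OrsoTypes.TIMESTAMP.parse("2024-06-16 12:30")  # -> datetime.datetime(2024, 6, 16, 12, 30)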
2 changes: 1 addition & 1 deletion orso/version.py
@@ -10,5 +10,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__: str = "0.0.160"
__version__: str = "0.0.161"
__author__: str = "@joocer"
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -14,3 +14,16 @@ float_to_top = true
[build-system]
requires = ["setuptools>=42", "wheel", "Cython", "numpy"]
build-backend = "setuptools.build_meta"

[tool.ruff]
line-length = 100
indent-width = 4
target-version = 'py310'

[tool.ruff.format]
docstring-code-format = true
docstring-code-line-length = 100

[tool.ruff.lint]
select = ["SIM"]
ignore = []
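For context, the Makefile change earlier in this commit exercises this configuration via python -m ruff check --fix --exit-zero followed by python -m ruff format orso.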
5 changes: 1 addition & 4 deletions setup.py
@@ -13,10 +13,7 @@ def is_mac(): # pragma: no cover
return platform.system().lower() == "darwin"


if is_mac():
COMPILE_FLAGS = ["-O2"]
else:
COMPILE_FLAGS = ["-O2", "-march=native"]
COMPILE_FLAGS = ["-O2"] if is_mac() else ["-O2", "-march=native"]

__version__ = "notset"
with open(f"{LIBRARY}/version.py", mode="r") as v: