diff --git a/changelog.md b/changelog.md index d3e232d3..fd08949a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,9 @@ # Changelog +## Unreleased +### Fixed +- Cache tables in Spark before converting to Koalas, rather than caching the Koalas DataFrame, to improve speed + ## v0.1.6 (2023-09-27) ### Added - Module ``event_sequences`` to visualize individual sequences of events. diff --git a/eds_scikit/io/hive.py b/eds_scikit/io/hive.py index c36e58dc..2c30e698 100644 --- a/eds_scikit/io/hive.py +++ b/eds_scikit/io/hive.py @@ -12,7 +12,6 @@ from pyspark.sql import SparkSession from pyspark.sql.types import LongType, StructField, StructType -from ..utils.framework import bd from . import settings from .base import BaseData from .data_quality import clean_dates @@ -227,12 +226,10 @@ def _read_table( if "person_id" in df.columns and person_ids is not None: df = df.join(person_ids, on="person_id", how="inner") - df = df.to_koalas() + df = df.cache().to_koalas() df = clean_dates(df) - bd.cache(df) - return df def persist_tables_to_folder(