From 9b59685891ad0848bd94b559d5396b56d4fb2b75 Mon Sep 17 00:00:00 2001 From: svittoz Date: Thu, 16 Nov 2023 09:04:49 +0000 Subject: [PATCH 1/3] caching in spark --- eds_scikit/io/hive.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/eds_scikit/io/hive.py b/eds_scikit/io/hive.py index c36e58dc..b679c9fb 100644 --- a/eds_scikit/io/hive.py +++ b/eds_scikit/io/hive.py @@ -227,12 +227,10 @@ def _read_table( if "person_id" in df.columns and person_ids is not None: df = df.join(person_ids, on="person_id", how="inner") - df = df.to_koalas() + df = df.cache().to_koalas() df = clean_dates(df) - bd.cache(df) - return df def persist_tables_to_folder( From 5ac07ddddd0eb361cb512398cc6b48298b4b7381 Mon Sep 17 00:00:00 2001 From: svittoz Date: Tue, 21 Nov 2023 09:50:03 +0000 Subject: [PATCH 2/3] changelog --- changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog.md b/changelog.md index d3e232d3..fd08949a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,9 @@ # Changelog +## Unreleased +### Fixed +- Caching in spark instead of koalas to improve speed + ## v0.1.6 (2023-09-27) ### Added - Module ``event_sequences`` to visualize individual sequences of events. From 176fb151bea12f004dd0636135cc00dde6f2a0c5 Mon Sep 17 00:00:00 2001 From: svittoz Date: Tue, 21 Nov 2023 09:53:17 +0000 Subject: [PATCH 3/3] unused import --- eds_scikit/io/hive.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eds_scikit/io/hive.py b/eds_scikit/io/hive.py index b679c9fb..2c30e698 100644 --- a/eds_scikit/io/hive.py +++ b/eds_scikit/io/hive.py @@ -12,7 +12,6 @@ from pyspark.sql import SparkSession from pyspark.sql.types import LongType, StructField, StructType -from ..utils.framework import bd from . import settings from .base import BaseData from .data_quality import clean_dates