From e43e6ed82276b33dd92c3d6ae7a53b5d81295a3e Mon Sep 17 00:00:00 2001
From: mbaak
Date: Fri, 19 Apr 2024 21:45:25 +0200
Subject: [PATCH] ENH: added functions to set format and storage options of spark dataframes

Functions to set format and storage options of spark dataframes when calling spark namematching save.
Example usage: nm_obj.write().format('parquet').options(**options_dict).save(path)
---
 emm/helper/spark_custom_reader_writer.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/emm/helper/spark_custom_reader_writer.py b/emm/helper/spark_custom_reader_writer.py
index f941e59..025a854 100644
--- a/emm/helper/spark_custom_reader_writer.py
+++ b/emm/helper/spark_custom_reader_writer.py
@@ -214,6 +214,30 @@ def _get_metadata_to_save(self):
         }
         return json.dumps(metadata, separators=[",", ":"])
 
+    def format(self, file_format: str):
+        """Set the file format of ground truth datasets that are saved
+
+        Args:
+            file_format: storage format of spark dataframes, default is parquet.
+
+        Returns:
+            self
+        """
+        self.file_format = file_format
+        return self
+
+    def options(self, **kwargs):
+        """Set the other file storage options of ground truth datasets that are saved
+
+        Args:
+            kwargs: storage kw-args, passed on to: sdf.write.save(path, format=self.file_format, **self.store_kws)
+
+        Returns:
+            self
+        """
+        self.store_kws = kwargs
+        return self
+
 
 class SparkCustomReader(MLReader):
     """Spark Custom class reader"""