Skip to content

Commit

Permalink
fixing all pre-commit checks
Browse files — browse the repository at this point in the history
  • Loading branch information
fonhorst committed Aug 2, 2023
1 parent cb59861 commit 1aa0b9e
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 15 deletions.
11 changes: 7 additions & 4 deletions sparklightautoml/ml_algo/boost_lgbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,10 +473,13 @@ def fit_predict_single_fold(

rows_count = full_data.count()
if (run_params["executionMode"] == "streaming") and (rows_count <= 25_000):
warnings.warn(f"The fitting of lightgbm in streaming execution mode "
f"may fail with SEGSIGV / SIGBUS error (probably due to a bug in synapse ml) "
f"if too few data available per core. Train data rows count: {rows_count} "
f"Consider switching to bulk execution mode if such crashes happen", RuntimeWarning)
warnings.warn(
f"The fitting of lightgbm in streaming execution mode "
f"may fail with SEGSIGV / SIGBUS error (probably due to a bug in synapse ml) "
f"if too few data available per core. Train data rows count: {rows_count} "
f"Consider switching to bulk execution mode if such crashes happen",
RuntimeWarning,
)

# fitting the model
ml_model = lgbm.fit(self._assembler.transform(full_data))
Expand Down
15 changes: 4 additions & 11 deletions tests/spark/unit/dataset_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
import os
import pickle
import shutil

from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import pyspark.sql.functions as sf

from pyspark.sql import SparkSession

from sparklightautoml.dataset.base import PersistenceManager
from sparklightautoml.dataset.base import SparkDataset
from sparklightautoml.tasks.base import SparkTask


# File names for the two parts of a dataset dump.
# NOTE(review): presumably the pickled metadata and the parquet data handled by
# the dump helpers in this module (e.g. load_dump_if_exist) — confirm against usage.
DUMP_METADATA_NAME = "metadata.pickle"
DUMP_DATA_NAME = "data.parquet"

Expand Down Expand Up @@ -117,7 +120,6 @@ def load_dump_if_exist(
"franchise_dealer": "str",
},
},

"used_cars_dataset_no_cols_limit": {
"path": "examples/data/small_used_cars_data.csv",
"task_type": "reg",
Expand All @@ -140,15 +142,13 @@ def load_dump_if_exist(
"franchise_dealer": "str",
},
},

"lama_test_dataset": {
"path": "examples/data/sampled_app_train.csv",
"task_type": "binary",
"metric_name": "areaUnderROC",
"target_col": "TARGET",
"roles": {"target": "TARGET", "drop": ["SK_ID_CURR"]},
},

# https://www.openml.org/d/734
"ailerons_dataset": {
"path": "examples/data/ailerons.csv",
Expand All @@ -157,7 +157,6 @@ def load_dump_if_exist(
"target_col": "binaryClass",
"roles": {"target": "binaryClass"},
},

# https://www.openml.org/d/4534
"phishing_websites_dataset": {
"path": "examples/data/PhishingWebsites.csv",
Expand All @@ -166,7 +165,6 @@ def load_dump_if_exist(
"target_col": "Result",
"roles": {"target": "Result"},
},

# https://www.openml.org/d/981
"kdd_internet_usage": {
"path": "examples/data/kdd_internet_usage.csv",
Expand All @@ -175,7 +173,6 @@ def load_dump_if_exist(
"target_col": "Who_Pays_for_Access_Work",
"roles": {"target": "Who_Pays_for_Access_Work"},
},

# https://www.openml.org/d/42821
"nasa_dataset": {
"path": "examples/data/nasa_phm2008.csv",
Expand All @@ -184,7 +181,6 @@ def load_dump_if_exist(
"target_col": "class",
"roles": {"target": "class"},
},

# https://www.openml.org/d/4549
"buzz_dataset": {
"path": "examples/data/Buzzinsocialmedia_Twitter_25k.csv",
Expand All @@ -193,7 +189,6 @@ def load_dump_if_exist(
"target_col": "Annotation",
"roles": {"target": "Annotation"},
},

# https://www.openml.org/d/372
"internet_usage": {
"path": "examples/data/internet_usage.csv",
Expand All @@ -202,7 +197,6 @@ def load_dump_if_exist(
"target_col": "Actual_Time",
"roles": {"target": "Actual_Time"},
},

# https://www.openml.org/d/4538
"gesture_segmentation": {
"path": "examples/data/gesture_segmentation.csv",
Expand All @@ -211,15 +205,14 @@ def load_dump_if_exist(
"target_col": "Phase",
"roles": {"target": "Phase"},
},

# https://www.openml.org/d/382
"ipums_97": {
"path": "examples/data/ipums_97.csv",
"task_type": "multiclass",
"metric_name": "crossentropy",
"target_col": "movedin",
"roles": {"target": "movedin"},
}
},
}


Expand Down

0 comments on commit 1aa0b9e

Please sign in to comment.