From 5dc6fc5eff94d377bdc78b82be76b8b8cab71eb4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 20:44:59 +0800 Subject: [PATCH 01/22] add unit tests for tune api Signed-off-by: helenxie-bit --- Makefile | 1 + .../kubeflow/katib/api/katib_client.py | 57 +- test/unit/v1beta1/tune-api/test_tune_api.py | 911 ++++++++++++++++++ 3 files changed, 950 insertions(+), 19 deletions(-) create mode 100644 test/unit/v1beta1/tune-api/test_tune_api.py diff --git a/Makefile b/Makefile index a6708de7f5b..e3d62a8ca38 100755 --- a/Makefile +++ b/Makefile @@ -172,6 +172,7 @@ pytest: prepare-pytest prepare-pytest-testdata pytest ./test/unit/v1beta1/suggestion --ignore=./test/unit/v1beta1/suggestion/test_skopt_service.py pytest ./test/unit/v1beta1/earlystopping pytest ./test/unit/v1beta1/metricscollector + pytest ./test/unit/v1beta1/tune-api cp ./pkg/apis/manager/v1beta1/python/api_pb2.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2.py cp ./pkg/apis/manager/v1beta1/python/api_pb2_grpc.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py sed -i "s/api_pb2/kubeflow\.katib\.katib_api_pb2/g" ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 05fd1405a3f..5de5f5dbee9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -625,27 +625,46 @@ class name in this argument. volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - container_spec = training_utils.get_container_spec( - name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], - base_image=TRAINER_TRANSFORMER_IMAGE, - args=[ - "--model_uri", - model_provider_parameters.model_uri, - "--transformer_type", - model_provider_parameters.transformer_type.__name__, - "--model_dir", - VOLUME_PATH_MODEL, - "--dataset_dir", - VOLUME_PATH_DATASET, - "--lora_config", - f"'{lora_config}'", - "--training_parameters", - f"'{training_args}'", - ], - volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker, + storage_initializer_volume = models.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=name + ), ) + if isinstance(resources_per_trial, types.TrainerResources): + from kubeflow.training import models as training_models + + if ( + resources_per_trial.num_workers is None + or resources_per_trial.num_workers < 1 + ): + raise ValueError("At least one Worker for PyTorchJob must be set") + + # Create container spec. + container_spec = utils.get_container_spec( + name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, + base_image=TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--num_labels", + str(model_provider_parameters.num_labels), + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{json.dumps(lora_config.__dict__, cls=utils.SetEncoder)}'", + "--training_parameters", + f"'{json.dumps(training_args.to_dict())}'", + ], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + resources=resources_per_trial.resources_per_worker, + ) + # Create the worker and the master pod. 
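                # Illustrative sketch, not part of this patch: the json.dumps call
                # above uses a custom encoder (utils.SetEncoder) because
                # LoraConfig.__dict__ can contain set-valued fields (e.g.
                # target_modules) that the stock json module rejects. A minimal
                # encoder of that assumed shape:
                #
                #   import json
                #
                #   class SetEncoder(json.JSONEncoder):
                #       def default(self, obj):
                #           # Serialize sets as lists; defer everything else to the
                #           # base class, which raises TypeError for unsupported types.
                #           if isinstance(obj, set):
                #               return list(obj)
                #           return super().default(obj)
                #
                #   json.dumps({"target_modules": {"q_proj", "v_proj"}}, cls=SetEncoder)
                #   # -> '{"target_modules": ["q_proj", "v_proj"]}' (list order may vary)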
storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py new file mode 100644 index 00000000000..bbd1b0ef111 --- /dev/null +++ b/test/unit/v1beta1/tune-api/test_tune_api.py @@ -0,0 +1,911 @@ +import unittest +from unittest import TestCase +from unittest.mock import Mock +from unittest.mock import patch + +from kubeflow.katib import KatibClient +from kubeflow.katib import models +import kubeflow.katib as katib +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams +from kubeflow.training import models as training_models +from kubernetes import client +from kubernetes.client.exceptions import ApiException +from peft import LoraConfig +import transformers + + +class TestTuneAPI(TestCase): + # Create an instance of the KatibClient + def setUp(self): + self.katib_client = KatibClient(namespace="default") + + # Test input + # Test for missing required parameters + def test_tune_missing_name(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name=None, + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + ) + + self.assertIn("Please specify name for the Experiment.", str(context.exception)) + + # Test for invalid hyperparameter optimization configuration + # Case 1: Set two options: 1) external models and datasets; 2) custom objective at the same time + def test_tune_invalid_with_model_provider_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_dataset_provider_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_trainer_parameters_and_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + objective=lambda x: x, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_model_provider_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_dataset_provider_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + def test_tune_invalid_with_trainer_parameters_and_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + parameters={"lr": Mock()}, + ) + + self.assertIn("Invalid configuration", str(context.exception)) + + # Case 2: Missing parameters when choosing one option + def 
test_tune_invalid_with_only_model_provider(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_dataset_provider(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + dataset_provider_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_trainer_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + trainer_parameters=Mock(), + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_objective(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + def test_tune_invalid_with_only_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + parameters={"lr": Mock()}, + ) + + self.assertIn("One of the required parameters is None", str(context.exception)) + + # Case 3: No parameters provided + def test_tune_no_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune(name="experiment") + + self.assertIn("Invalid configuration", str(context.exception)) + + # Test for invalid parameters + # Case 1: Invalid env_per_trial + def test_tune_invalid_env_per_trial(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + env_per_trial=[123], # Invalid type + ) + + self.assertIn("Incorrect value for env_per_trial", str(context.exception)) + + # Case 2: Invalid resources_per_trial.num_workers (for distributed training) + def test_tune_invalid_resources_per_trial_value(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + resources_per_trial=katib.TrainerResources( + num_workers=0, # Invalid value, should be at least 1 + num_procs_per_worker=1, + resources_per_worker={"cpu": "1", "memory": "1Gi"}, + ), + ) + + self.assertIn( + "At least one Worker for PyTorchJob must be set", str(context.exception) + ) + + # Case 3: Invalid model_provider_parameters + def test_tune_invalid_model_provider_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceModelParams + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
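                    # Illustrative note, inferred from the expected_parameters and
                    # trial_parameters assertions later in this file: tune() lifts
                    # each katib.search helper into an Experiment parameter, so
                    # r=katib.search.int(min=8, max=32) is expected to become
                    #   V1beta1ParameterSpec(name="r", parameter_type="int",
                    #       feasible_space=V1beta1FeasibleSpace(min="8", max="32"))
                    # with the field itself replaced by "${trialParameters.r}".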
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + ) + + self.assertIn( + "Model provider parameters must be an instance of HuggingFaceModelParams", + str(context.exception), + ) + + # Case 4: Invalid dataset_provider_parameters + def test_tune_invalid_dataset_provider_parameters(self): + with self.assertRaises(ValueError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + dataset_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceDatasetParameters or S3DatasetParams + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + ) + + self.assertIn( + "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams", + str(context.exception), + ) + + # Case 5: Invalid trainer_parameters.training_parameters + def test_tune_invalid_trainer_parameters_training_parameters(self): + with self.assertRaises(TypeError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + not_a_valid_parameter="no", + ), + lora_config=LoraConfig(), + ), + ) + + self.assertIn( + "TrainingArguments.__init__() got an unexpected keyword argument", + str(context.exception), + ) + + # Case 6: Invalid trainer_parameters.lora_config + def test_tune_invalid_trainer_parameters_lora_config(self): + with self.assertRaises(TypeError) as context: + self.katib_client.tune( + name="experiment", + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + ), + lora_config=LoraConfig( + not_a_valid_parameter="no", + ), + ), + ) + + self.assertIn( + "LoraConfig.__init__() got an unexpected keyword argument", + str(context.exception), + ) + + # Test functionality + # Test PVC creation + # Case 1: PVC successfully created + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation(self, mock_create_experiment, mock_list_pvc, mock_create_pvc): + mock_create_pvc.return_value = Mock() + mock_list_pvc.return_value = Mock(items=[]) + 
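        # Note on the mock wiring: @patch decorators apply bottom-up, so the
        # decorator closest to the function (create_experiment) binds to the
        # first positional argument and the outermost one
        # (create_namespaced_persistent_volume_claim) to the last, giving the
        # order (mock_create_experiment, mock_list_pvc, mock_create_pvc).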
mock_create_experiment.return_value = Mock() + + exp_name = "experiment" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + expected_pvc_spec = models.V1PersistentVolumeClaim( + api_version="v1", + kind="PersistentVolumeClaim", + metadata={"name": exp_name, "namespace": "default"}, + spec=models.V1PersistentVolumeClaimSpec( + access_modes=storage_config["access_modes"], + resources=models.V1ResourceRequirements( + requests={"storage": storage_config["size"]} + ), + ), + ) + + mock_create_pvc.assert_called_once_with( + namespace="default", body=expected_pvc_spec + ) + + # Case 2: PVC already exists + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation_with_existing_pvc( + self, mock_create_experiment, mock_list_pvc, mock_create_pvc + ): + # Simulate an ApiException being raised when trying to create a PVC + mock_create_pvc.side_effect = ApiException(status=409, reason="Already exists") + + # Simulate existing PVC in the list + mock_existing_pvc = Mock() + mock_existing_pvc.metadata.name = "test-pvc" + mock_list_pvc.return_value = Mock(items=[mock_existing_pvc]) + + mock_create_experiment.return_value = Mock() + + exp_name = "test-pvc" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
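                    # Behavior under test, sketched from the mocks and the matching
                    # code path in katib_client.py: on a 409 Conflict, tune() is
                    # expected to list existing PVCs and treat a claim named after
                    # the Experiment as success, roughly:
                    #   try:
                    #       core_api.create_namespaced_persistent_volume_claim(...)
                    #   except Exception:
                    #       pvc_list = core_api.list_namespaced_persistent_volume_claim(namespace)
                    #       if not any(pvc.metadata.name == name for pvc in pvc_list.items):
                    #           raise RuntimeError("failed to create PVC")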
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + # Assert that create_namespaced_persistent_volume_claim was called once + mock_create_pvc.assert_called_once() + + # Assert that list_namespaced_persistent_volume_claim was called to check existing PVCs + mock_list_pvc.assert_called_once_with("default") + + # Ensure no exception is raised since the PVC already exists + self.assertTrue(mock_list_pvc.return_value.items[0].metadata.name == exp_name) + + # Case 3: PVC creation fails + @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") + @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_pvc_creation_fails( + self, mock_create_experiment, mock_list_pvc, mock_create_pvc + ): + # Simulate an ApiException being raised when trying to create a PVC + mock_create_pvc.side_effect = ApiException( + status=500, reason="Internal Server Error" + ) + + # Simulate no existing PVC in the list + mock_list_pvc.return_value = Mock(items=[]) + + mock_create_experiment.return_value = Mock() + + exp_name = "test-pvc" + storage_config = { + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + } + with self.assertRaises(RuntimeError) as context: + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. 
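                    # Background on the LoRA fields (general PEFT knowledge, not
                    # from this patch): r is the adapter rank, lora_alpha scales
                    # the adapter update, and lora_dropout regularizes the adapter
                    # layers. Tuning r over [8, 32] is why later expectations
                    # reference "${trialParameters.r}" inside the trial's
                    # --lora_config JSON.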
+ lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + storage_config=storage_config, + ) + + # Assert that the appropriate error message is raised + self.assertIn("failed to create PVC", str(context.exception)) + + # Assert that create_namespaced_persistent_volume_claim was called once + mock_create_pvc.assert_called_once() + + # Assert that list_namespaced_persistent_volume_claim was called once + mock_list_pvc.assert_called_once_with("default") + + # Test container, pod, job/pytorchjob, trial template, and experiment creation + # Case 1: Custom objective + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_custom_objective( + self, mock_create_experiment + ): + self.katib_client.tune( + name="experiment", + objective=lambda x: x, + parameters={ + "a": katib.search.int(min=10, max=100), + "b": katib.search.double(min=0.1, max=0.2), + }, + objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial={"cpu": "1", "memory": "1Gi"}, + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_container = [ + models.V1Container( + name="training-container", + image="docker.io/tensorflow/tensorflow:2.13.0", + command=["bash", "-c"], + args=[ + "\n" + "program_path=$(mktemp -d)\n" + "read -r -d '' SCRIPT << EOM\n" + "\n" + "objective=lambda x: x,\n" + "\n" + "({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n" + "\n" + "EOM\n" + 'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n' + 'python3 -u "$program_path/ephemeral_script.py"' + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + ) + ] + + expected_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + containers=expected_container, + restart_policy="Never", + ), + ) + + expected_job = client.V1Job( + api_version="batch/v1", + kind="Job", + spec=client.V1JobSpec( + template=expected_pod, + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="training-container", + trial_parameters=[ + models.V1beta1TrialParameterSpec(name="a", reference="a"), + models.V1beta1TrialParameterSpec(name="b", reference="b"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="a", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="10", max="100"), + ), + models.V1beta1ParameterSpec( + name="b", + parameter_type="double", + feasible_space=models.V1beta1FeasibleSpace(min="0.1", max="0.2"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template.spec.containers, + expected_container, + ) + 
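        # How the custom objective reaches the trial container, per the expected
        # args above: tune() inlines the objective source into a shell heredoc,
        # writes it to a temp file, and executes it with tuned values injected
        # through "${trialParameters.<name>}" placeholders:
        #   program_path=$(mktemp -d)
        #   read -r -d '' SCRIPT << EOM
        #   <objective function source>
        #   ({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})
        #   EOM
        #   printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"
        #   python3 -u "$program_path/ephemeral_script.py"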
self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.template, expected_pod + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, expected_trial_template) + + # Case 2: External models and datasets + @patch("kubeflow.katib.KatibClient.create_experiment") + def test_experiment_creation_with_external_model( + self, mock_create_experiment + ): + exp_name = "experiment" + self.katib_client.tune( + name=exp_name, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # Use 3000 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + save_strategy="no", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=1, + logging_dir="test_tune_api/logs", + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=katib.search.int(min=8, max=32), + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + objective_metric_name="accuracy", + objective_goal=0.9, + max_trial_count=10, + parallel_trial_count=2, + max_failed_trial_count=1, + resources_per_trial=katib.TrainerResources( + num_workers=3, + num_procs_per_worker=1, + resources_per_worker={"cpu": "1", "memory": "1Gi"}, + ), + ) + + mock_create_experiment.assert_called_once() + args, kwargs = mock_create_experiment.call_args + experiment = args[0] + + expected_init_container = [ + models.V1Container( + name="storage-initializer", + image="docker.io/kubeflow/storage-initializer", + args=[ + "--model_provider", + "hf", + "--model_provider_parameters", + '{"model_uri": "hf://google-bert/bert-base-cased", "transformer_type": "AutoModelForSequenceClassification", ' + '"access_token": null, "num_labels": 5}', + "--dataset_provider", + "hf", + "--dataset_provider_parameters", + '{"repo_id": "yelp_review_full", "access_token": null, "split": "train[:8]"}', + ], + volume_mounts=[ + training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_container = [ + models.V1Container( + name="pytorch", + image="docker.io/kubeflow/trainer-huggingface", + args=[ + "--model_uri", + "hf://google-bert/bert-base-cased", + "--transformer_type", + "AutoModelForSequenceClassification", + "--num_labels", + "5", + "--model_dir", + "/workspace/model", + "--dataset_dir", + "/workspace/dataset", + "--lora_config", + '\'{"peft_type": "LORA", "base_model_name_or_path": null, "task_type": null, ' + '"inference_mode": false, "r": "${trialParameters.r}", "target_modules": null, ' + '"lora_alpha": 8, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", ' + '"modules_to_save": null, "init_lora_weights": true}\'', + "--training_parameters", + '\'{"output_dir": "test_tune_api", "overwrite_output_dir": false, "do_train": ' + 'false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", ' + '"prediction_loss_only": false, "per_device_train_batch_size": 8, ' + '"per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, ' + '"per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 1, ' + 
'"eval_accumulation_steps": null, "eval_delay": 0, "learning_rate": ' + '"${trialParameters.learning_rate}", "weight_decay": 0.0, "adam_beta1": 0.9, ' + '"adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, ' + '"num_train_epochs": 1, "max_steps": -1, "lr_scheduler_type": "linear", ' + '"lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, ' + '"log_level": "passive", "log_level_replica": "warning", "log_on_each_node": ' + 'true, "logging_dir": "test_tune_api/logs", "logging_strategy": "steps", ' + '"logging_first_step": false, "logging_steps": 500, "logging_nan_inf_filter": ' + 'true, "save_strategy": "no", "save_steps": 500, "save_total_limit": null, ' + '"save_safetensors": true, "save_on_each_node": false, "save_only_model": ' + 'false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": ' + '42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": ' + 'false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": ' + '"auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, ' + '"local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, ' + '"tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, ' + '"eval_steps": null, "dataloader_num_workers": 0, "dataloader_prefetch_factor": ' + 'null, "past_index": -1, "run_name": "test_tune_api", "disable_tqdm": false, ' + '"remove_unused_columns": true, "label_names": null, "load_best_model_at_end": ' + 'false, "metric_for_best_model": null, "greater_is_better": null, ' + '"ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, ' + '"fsdp_config": {"min_num_params": 0, "xla": false, "xla_fsdp_v2": false, ' + '"xla_fsdp_grad_ckpt": false}, "fsdp_transformer_layer_cls_to_wrap": null, ' + '"accelerator_config": {"split_batches": false, "dispatch_batches": null, ' + '"even_batches": true, "use_seedable_sampler": true}, "deepspeed": null, ' + '"label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, ' + '"adafactor": false, "group_by_length": false, "length_column_name": "length", ' + '"report_to": ["tensorboard"], "ddp_find_unused_parameters": null, ' + '"ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, ' + '"dataloader_pin_memory": true, "dataloader_persistent_workers": false, ' + '"skip_memory_metrics": true, "use_legacy_prediction_loop": false, ' + '"push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, ' + '"hub_strategy": "every_save", "hub_token": "", "hub_private_repo": ' + 'false, "hub_always_push": false, "gradient_checkpointing": false, ' + '"gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, ' + '"fp16_backend": "auto", "push_to_hub_model_id": null, ' + '"push_to_hub_organization": null, "push_to_hub_token": "", ' + '"mp_parameters": "", "auto_find_batch_size": false, "full_determinism": ' + 'false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, ' + '"torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": ' + 'null, "dispatch_batches": null, "split_batches": null, "include_tokens_per_' + 'second": false, "include_num_input_tokens_seen": false, ' + '"neftune_noise_alpha": null}\'', + ], + resources=models.V1ResourceRequirements( + requests={"cpu": "1", "memory": "1Gi"}, + limits={"cpu": "1", "memory": "1Gi"}, + ), + volume_mounts=[ + training_models.V1VolumeMount( + name="storage-initializer", + mount_path="/workspace", + ) + ], + ) + ] + + expected_master_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + 
annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + init_containers=expected_init_container, + containers=expected_container, + volumes=[ + models.V1Volume( + name="storage-initializer", + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=exp_name + ), + ) + ], + ), + ) + + expected_worker_pod = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={"sidecar.istio.io/inject": "false"} + ), + spec=models.V1PodSpec( + containers=expected_container, + volumes=[ + models.V1Volume( + name="storage-initializer", + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=exp_name + ), + ) + ], + ), + ) + + expected_job = training_models.KubeflowOrgV1PyTorchJob( + api_version="kubeflow.org/v1", + kind="PyTorchJob", + spec=training_models.KubeflowOrgV1PyTorchJobSpec( + run_policy=training_models.KubeflowOrgV1RunPolicy( + clean_pod_policy=None + ), + pytorch_replica_specs={ + "Master": training_models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=expected_master_pod, + ), + "Worker": training_models.KubeflowOrgV1ReplicaSpec( + replicas=2, + template=expected_worker_pod, + ), + }, + nproc_per_node="1", + ), + ) + + expected_trial_template = models.V1beta1TrialTemplate( + primary_container_name="pytorch", + trial_parameters=[ + models.V1beta1TrialParameterSpec( + name="learning_rate", reference="learning_rate" + ), + models.V1beta1TrialParameterSpec(name="r", reference="r"), + ], + retain=False, + trial_spec=expected_job, + ) + + expected_parameters = [ + models.V1beta1ParameterSpec( + name="learning_rate", + parameter_type="double", + feasible_space=models.V1beta1FeasibleSpace(min="1e-05", max="5e-05"), + ), + models.V1beta1ParameterSpec( + name="r", + parameter_type="int", + feasible_space=models.V1beta1FeasibleSpace(min="8", max="32"), + ), + ] + + self.assertEqual(experiment.spec.objective.type, "maximize") + self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") + self.assertEqual(experiment.spec.objective.goal, 0.9) + self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") + self.assertEqual(experiment.spec.max_trial_count, 10) + self.assertEqual(experiment.spec.parallel_trial_count, 2) + self.assertEqual(experiment.spec.max_failed_trial_count, 1) + self.assertEqual(experiment.spec.parameters, expected_parameters) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template.spec.init_containers, + expected_init_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].replicas, + 1, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ].template, + expected_master_pod, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template.spec.containers, + expected_container, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].replicas, + 2, + ) + self.assertEqual( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Worker" + ].template, + expected_worker_pod, + ) + self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) + self.assertEqual(experiment.spec.trial_template, 
expected_trial_template) + +if __name__ == "__main__": + unittest.main() From 04a7e3904a2fff1fe798b847fff1c7dc326b7f6e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 20:56:37 +0800 Subject: [PATCH 02/22] update Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client.py | 59 +++++++------------ 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 5de5f5dbee9..49c5d88e584 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -625,46 +625,29 @@ class name in this argument. volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], ) - storage_initializer_volume = models.V1Volume( - name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=name - ), + container_spec = training_utils.get_container_spec( + name=JOB_PARAMETERS[PYTORCHJOB_KIND]["container"], + base_image=TRAINER_TRANSFORMER_IMAGE, + args=[ + "--model_uri", + model_provider_parameters.model_uri, + "--transformer_type", + model_provider_parameters.transformer_type.__name__, + "--num_labels", + str(model_provider_parameters.num_labels), + "--model_dir", + VOLUME_PATH_MODEL, + "--dataset_dir", + VOLUME_PATH_DATASET, + "--lora_config", + f"'{lora_config}'", + "--training_parameters", + f"'{training_args}'", + ], + volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], + resources=resources_per_trial.resources_per_worker, ) - if isinstance(resources_per_trial, types.TrainerResources): - from kubeflow.training import models as training_models - - if ( - resources_per_trial.num_workers is None - or resources_per_trial.num_workers < 1 - ): - raise ValueError("At least one Worker for PyTorchJob must be set") - - # Create container spec. - container_spec = utils.get_container_spec( - name=constants.PYTORCHJOB_PRIMARY_CONTAINER_NAME, - base_image=TRAINER_TRANSFORMER_IMAGE, - args=[ - "--model_uri", - model_provider_parameters.model_uri, - "--transformer_type", - model_provider_parameters.transformer_type.__name__, - "--num_labels", - str(model_provider_parameters.num_labels), - "--model_dir", - VOLUME_PATH_MODEL, - "--dataset_dir", - VOLUME_PATH_DATASET, - "--lora_config", - f"'{json.dumps(lora_config.__dict__, cls=utils.SetEncoder)}'", - "--training_parameters", - f"'{json.dumps(training_args.to_dict())}'", - ], - volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker, - ) - # Create the worker and the master pod. 
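# Sketch of the trial job this code path produces (assumption, consistent with
# the PyTorchJob expectations in the unit tests): for
# TrainerResources(num_workers=N, num_procs_per_worker=P), tune() builds one
# Master replica plus N - 1 Worker replicas and sets nproc_per_node:
#   pytorchjob.spec.pytorch_replica_specs = {
#       "Master": KubeflowOrgV1ReplicaSpec(replicas=1, template=master_pod),
#       "Worker": KubeflowOrgV1ReplicaSpec(replicas=N - 1, template=worker_pod),
#   }
#   pytorchjob.spec.nproc_per_node = str(P)
# e.g. num_workers=3 yields Master: 1 replica and Worker: 2 replicas, matching
# the assertions in test_experiment_creation_with_external_model.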
storage_initializer_volume = models.V1Volume( name=STORAGE_INITIALIZER, From 8c4d65ac6bea775eb3e3f1f45c390cecc3ee471c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 21:00:41 +0800 Subject: [PATCH 03/22] fix format Signed-off-by: helenxie-bit --- test/unit/v1beta1/tune-api/test_tune_api.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py index bbd1b0ef111..3d7b04ff25e 100644 --- a/test/unit/v1beta1/tune-api/test_tune_api.py +++ b/test/unit/v1beta1/tune-api/test_tune_api.py @@ -1,19 +1,19 @@ import unittest from unittest import TestCase -from unittest.mock import Mock -from unittest.mock import patch +from unittest.mock import Mock, patch -from kubeflow.katib import KatibClient -from kubeflow.katib import models import kubeflow.katib as katib -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams +import transformers +from kubeflow.katib import KatibClient, models +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) from kubeflow.training import models as training_models from kubernetes import client from kubernetes.client.exceptions import ApiException from peft import LoraConfig -import transformers class TestTuneAPI(TestCase): From b0195a63048ed6321c0118027c8359526a6c2b56 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 23:16:43 +0800 Subject: [PATCH 04/22] update unit tests and fix api errors Signed-off-by: helenxie-bit --- .github/workflows/test-python.yaml | 10 ++++++ .../kubeflow/katib/api/katib_client.py | 12 +++++-- test/unit/v1beta1/tune-api/test_tune_api.py | 35 ++++--------------- 3 files changed, 26 insertions(+), 31 deletions(-) diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index fe1b7cf68b5..aa4c6043d94 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -22,6 +22,16 @@ jobs: uses: actions/setup-python@v5 with: python-version: 3.11 + + - name: Install Katib SDK + shell: bash + run: pip install --prefer-binary -e sdk/python/v1beta1 + + - name: Install Training Operator SDK + shell: bash + run: | + pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python + pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0 - name: Run Python test run: make pytest diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 49c5d88e584..e675162abea 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -416,6 +416,12 @@ class name in this argument. # If users choose to use a custom objective function. if objective is not None: + if ( + not base_image + or not parameters + ): + raise ValueError("One of the required parameters is None") + # Add metrics collector to the Katib Experiment. # Up to now, we only support parameter `kind`, of which default value # is `StdOut`, to specify the kind of metrics collector. @@ -645,7 +651,7 @@ class name in this argument. 
f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker, + resources=resources_per_trial.resources_per_worker if resources_per_trial else None, ) # Create the worker and the master pod. @@ -679,7 +685,7 @@ class name in this argument. ), ) - if resources_per_trial.num_procs_per_worker: + if resources_per_trial is not None and resources_per_trial.num_procs_per_worker: pytorchjob.spec.nproc_per_node = str( resources_per_trial.num_procs_per_worker ) @@ -691,7 +697,7 @@ class name in this argument. ) ) - if resources_per_trial.num_workers > 1: + if resources_per_trial is not None and resources_per_trial.num_workers > 1: pytorchjob.spec.pytorch_replica_specs["Worker"] = ( training_models.KubeflowOrgV1ReplicaSpec( replicas=resources_per_trial.num_workers - 1, diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py index 3d7b04ff25e..4772c6fa7dc 100644 --- a/test/unit/v1beta1/tune-api/test_tune_api.py +++ b/test/unit/v1beta1/tune-api/test_tune_api.py @@ -167,28 +167,7 @@ def test_tune_invalid_env_per_trial(self): self.assertIn("Incorrect value for env_per_trial", str(context.exception)) - # Case 2: Invalid resources_per_trial.num_workers (for distributed training) - def test_tune_invalid_resources_per_trial_value(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - objective=lambda x: x, - parameters={ - "a": katib.search.int(min=10, max=100), - "b": katib.search.double(min=0.1, max=0.2), - }, - resources_per_trial=katib.TrainerResources( - num_workers=0, # Invalid value, should be at least 1 - num_procs_per_worker=1, - resources_per_worker={"cpu": "1", "memory": "1Gi"}, - ), - ) - - self.assertIn( - "At least one Worker for PyTorchJob must be set", str(context.exception) - ) - - # Case 3: Invalid model_provider_parameters + # Case 2: Invalid model_provider_parameters def test_tune_invalid_model_provider_parameters(self): with self.assertRaises(ValueError) as context: self.katib_client.tune( @@ -221,7 +200,7 @@ def test_tune_invalid_model_provider_parameters(self): str(context.exception), ) - # Case 4: Invalid dataset_provider_parameters + # Case 3: Invalid dataset_provider_parameters def test_tune_invalid_dataset_provider_parameters(self): with self.assertRaises(ValueError) as context: self.katib_client.tune( @@ -255,7 +234,7 @@ def test_tune_invalid_dataset_provider_parameters(self): str(context.exception), ) - # Case 5: Invalid trainer_parameters.training_parameters + # Case 4: Invalid trainer_parameters.training_parameters def test_tune_invalid_trainer_parameters_training_parameters(self): with self.assertRaises(TypeError) as context: self.katib_client.tune( @@ -283,7 +262,7 @@ def test_tune_invalid_trainer_parameters_training_parameters(self): str(context.exception), ) - # Case 6: Invalid trainer_parameters.lora_config + # Case 5: Invalid trainer_parameters.lora_config def test_tune_invalid_trainer_parameters_lora_config(self): with self.assertRaises(TypeError) as context: self.katib_client.tune( @@ -545,9 +524,9 @@ def test_experiment_creation_with_custom_objective( "({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n" "\n" "EOM\n" - 'printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py"\n' - 'python3 -u "$program_path/ephemeral_script.py"' - ], + 'printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py\n' + 'python3 -u $program_path/ephemeral_objective.py' + ], 
resources=models.V1ResourceRequirements( requests={"cpu": "1", "memory": "1Gi"}, limits={"cpu": "1", "memory": "1Gi"}, From a92de6755cd8282ac006c9fcd10ce1f4cc04772a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 23:19:05 +0800 Subject: [PATCH 05/22] fix format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index e675162abea..2453774a70d 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -416,12 +416,9 @@ class name in this argument. # If users choose to use a custom objective function. if objective is not None: - if ( - not base_image - or not parameters - ): + if not base_image or not parameters: raise ValueError("One of the required parameters is None") - + # Add metrics collector to the Katib Experiment. # Up to now, we only support parameter `kind`, of which default value # is `StdOut`, to specify the kind of metrics collector. @@ -651,7 +648,11 @@ class name in this argument. f"'{training_args}'", ], volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT], - resources=resources_per_trial.resources_per_worker if resources_per_trial else None, + resources=( + resources_per_trial.resources_per_worker + if resources_per_trial + else None + ), ) # Create the worker and the master pod. @@ -685,7 +686,10 @@ class name in this argument. ), ) - if resources_per_trial is not None and resources_per_trial.num_procs_per_worker: + if ( + resources_per_trial is not None + and resources_per_trial.num_procs_per_worker + ): pytorchjob.spec.nproc_per_node = str( resources_per_trial.num_procs_per_worker ) From 7b7e3479406dbf9303686bd29d38d135c876b852 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 23:43:54 +0800 Subject: [PATCH 06/22] test Signed-off-by: helenxie-bit --- .github/workflows/test-python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index aa4c6043d94..4ddbd79c3a9 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -21,7 +21,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.10 - name: Install Katib SDK shell: bash From e4f7922ac448257485dcd6377d2b4aa663ba0d15 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 5 Sep 2024 23:50:37 +0800 Subject: [PATCH 07/22] test Signed-off-by: helenxie-bit --- .github/workflows/test-python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 4ddbd79c3a9..c530bfafe43 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -21,7 +21,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.10 + python-version: '3.10' - name: Install Katib SDK shell: bash From e621fc625e081755633bc9aa9922f1f5ec59f8a0 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 9 Sep 2024 00:34:35 -0700 Subject: [PATCH 08/22] update unit tests Signed-off-by: helenxie-bit --- .github/workflows/test-python.yaml | 12 +- .../kubeflow/katib/api/katib_client.py | 5 +- .../kubeflow/katib/api/katib_client_test.py | 296 ++++++ test/unit/v1beta1/tune-api/test_tune_api.py | 890 ------------------ 4 files 
changed, 300 insertions(+), 903 deletions(-) delete mode 100644 test/unit/v1beta1/tune-api/test_tune_api.py diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index c530bfafe43..fe1b7cf68b5 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -21,17 +21,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: '3.10' - - - name: Install Katib SDK - shell: bash - run: pip install --prefer-binary -e sdk/python/v1beta1 - - - name: Install Training Operator SDK - shell: bash - run: | - pip install git+https://github.com/kubeflow/training-operator.git@v1.8-branch#subdirectory=sdk/python - pip install peft==0.3.0 datasets==2.15.0 transformers==4.38.0 + python-version: 3.11 - name: Run Python test run: make pytest diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 2453774a70d..6bc7a8936f2 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -415,8 +415,8 @@ class name in this argument. experiment.spec.max_failed_trial_count = max_failed_trial_count # If users choose to use a custom objective function. - if objective is not None: - if not base_image or not parameters: + if objective is not None or parameters is not None: + if not objective or not base_image or not parameters: raise ValueError("One of the required parameters is None") # Add metrics collector to the Katib Experiment. @@ -572,6 +572,7 @@ class name in this argument. pvc_list = self.core_api.list_namespaced_persistent_volume_claim( namespace ) + print("pvc_list:", pvc_list) # Check if the PVC with the specified name exists. for pvc in pvc_list.items: if pvc.metadata.name == name: diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index fef18adfa0f..74eaac45445 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -57,6 +57,27 @@ def get_observation_log_response(*args, **kwargs): ) +def create_namespaced_persistent_volume_claim_response(*args, **kwargs): + if kwargs.get("namespace") == "pvc creation failed": + raise Exception("PVC creation failed") + else: + return {"metadata": {"name": "tune_test"}} + + +def list_namespaced_persistent_volume_claim_response(*args, **kwargs): + if args[0] == "pvc creation failed": + mock_pvc = Mock() + mock_pvc.metadata.name = "pvc_failed" + mock_list = Mock() + mock_list.items = [mock_pvc] + else: + mock_pvc = Mock() + mock_pvc.metadata.name = "tune_test" + mock_list = Mock() + mock_list.items = [mock_pvc] + return mock_list + + def generate_trial_template() -> V1beta1TrialTemplate: trial_spec = { "apiVersion": "batch/v1", @@ -270,6 +291,217 @@ def create_experiment( ] +# Mock classes for testing +class MockTransformerType: + __name__ = "MockTransformerType" + + +class HuggingFaceModelParams: + def __init__( + self, + model_uri=None, + transformer_type=MockTransformerType, + access_token=None, + num_labels=None, + ): + self.model_uri = model_uri + self.transformer_type = transformer_type + self.access_token = access_token + self.num_labels = num_labels + + +class HuggingFaceDatasetParams: + def __init__(self, repo_id=None, access_token=None, split=None): + self.repo_id = repo_id + self.access_token = access_token + self.split = split + + +class HuggingFaceTrainerParams: + def 
__init__(self, training_parameters=None, lora_config=None): + self.training_parameters = training_parameters + self.lora_config = lora_config + + +class S3DatasetParams: + def __init__( + self, + endpoint_url=None, + bucket_name=None, + file_key=None, + region_name=None, + access_key=None, + secret_key=None, + ): + self.endpoint_url = endpoint_url + self.bucket_name = bucket_name + self.file_key = file_key + self.region_name = region_name + self.access_key = access_key + self.secret_key = secret_key + + +class PyTorchJobSpec: + def __init__( + self, + elastic_policy=None, + nproc_per_node=None, + pytorch_replica_specs=None, + run_policy=None, + ): + self.elastic_policy = elastic_policy + self.nproc_per_node = nproc_per_node + self.pytorch_replica_specs = pytorch_replica_specs + self.run_policy = run_policy + + +class PyTorchJob: + def __init__( + self, + api_version=None, + kind=None, + metadata=None, + spec=PyTorchJobSpec, + status=None, + ): + self.api_version = api_version + self.kind = kind + self.metadata = metadata + self.spec = spec + self.status = status + + +test_tune_data = [ + ( + "not specify name", + { + "name": None, + "objective": lambda x: x, + "parameters": {"param": "value"}, + }, + ValueError, + ), + ( + "set both options", + { + "name": "tune_test", + "objective": lambda x: x, + "model_provider_parameters": HuggingFaceModelParams(), + }, + ValueError, + ), + ( + "set no option", + { + "name": "tune_test", + }, + ValueError, + ), + ( + "set one option but missing parameters - only objective", + { + "name": "tune_test", + "objective": lambda x: x, + }, + ValueError, + ), + ( + "set one option but missing parameters - only parameters", + { + "name": "tune_test", + "parameters": {"param": "value"}, + }, + ValueError, + ), + ( + "set one option but missing parameters - only model_provider_parameters", + { + "name": "tune_test", + "model_provider_parameters": HuggingFaceModelParams(), + }, + ValueError, + ), + ( + "set one option but missing parameters - only dataset_provider_parameters", + { + "name": "tune_test", + "dataset_provider_parameters": HuggingFaceDatasetParams(), + }, + ValueError, + ), + ( + "set one option but missing parameters - only trainer_parameters", + { + "name": "tune_test", + "trainer_parameters": HuggingFaceTrainerParams(), + }, + ValueError, + ), + ( + "invalid env_per_trial", + { + "name": "tune_test", + "objective": lambda x: x, + "parameters": {"param": "value"}, + "env_per_trial": "invalid", + }, + ValueError, + ), + ( + "invalid model_provider_parameters", + { + "name": "tune_test", + "model_provider_parameters": "invalid", + "dataset_provider_parameters": HuggingFaceDatasetParams(), + "trainer_parameters": HuggingFaceTrainerParams(), + }, + ValueError, + ), + ( + "invalid dataset_provider_parameters", + { + "name": "tune_test", + "model_provider_parameters": HuggingFaceModelParams(), + "dataset_provider_parameters": "invalid", + "trainer_parameters": HuggingFaceTrainerParams(), + }, + ValueError, + ), + ( + "pvc creation failed", + { + "name": "tune_test", + "namespace": "pvc creation failed", + "model_provider_parameters": HuggingFaceModelParams(), + "dataset_provider_parameters": HuggingFaceDatasetParams(), + "trainer_parameters": HuggingFaceTrainerParams(), + }, + RuntimeError, + ), + ( + "valid flow with custom objective", + { + "name": "tune_test", + "namespace": "tune", + "objective": lambda x: x, + "parameters": {"param": "value"}, + }, + TEST_RESULT_SUCCESS, + ), + ( + "valid flow with external models and datasets", + { + "name": 
"tune_test", + "namespace": "tune", + "model_provider_parameters": HuggingFaceModelParams(), + "dataset_provider_parameters": HuggingFaceDatasetParams(), + "trainer_parameters": HuggingFaceTrainerParams(), + }, + TEST_RESULT_SUCCESS, + ), +] + + @pytest.fixture def katib_client(): with patch( @@ -284,6 +516,16 @@ def katib_client(): return_value=Mock( GetObservationLog=Mock(side_effect=get_observation_log_response) ), + ), patch( + "kubernetes.client.CoreV1Api", + return_value=Mock( + create_namespaced_persistent_volume_claim=Mock( + side_effect=create_namespaced_persistent_volume_claim_response + ), + list_namespaced_persistent_volume_claim=Mock( + side_effect=list_namespaced_persistent_volume_claim_response + ), + ), ): client = KatibClient() yield client @@ -320,3 +562,57 @@ def test_get_trial_metrics(katib_client, test_name, kwargs, expected_output): except Exception as e: assert type(e) is expected_output print("test execution complete") + + +@pytest.mark.parametrize("test_name,kwargs,expected_output", test_tune_data) +def test_tune(katib_client, test_name, kwargs, expected_output): + """ + test tune function of katib client + """ + print("\n\nExecuting test:", test_name) + + JOB_PARAMETERS = { + "PyTorchJob": { + "model": "KubeflowOrgV1PyTorchJob", + "plural": "pytorchjobs", + "container": "pytorch", + "base_image": "docker.io/pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime", + } + } + + with patch( + "kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams", + HuggingFaceModelParams, + ), patch( + "kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams", + HuggingFaceDatasetParams, + ), patch( + "kubeflow.storage_initializer.hugging_face.HuggingFaceTrainerParams", + HuggingFaceTrainerParams, + ), patch( + "kubeflow.storage_initializer.s3.S3DatasetParams", S3DatasetParams + ), patch( + "kubeflow.training.models.KubeflowOrgV1PyTorchJob", PyTorchJob + ), patch( + "kubeflow.training.constants.constants.JOB_PARAMETERS", JOB_PARAMETERS + ), patch( + "kubeflow.katib.utils.utils.get_trial_substitutions_from_trainer", + return_value={"param": "value"}, + ), patch.dict( + "sys.modules", + { + "kubeflow.storage_initializer.constants": Mock(), + "kubeflow.training.models": Mock(), + "kubeflow.training.utils": Mock(), + "kubeflow.training.constants": Mock(), + }, + ), patch.object( + katib_client, "create_experiment", return_value=Mock() + ) as mock_create_experiment: + try: + katib_client.tune(**kwargs) + mock_create_experiment.assert_called_once() + assert expected_output == TEST_RESULT_SUCCESS + except Exception as e: + assert type(e) is expected_output + print("Test execution complete for:", test_name) diff --git a/test/unit/v1beta1/tune-api/test_tune_api.py b/test/unit/v1beta1/tune-api/test_tune_api.py deleted file mode 100644 index 4772c6fa7dc..00000000000 --- a/test/unit/v1beta1/tune-api/test_tune_api.py +++ /dev/null @@ -1,890 +0,0 @@ -import unittest -from unittest import TestCase -from unittest.mock import Mock, patch - -import kubeflow.katib as katib -import transformers -from kubeflow.katib import KatibClient, models -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - HuggingFaceModelParams, - HuggingFaceTrainerParams, -) -from kubeflow.training import models as training_models -from kubernetes import client -from kubernetes.client.exceptions import ApiException -from peft import LoraConfig - - -class TestTuneAPI(TestCase): - # Create an instance of the KatibClient - def setUp(self): - self.katib_client = 
KatibClient(namespace="default") - - # Test input - # Test for missing required parameters - def test_tune_missing_name(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name=None, - objective=lambda x: x, - parameters={ - "a": katib.search.int(min=10, max=100), - "b": katib.search.double(min=0.1, max=0.2), - }, - ) - - self.assertIn("Please specify name for the Experiment.", str(context.exception)) - - # Test for invalid hyperparameter optimization configuration - # Case 1: Set two options: 1) external models and datasets; 2) custom objective at the same time - def test_tune_invalid_with_model_provider_and_objective(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=Mock(), - objective=lambda x: x, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - def test_tune_invalid_with_dataset_provider_and_objective(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - dataset_provider_parameters=Mock(), - objective=lambda x: x, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - def test_tune_invalid_with_trainer_parameters_and_objective(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - trainer_parameters=Mock(), - objective=lambda x: x, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - def test_tune_invalid_with_model_provider_and_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=Mock(), - parameters={"lr": Mock()}, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - def test_tune_invalid_with_dataset_provider_and_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - dataset_provider_parameters=Mock(), - parameters={"lr": Mock()}, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - def test_tune_invalid_with_trainer_parameters_and_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - trainer_parameters=Mock(), - parameters={"lr": Mock()}, - ) - - self.assertIn("Invalid configuration", str(context.exception)) - - # Case 2: Missing parameters when choosing one option - def test_tune_invalid_with_only_model_provider(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=Mock(), - ) - - self.assertIn("One of the required parameters is None", str(context.exception)) - - def test_tune_invalid_with_only_dataset_provider(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - dataset_provider_parameters=Mock(), - ) - - self.assertIn("One of the required parameters is None", str(context.exception)) - - def test_tune_invalid_with_only_trainer_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - trainer_parameters=Mock(), - ) - - self.assertIn("One of the required parameters is None", str(context.exception)) - - def test_tune_invalid_with_only_objective(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - objective=lambda x: x, - ) - - self.assertIn("One of the required parameters is None", str(context.exception)) - - def 
test_tune_invalid_with_only_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - parameters={"lr": Mock()}, - ) - - self.assertIn("One of the required parameters is None", str(context.exception)) - - # Case 3: No parameters provided - def test_tune_no_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune(name="experiment") - - self.assertIn("Invalid configuration", str(context.exception)) - - # Test for invalid parameters - # Case 1: Invalid env_per_trial - def test_tune_invalid_env_per_trial(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - objective=lambda x: x, - parameters={ - "a": katib.search.int(min=10, max=100), - "b": katib.search.double(min=0.1, max=0.2), - }, - env_per_trial=[123], # Invalid type - ) - - self.assertIn("Incorrect value for env_per_trial", str(context.exception)) - - # Case 2: Invalid model_provider_parameters - def test_tune_invalid_model_provider_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceModelParams - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - ) - - self.assertIn( - "Model provider parameters must be an instance of HuggingFaceModelParams", - str(context.exception), - ) - - # Case 3: Invalid dataset_provider_parameters - def test_tune_invalid_dataset_provider_parameters(self): - with self.assertRaises(ValueError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - dataset_provider_parameters=123, # Invalid type, should be an instance of HuggingFaceDatasetParameters or S3DatasetParams - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. 
- lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - ) - - self.assertIn( - "Dataset provider parameters must be an instance of S3DatasetParams or HuggingFaceDatasetParams", - str(context.exception), - ) - - # Case 4: Invalid trainer_parameters.training_parameters - def test_tune_invalid_trainer_parameters_training_parameters(self): - with self.assertRaises(TypeError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - not_a_valid_parameter="no", - ), - lora_config=LoraConfig(), - ), - ) - - self.assertIn( - "TrainingArguments.__init__() got an unexpected keyword argument", - str(context.exception), - ) - - # Case 5: Invalid trainer_parameters.lora_config - def test_tune_invalid_trainer_parameters_lora_config(self): - with self.assertRaises(TypeError) as context: - self.katib_client.tune( - name="experiment", - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - ), - lora_config=LoraConfig( - not_a_valid_parameter="no", - ), - ), - ) - - self.assertIn( - "LoraConfig.__init__() got an unexpected keyword argument", - str(context.exception), - ) - - # Test functionality - # Test PVC creation - # Case 1: PVC successfully created - @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") - @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") - @patch("kubeflow.katib.KatibClient.create_experiment") - def test_pvc_creation(self, mock_create_experiment, mock_list_pvc, mock_create_pvc): - mock_create_pvc.return_value = Mock() - mock_list_pvc.return_value = Mock(items=[]) - mock_create_experiment.return_value = Mock() - - exp_name = "experiment" - storage_config = { - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - } - self.katib_client.tune( - name=exp_name, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. 
- lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - objective_metric_name="accuracy", - storage_config=storage_config, - ) - - expected_pvc_spec = models.V1PersistentVolumeClaim( - api_version="v1", - kind="PersistentVolumeClaim", - metadata={"name": exp_name, "namespace": "default"}, - spec=models.V1PersistentVolumeClaimSpec( - access_modes=storage_config["access_modes"], - resources=models.V1ResourceRequirements( - requests={"storage": storage_config["size"]} - ), - ), - ) - - mock_create_pvc.assert_called_once_with( - namespace="default", body=expected_pvc_spec - ) - - # Case 2: PVC already exists - @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") - @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") - @patch("kubeflow.katib.KatibClient.create_experiment") - def test_pvc_creation_with_existing_pvc( - self, mock_create_experiment, mock_list_pvc, mock_create_pvc - ): - # Simulate an ApiException being raised when trying to create a PVC - mock_create_pvc.side_effect = ApiException(status=409, reason="Already exists") - - # Simulate existing PVC in the list - mock_existing_pvc = Mock() - mock_existing_pvc.metadata.name = "test-pvc" - mock_list_pvc.return_value = Mock(items=[mock_existing_pvc]) - - mock_create_experiment.return_value = Mock() - - exp_name = "test-pvc" - storage_config = { - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - } - self.katib_client.tune( - name=exp_name, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. 
- lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - objective_metric_name="accuracy", - storage_config=storage_config, - ) - - # Assert that create_namespaced_persistent_volume_claim was called once - mock_create_pvc.assert_called_once() - - # Assert that list_namespaced_persistent_volume_claim was called to check existing PVCs - mock_list_pvc.assert_called_once_with("default") - - # Ensure no exception is raised since the PVC already exists - self.assertTrue(mock_list_pvc.return_value.items[0].metadata.name == exp_name) - - # Case 3: PVC creation fails - @patch("kubernetes.client.CoreV1Api.create_namespaced_persistent_volume_claim") - @patch("kubernetes.client.CoreV1Api.list_namespaced_persistent_volume_claim") - @patch("kubeflow.katib.KatibClient.create_experiment") - def test_pvc_creation_fails( - self, mock_create_experiment, mock_list_pvc, mock_create_pvc - ): - # Simulate an ApiException being raised when trying to create a PVC - mock_create_pvc.side_effect = ApiException( - status=500, reason="Internal Server Error" - ) - - # Simulate no existing PVC in the list - mock_list_pvc.return_value = Mock(items=[]) - - mock_create_experiment.return_value = Mock() - - exp_name = "test-pvc" - storage_config = { - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - } - with self.assertRaises(RuntimeError) as context: - self.katib_client.tune( - name=exp_name, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. 
- lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - objective_metric_name="accuracy", - storage_config=storage_config, - ) - - # Assert that the appropriate error message is raised - self.assertIn("failed to create PVC", str(context.exception)) - - # Assert that create_namespaced_persistent_volume_claim was called once - mock_create_pvc.assert_called_once() - - # Assert that list_namespaced_persistent_volume_claim was called once - mock_list_pvc.assert_called_once_with("default") - - # Test container, pod, job/pytorchjob, trial template, and experiment creation - # Case 1: Custom objective - @patch("kubeflow.katib.KatibClient.create_experiment") - def test_experiment_creation_with_custom_objective( - self, mock_create_experiment - ): - self.katib_client.tune( - name="experiment", - objective=lambda x: x, - parameters={ - "a": katib.search.int(min=10, max=100), - "b": katib.search.double(min=0.1, max=0.2), - }, - objective_metric_name="accuracy", - objective_goal=0.9, - max_trial_count=10, - parallel_trial_count=2, - max_failed_trial_count=1, - resources_per_trial={"cpu": "1", "memory": "1Gi"}, - ) - - mock_create_experiment.assert_called_once() - args, kwargs = mock_create_experiment.call_args - experiment = args[0] - - expected_container = [ - models.V1Container( - name="training-container", - image="docker.io/tensorflow/tensorflow:2.13.0", - command=["bash", "-c"], - args=[ - "\n" - "program_path=$(mktemp -d)\n" - "read -r -d '' SCRIPT << EOM\n" - "\n" - "objective=lambda x: x,\n" - "\n" - "({'a': '${trialParameters.a}', 'b': '${trialParameters.b}'})\n" - "\n" - "EOM\n" - 'printf "%s" "$SCRIPT" > $program_path/ephemeral_objective.py\n' - 'python3 -u $program_path/ephemeral_objective.py' - ], - resources=models.V1ResourceRequirements( - requests={"cpu": "1", "memory": "1Gi"}, - limits={"cpu": "1", "memory": "1Gi"}, - ), - ) - ] - - expected_pod = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=models.V1PodSpec( - containers=expected_container, - restart_policy="Never", - ), - ) - - expected_job = client.V1Job( - api_version="batch/v1", - kind="Job", - spec=client.V1JobSpec( - template=expected_pod, - ), - ) - - expected_trial_template = models.V1beta1TrialTemplate( - primary_container_name="training-container", - trial_parameters=[ - models.V1beta1TrialParameterSpec(name="a", reference="a"), - models.V1beta1TrialParameterSpec(name="b", reference="b"), - ], - retain=False, - trial_spec=expected_job, - ) - - expected_parameters = [ - models.V1beta1ParameterSpec( - name="a", - parameter_type="int", - feasible_space=models.V1beta1FeasibleSpace(min="10", max="100"), - ), - models.V1beta1ParameterSpec( - name="b", - parameter_type="double", - feasible_space=models.V1beta1FeasibleSpace(min="0.1", max="0.2"), - ), - ] - - self.assertEqual(experiment.spec.objective.type, "maximize") - self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") - self.assertEqual(experiment.spec.objective.goal, 0.9) - self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") - self.assertEqual(experiment.spec.max_trial_count, 10) - self.assertEqual(experiment.spec.parallel_trial_count, 2) - self.assertEqual(experiment.spec.max_failed_trial_count, 1) - self.assertEqual(experiment.spec.parameters, expected_parameters) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.template.spec.containers, - expected_container, - ) - 
self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.template, expected_pod - ) - self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) - self.assertEqual(experiment.spec.trial_template, expected_trial_template) - - # Case 2: External models and datasets - @patch("kubeflow.katib.KatibClient.create_experiment") - def test_experiment_creation_with_external_model( - self, mock_create_experiment - ): - exp_name = "experiment" - self.katib_client.tune( - name=exp_name, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # Use 3000 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_tune_api", - save_strategy="no", - learning_rate=katib.search.double(min=1e-05, max=5e-05), - num_train_epochs=1, - logging_dir="test_tune_api/logs", - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=katib.search.int(min=8, max=32), - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - objective_metric_name="accuracy", - objective_goal=0.9, - max_trial_count=10, - parallel_trial_count=2, - max_failed_trial_count=1, - resources_per_trial=katib.TrainerResources( - num_workers=3, - num_procs_per_worker=1, - resources_per_worker={"cpu": "1", "memory": "1Gi"}, - ), - ) - - mock_create_experiment.assert_called_once() - args, kwargs = mock_create_experiment.call_args - experiment = args[0] - - expected_init_container = [ - models.V1Container( - name="storage-initializer", - image="docker.io/kubeflow/storage-initializer", - args=[ - "--model_provider", - "hf", - "--model_provider_parameters", - '{"model_uri": "hf://google-bert/bert-base-cased", "transformer_type": "AutoModelForSequenceClassification", ' - '"access_token": null, "num_labels": 5}', - "--dataset_provider", - "hf", - "--dataset_provider_parameters", - '{"repo_id": "yelp_review_full", "access_token": null, "split": "train[:8]"}', - ], - volume_mounts=[ - training_models.V1VolumeMount( - name="storage-initializer", - mount_path="/workspace", - ) - ], - ) - ] - - expected_container = [ - models.V1Container( - name="pytorch", - image="docker.io/kubeflow/trainer-huggingface", - args=[ - "--model_uri", - "hf://google-bert/bert-base-cased", - "--transformer_type", - "AutoModelForSequenceClassification", - "--num_labels", - "5", - "--model_dir", - "/workspace/model", - "--dataset_dir", - "/workspace/dataset", - "--lora_config", - '\'{"peft_type": "LORA", "base_model_name_or_path": null, "task_type": null, ' - '"inference_mode": false, "r": "${trialParameters.r}", "target_modules": null, ' - '"lora_alpha": 8, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", ' - '"modules_to_save": null, "init_lora_weights": true}\'', - "--training_parameters", - '\'{"output_dir": "test_tune_api", "overwrite_output_dir": false, "do_train": ' - 'false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", ' - '"prediction_loss_only": false, "per_device_train_batch_size": 8, ' - '"per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, ' - '"per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 1, ' - 
'"eval_accumulation_steps": null, "eval_delay": 0, "learning_rate": ' - '"${trialParameters.learning_rate}", "weight_decay": 0.0, "adam_beta1": 0.9, ' - '"adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, ' - '"num_train_epochs": 1, "max_steps": -1, "lr_scheduler_type": "linear", ' - '"lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, ' - '"log_level": "passive", "log_level_replica": "warning", "log_on_each_node": ' - 'true, "logging_dir": "test_tune_api/logs", "logging_strategy": "steps", ' - '"logging_first_step": false, "logging_steps": 500, "logging_nan_inf_filter": ' - 'true, "save_strategy": "no", "save_steps": 500, "save_total_limit": null, ' - '"save_safetensors": true, "save_on_each_node": false, "save_only_model": ' - 'false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": ' - '42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": ' - 'false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": ' - '"auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, ' - '"local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, ' - '"tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, ' - '"eval_steps": null, "dataloader_num_workers": 0, "dataloader_prefetch_factor": ' - 'null, "past_index": -1, "run_name": "test_tune_api", "disable_tqdm": false, ' - '"remove_unused_columns": true, "label_names": null, "load_best_model_at_end": ' - 'false, "metric_for_best_model": null, "greater_is_better": null, ' - '"ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, ' - '"fsdp_config": {"min_num_params": 0, "xla": false, "xla_fsdp_v2": false, ' - '"xla_fsdp_grad_ckpt": false}, "fsdp_transformer_layer_cls_to_wrap": null, ' - '"accelerator_config": {"split_batches": false, "dispatch_batches": null, ' - '"even_batches": true, "use_seedable_sampler": true}, "deepspeed": null, ' - '"label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, ' - '"adafactor": false, "group_by_length": false, "length_column_name": "length", ' - '"report_to": ["tensorboard"], "ddp_find_unused_parameters": null, ' - '"ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, ' - '"dataloader_pin_memory": true, "dataloader_persistent_workers": false, ' - '"skip_memory_metrics": true, "use_legacy_prediction_loop": false, ' - '"push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, ' - '"hub_strategy": "every_save", "hub_token": "", "hub_private_repo": ' - 'false, "hub_always_push": false, "gradient_checkpointing": false, ' - '"gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, ' - '"fp16_backend": "auto", "push_to_hub_model_id": null, ' - '"push_to_hub_organization": null, "push_to_hub_token": "", ' - '"mp_parameters": "", "auto_find_batch_size": false, "full_determinism": ' - 'false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, ' - '"torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": ' - 'null, "dispatch_batches": null, "split_batches": null, "include_tokens_per_' - 'second": false, "include_num_input_tokens_seen": false, ' - '"neftune_noise_alpha": null}\'', - ], - resources=models.V1ResourceRequirements( - requests={"cpu": "1", "memory": "1Gi"}, - limits={"cpu": "1", "memory": "1Gi"}, - ), - volume_mounts=[ - training_models.V1VolumeMount( - name="storage-initializer", - mount_path="/workspace", - ) - ], - ) - ] - - expected_master_pod = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - 
annotations={"sidecar.istio.io/inject": "false"} - ), - spec=models.V1PodSpec( - init_containers=expected_init_container, - containers=expected_container, - volumes=[ - models.V1Volume( - name="storage-initializer", - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=exp_name - ), - ) - ], - ), - ) - - expected_worker_pod = models.V1PodTemplateSpec( - metadata=models.V1ObjectMeta( - annotations={"sidecar.istio.io/inject": "false"} - ), - spec=models.V1PodSpec( - containers=expected_container, - volumes=[ - models.V1Volume( - name="storage-initializer", - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=exp_name - ), - ) - ], - ), - ) - - expected_job = training_models.KubeflowOrgV1PyTorchJob( - api_version="kubeflow.org/v1", - kind="PyTorchJob", - spec=training_models.KubeflowOrgV1PyTorchJobSpec( - run_policy=training_models.KubeflowOrgV1RunPolicy( - clean_pod_policy=None - ), - pytorch_replica_specs={ - "Master": training_models.KubeflowOrgV1ReplicaSpec( - replicas=1, - template=expected_master_pod, - ), - "Worker": training_models.KubeflowOrgV1ReplicaSpec( - replicas=2, - template=expected_worker_pod, - ), - }, - nproc_per_node="1", - ), - ) - - expected_trial_template = models.V1beta1TrialTemplate( - primary_container_name="pytorch", - trial_parameters=[ - models.V1beta1TrialParameterSpec( - name="learning_rate", reference="learning_rate" - ), - models.V1beta1TrialParameterSpec(name="r", reference="r"), - ], - retain=False, - trial_spec=expected_job, - ) - - expected_parameters = [ - models.V1beta1ParameterSpec( - name="learning_rate", - parameter_type="double", - feasible_space=models.V1beta1FeasibleSpace(min="1e-05", max="5e-05"), - ), - models.V1beta1ParameterSpec( - name="r", - parameter_type="int", - feasible_space=models.V1beta1FeasibleSpace(min="8", max="32"), - ), - ] - - self.assertEqual(experiment.spec.objective.type, "maximize") - self.assertEqual(experiment.spec.objective.objective_metric_name, "accuracy") - self.assertEqual(experiment.spec.objective.goal, 0.9) - self.assertEqual(experiment.spec.algorithm.algorithm_name, "random") - self.assertEqual(experiment.spec.max_trial_count, 10) - self.assertEqual(experiment.spec.parallel_trial_count, 2) - self.assertEqual(experiment.spec.max_failed_trial_count, 1) - self.assertEqual(experiment.spec.parameters, expected_parameters) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Master" - ].template.spec.init_containers, - expected_init_container, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Master" - ].template.spec.containers, - expected_container, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Master" - ].replicas, - 1, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Master" - ].template, - expected_master_pod, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Worker" - ].template.spec.containers, - expected_container, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Worker" - ].replicas, - 2, - ) - self.assertEqual( - experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ - "Worker" - ].template, - expected_worker_pod, - ) - self.assertEqual(experiment.spec.trial_template.trial_spec, expected_job) - self.assertEqual(experiment.spec.trial_template, 
expected_trial_template) - -if __name__ == "__main__": - unittest.main() From 9c0a9e6de76467926a4d28b7f757b898a4e940d8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 9 Sep 2024 00:36:07 -0700 Subject: [PATCH 09/22] undo changes to Makefile Signed-off-by: helenxie-bit --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index e3d62a8ca38..a6708de7f5b 100755 --- a/Makefile +++ b/Makefile @@ -172,7 +172,6 @@ pytest: prepare-pytest prepare-pytest-testdata pytest ./test/unit/v1beta1/suggestion --ignore=./test/unit/v1beta1/suggestion/test_skopt_service.py pytest ./test/unit/v1beta1/earlystopping pytest ./test/unit/v1beta1/metricscollector - pytest ./test/unit/v1beta1/tune-api cp ./pkg/apis/manager/v1beta1/python/api_pb2.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2.py cp ./pkg/apis/manager/v1beta1/python/api_pb2_grpc.py ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py sed -i "s/api_pb2/kubeflow\.katib\.katib_api_pb2/g" ./sdk/python/v1beta1/kubeflow/katib/katib_api_pb2_grpc.py From f5c4bce662b1d314dfab0d0e38e2267c9beeaa88 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 9 Sep 2024 00:40:02 -0700 Subject: [PATCH 10/22] delete debug code Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6bc7a8936f2..c6a4f95c126 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -416,7 +416,7 @@ class name in this argument. # If users choose to use a custom objective function. if objective is not None or parameters is not None: - if not objective or not base_image or not parameters: + if objective is None or base_image is None or parameters is None: raise ValueError("One of the required parameters is None") # Add metrics collector to the Katib Experiment. @@ -507,9 +507,9 @@ class name in this argument. # If users choose to use external models and datasets. else: if ( - not model_provider_parameters - or not dataset_provider_parameters - or not trainer_parameters + model_provider_parameters is None + or dataset_provider_parameters is None + or trainer_parameters is None ): raise ValueError("One of the required parameters is None") @@ -572,7 +572,6 @@ class name in this argument. pvc_list = self.core_api.list_namespaced_persistent_volume_claim( namespace ) - print("pvc_list:", pvc_list) # Check if the PVC with the specified name exists. 
for pvc in pvc_list.items: if pvc.metadata.name == name: From 5ddcc30e8a2a3d87e1154fcba13bf2e082181b37 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 9 Sep 2024 00:42:56 -0700 Subject: [PATCH 11/22] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 74eaac45445..0755d6d533a 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -615,4 +615,4 @@ def test_tune(katib_client, test_name, kwargs, expected_output): assert expected_output == TEST_RESULT_SUCCESS except Exception as e: assert type(e) is expected_output - print("Test execution complete for:", test_name) + print("test execution complete") From 4909456179735d06741fcf1dcf98879cd6b5178a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 10 Sep 2024 20:42:13 -0700 Subject: [PATCH 12/22] update unit test Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client_test.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 0755d6d533a..f8ded73eeff 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -1,6 +1,6 @@ import multiprocessing from typing import List, Optional -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch import kubeflow.katib.katib_api_pb2 as katib_api_pb2 import pytest @@ -341,12 +341,12 @@ def __init__( self.secret_key = secret_key -class PyTorchJobSpec: +class KubeflowOrgV1PyTorchJobSpec: def __init__( self, elastic_policy=None, nproc_per_node=None, - pytorch_replica_specs=None, + pytorch_replica_specs={}, run_policy=None, ): self.elastic_policy = elastic_policy @@ -355,13 +355,13 @@ def __init__( self.run_policy = run_policy -class PyTorchJob: +class KubeflowOrgV1PyTorchJob: def __init__( self, api_version=None, kind=None, metadata=None, - spec=PyTorchJobSpec, + spec=KubeflowOrgV1PyTorchJobSpec, status=None, ): self.api_version = api_version @@ -571,6 +571,7 @@ def test_tune(katib_client, test_name, kwargs, expected_output): """ print("\n\nExecuting test:", test_name) + PYTORCHJOB_KIND = "PyTorchJob" JOB_PARAMETERS = { "PyTorchJob": { "model": "KubeflowOrgV1PyTorchJob", @@ -580,7 +581,20 @@ def test_tune(katib_client, test_name, kwargs, expected_output): } } - with patch( + with patch.dict( + "sys.modules", + { + "kubeflow.storage_initializer": Mock(), + "kubeflow.storage_initializer.hugging_face": Mock(), + "kubeflow.storage_initializer.s3": Mock(), + "kubeflow.storage_initializer.constants": Mock(), + "kubeflow.training": MagicMock(), + "kubeflow.training.models": Mock(), + "kubeflow.training.utils": Mock(), + "kubeflow.training.constants": Mock(), + "kubeflow.training.constants.constants": Mock(), + }), \ + patch( "kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams", HuggingFaceModelParams, ), patch( @@ -592,21 +606,16 @@ def test_tune(katib_client, test_name, kwargs, expected_output): ), patch( "kubeflow.storage_initializer.s3.S3DatasetParams", S3DatasetParams ), patch( - "kubeflow.training.models.KubeflowOrgV1PyTorchJob", PyTorchJob + 
"kubeflow.training.models.KubeflowOrgV1PyTorchJob", KubeflowOrgV1PyTorchJob ), patch( "kubeflow.training.constants.constants.JOB_PARAMETERS", JOB_PARAMETERS + ), patch( + "kubeflow.training.constants.constants.PYTORCHJOB_KIND", PYTORCHJOB_KIND ), patch( "kubeflow.katib.utils.utils.get_trial_substitutions_from_trainer", return_value={"param": "value"}, - ), patch.dict( - "sys.modules", - { - "kubeflow.storage_initializer.constants": Mock(), - "kubeflow.training.models": Mock(), - "kubeflow.training.utils": Mock(), - "kubeflow.training.constants": Mock(), - }, - ), patch.object( + ), \ + patch.object( katib_client, "create_experiment", return_value=Mock() ) as mock_create_experiment: try: From 1e788409ad8fee5f697b452267acaf6ce6ec6d3a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 10 Sep 2024 20:44:12 -0700 Subject: [PATCH 13/22] fix format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index f8ded73eeff..7a833b6dcf3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -583,7 +583,7 @@ def test_tune(katib_client, test_name, kwargs, expected_output): with patch.dict( "sys.modules", - { + { "kubeflow.storage_initializer": Mock(), "kubeflow.storage_initializer.hugging_face": Mock(), "kubeflow.storage_initializer.s3": Mock(), @@ -593,8 +593,8 @@ def test_tune(katib_client, test_name, kwargs, expected_output): "kubeflow.training.utils": Mock(), "kubeflow.training.constants": Mock(), "kubeflow.training.constants.constants": Mock(), - }), \ - patch( + }, + ), patch( "kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams", HuggingFaceModelParams, ), patch( @@ -614,8 +614,7 @@ def test_tune(katib_client, test_name, kwargs, expected_output): ), patch( "kubeflow.katib.utils.utils.get_trial_substitutions_from_trainer", return_value={"param": "value"}, - ), \ - patch.object( + ), patch.object( katib_client, "create_experiment", return_value=Mock() ) as mock_create_experiment: try: From e68fe38f5f994526146defc38afb66a6d79cd0aa Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 11 Sep 2024 22:53:08 -0600 Subject: [PATCH 14/22] update the version of training operator Signed-off-by: helenxie-bit --- sdk/python/v1beta1/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/setup.py b/sdk/python/v1beta1/setup.py index 78ae02aa739..ae7e1365363 100644 --- a/sdk/python/v1beta1/setup.py +++ b/sdk/python/v1beta1/setup.py @@ -86,6 +86,6 @@ ], install_requires=REQUIRES, extras_require={ - "huggingface": ["kubeflow-training[huggingface]==1.8.0"], + "huggingface": ["kubeflow-training[huggingface]==1.8.1"], }, ) From d3a340458781b8cff7a52ec55e61bb469c18092b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 8 Oct 2024 22:04:32 -0700 Subject: [PATCH 15/22] adjust 'list_namespaced_persistent_volume_claim' to be called with keyword argument Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index c6a4f95c126..707639aa8ba 100644 --- 
a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -570,7 +570,7 @@ class name in this argument.
             )
         except Exception as e:
             pvc_list = self.core_api.list_namespaced_persistent_volume_claim(
-                namespace
+                namespace=namespace
             )
             # Check if the PVC with the specified name exists.
             for pvc in pvc_list.items:
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
index 7a833b6dcf3..afe9c8f2ee5 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
@@ -65,7 +65,7 @@ def create_namespaced_persistent_volume_claim_response(*args, **kwargs):


 def list_namespaced_persistent_volume_claim_response(*args, **kwargs):
-    if args[0] == "pvc creation failed":
+    if kwargs.get("namespace") == "pvc creation failed":
         mock_pvc = Mock()
         mock_pvc.metadata.name = "pvc_failed"
         mock_list = Mock()

From 6d5c20ef381799162e9ec7d2396ccc17f34e01d0 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Tue, 8 Oct 2024 22:21:57 -0700
Subject: [PATCH 16/22] create constant for namespace when checking pvc creation error

Signed-off-by: helenxie-bit
---
 .../v1beta1/kubeflow/katib/api/katib_client_test.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
index afe9c8f2ee5..391956a81ff 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
@@ -18,6 +18,8 @@ from kubeflow.katib.constants import constants
 from kubernetes.client import V1ObjectMeta

+PVC_FAILED = "pvc creation failed"
+
 TEST_RESULT_SUCCESS = "success"

@@ -58,14 +60,14 @@ def get_observation_log_response(*args, **kwargs):


 def create_namespaced_persistent_volume_claim_response(*args, **kwargs):
-    if kwargs.get("namespace") == "pvc creation failed":
+    if kwargs.get("namespace") == PVC_FAILED:
         raise Exception("PVC creation failed")
     else:
         return {"metadata": {"name": "tune_test"}}


 def list_namespaced_persistent_volume_claim_response(*args, **kwargs):
-    if kwargs.get("namespace") == "pvc creation failed":
+    if kwargs.get("namespace") == PVC_FAILED:
         mock_pvc = Mock()
         mock_pvc.metadata.name = "pvc_failed"
         mock_list = Mock()
@@ -471,7 +473,7 @@ def __init__(
         "pvc creation failed",
         {
             "name": "tune_test",
-            "namespace": "pvc creation failed",
+            "namespace": PVC_FAILED,

From b25f7ba37a60710191666b936ed47598c2de4ac2 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Tue, 8 Oct 2024 22:28:49 -0700
Subject: [PATCH 17/22] add type check for 'trainer_parameters'

Signed-off-by: helenxie-bit
---
 sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 6 ++++++
 .../v1beta1/kubeflow/katib/api/katib_client_test.py   | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 707639aa8ba..6a742831907 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -521,6 +521,7 @@ class name in this argument.
from kubeflow.storage_initializer.hugging_face import ( HuggingFaceDatasetParams, HuggingFaceModelParams, + HuggingFaceTrainerParams, ) from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.training import models as training_models @@ -598,6 +599,11 @@ class name in this argument. "Dataset provider parameters must be an instance of S3DatasetParams " "or HuggingFaceDatasetParams." ) + + if not isinstance(trainer_parameters, HuggingFaceTrainerParams): + raise ValueError( + "Trainer parameters must be an instance of HuggingFaceTrainerParams." + ) # Iterate over input parameters and do substitutions. experiment_params = [] diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 391956a81ff..7217ec9345c 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -469,6 +469,16 @@ def __init__( }, ValueError, ), + ( + "invalid trainer_parameters", + { + "name": "tune_test", + "model_provider_parameters": HuggingFaceModelParams(), + "dataset_provider_parameters": HuggingFaceDatasetParams(), + "trainer_parameters": "invalid", + }, + ValueError, + ), ( "pvc creation failed", { From 3ebbe76e83704cd8a40628602a64e2c21f744a99 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 8 Oct 2024 22:40:17 -0700 Subject: [PATCH 18/22] fix format Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 6a742831907..01b7fdd69c3 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -599,7 +599,7 @@ class name in this argument. "Dataset provider parameters must be an instance of S3DatasetParams " "or HuggingFaceDatasetParams." ) - + if not isinstance(trainer_parameters, HuggingFaceTrainerParams): raise ValueError( "Trainer parameters must be an instance of HuggingFaceTrainerParams." 
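
A note on the guard added in PATCH 17: tune() provisions the storage-initializer PVC before it validates the provider objects, so the new check is easiest to exercise with the Kubernetes client mocked out, the same way the pytest fixture does it. The snippet below is an illustrative sketch rather than part of the series; it assumes the SDK is installed with the huggingface extra, and the experiment name is made up:

    from unittest.mock import Mock, patch

    import transformers
    from kubeflow.katib import KatibClient
    from kubeflow.storage_initializer.hugging_face import (
        HuggingFaceDatasetParams,
        HuggingFaceModelParams,
    )

    # Stub kubeconfig loading and CoreV1Api so no cluster is needed; the
    # mocked PVC calls then succeed and tune() reaches the type checks.
    with patch("kubernetes.config.load_kube_config", return_value=Mock()), patch(
        "kubernetes.client.CoreV1Api", return_value=Mock()
    ):
        client = KatibClient(namespace="default")
        try:
            client.tune(
                name="trainer-params-guard",  # illustrative name
                model_provider_parameters=HuggingFaceModelParams(
                    model_uri="hf://google-bert/bert-base-cased",
                    transformer_type=transformers.AutoModelForSequenceClassification,
                ),
                dataset_provider_parameters=HuggingFaceDatasetParams(
                    repo_id="yelp_review_full",
                    split="train[:8]",
                ),
                trainer_parameters="invalid",  # deliberately the wrong type
            )
        except ValueError as err:
            # Expected message, per the check added above:
            # "Trainer parameters must be an instance of HuggingFaceTrainerParams."
            print(err)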
From 0498237bae09d78eaae2d977898feeac74268804 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 10 Oct 2024 11:20:10 -0700 Subject: [PATCH 19/22] update test names Signed-off-by: helenxie-bit --- .../kubeflow/katib/api/katib_client_test.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 7217ec9345c..840136c0ed9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -375,7 +375,7 @@ def __init__( test_tune_data = [ ( - "not specify name", + "missing name", { "name": None, "objective": lambda x: x, @@ -384,7 +384,7 @@ def __init__( ValueError, ), ( - "set both options", + "invalid hybrid parameters - objective and model_provider_parameters", { "name": "tune_test", "objective": lambda x: x, @@ -393,14 +393,14 @@ def __init__( ValueError, ), ( - "set no option", + "missing parameters", { "name": "tune_test", }, ValueError, ), ( - "set one option but missing parameters - only objective", + "missing parameters in custom objective tuning - lack parameters", { "name": "tune_test", "objective": lambda x: x, @@ -408,7 +408,7 @@ def __init__( ValueError, ), ( - "set one option but missing parameters - only parameters", + "missing parameters in custom objective tuning - lack objective", { "name": "tune_test", "parameters": {"param": "value"}, @@ -416,7 +416,7 @@ def __init__( ValueError, ), ( - "set one option but missing parameters - only model_provider_parameters", + "missing parameters in external model tuning - lack dataset_provider_parameters and trainer_parameters", { "name": "tune_test", "model_provider_parameters": HuggingFaceModelParams(), @@ -424,7 +424,7 @@ def __init__( ValueError, ), ( - "set one option but missing parameters - only dataset_provider_parameters", + "missing parameters in external model tuning - lack model_provider_parameters and trainer_parameters", { "name": "tune_test", "dataset_provider_parameters": HuggingFaceDatasetParams(), @@ -432,7 +432,7 @@ def __init__( ValueError, ), ( - "set one option but missing parameters - only trainer_parameters", + "missing parameters in external model tuning - lack model_provider_parameters and dataset_provider_parameters", { "name": "tune_test", "trainer_parameters": HuggingFaceTrainerParams(), @@ -491,7 +491,7 @@ def __init__( RuntimeError, ), ( - "valid flow with custom objective", + "valid flow with custom objective tuning", { "name": "tune_test", "namespace": "tune", @@ -501,7 +501,7 @@ def __init__( TEST_RESULT_SUCCESS, ), ( - "valid flow with external models and datasets", + "valid flow with external model tuning", { "name": "tune_test", "namespace": "tune", From 15f6a7af05163e1edf19c90b7c41d178255f21f2 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 10 Oct 2024 11:24:05 -0700 Subject: [PATCH 20/22] fix format Signed-off-by: helenxie-bit --- .../v1beta1/kubeflow/katib/api/katib_client_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 840136c0ed9..6f91e973f5f 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py @@ -416,7 +416,8 @@ def __init__( ValueError, ), ( - "missing parameters in external model tuning - lack 
dataset_provider_parameters and trainer_parameters",
+        "missing parameters in external model tuning - lack dataset_provider_parameters "
+        "and trainer_parameters",
         {
             "name": "tune_test",
             "model_provider_parameters": HuggingFaceModelParams(),
@@ -424,7 +425,8 @@ def __init__(
         ValueError,
     ),
     (
-        "missing parameters in external model tuning - lack model_provider_parameters and trainer_parameters",
+        "missing parameters in external model tuning - lack model_provider_parameters "
+        "and trainer_parameters",
         {
             "name": "tune_test",
             "dataset_provider_parameters": HuggingFaceDatasetParams(),
@@ -432,7 +434,8 @@ def __init__(
         ValueError,
     ),
     (
-        "missing parameters in external model tuning - lack model_provider_parameters and dataset_provider_parameters",
+        "missing parameters in external model tuning - lack model_provider_parameters "
+        "and dataset_provider_parameters",
         {
             "name": "tune_test",
             "trainer_parameters": HuggingFaceTrainerParams(),

From 86db6d5ebbbd6dd4fe499b041947997bc963d2cb Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Mon, 21 Oct 2024 21:20:20 -0700
Subject: [PATCH 21/22] add verification for key Experiment information & add 'kubeflow-training[huggingface]' to dependencies

Signed-off-by: helenxie-bit
---
 .../kubeflow/katib/api/katib_client_test.py | 300 +++++++++---------
 test/unit/v1beta1/requirements.txt          |   1 +
 2 files changed, 152 insertions(+), 149 deletions(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
index 6f91e973f5f..8ebb1b9783c 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
@@ -1,9 +1,11 @@
 import multiprocessing
 from typing import List, Optional
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import Mock, patch

+import kubeflow.katib as katib
 import kubeflow.katib.katib_api_pb2 as katib_api_pb2
 import pytest
+import transformers
 from kubeflow.katib import (
     KatibClient,
     V1beta1AlgorithmSpec,
@@ -16,6 +18,11 @@
     V1beta1TrialTemplate,
 )
 from kubeflow.katib.constants import constants
+from kubeflow.storage_initializer.hugging_face import (
+    HuggingFaceDatasetParams,
+    HuggingFaceModelParams,
+    HuggingFaceTrainerParams,
+)
 from kubernetes.client import V1ObjectMeta

 PVC_FAILED = "pvc creation failed"

 TEST_RESULT_SUCCESS = "success"
@@ -293,93 +300,13 @@ def create_experiment(
 ]


-# Mock classes for testing
-class MockTransformerType:
-    __name__ = "MockTransformerType"
-
-
-class HuggingFaceModelParams:
-    def __init__(
-        self,
-        model_uri=None,
-        transformer_type=MockTransformerType,
-        access_token=None,
-        num_labels=None,
-    ):
-        self.model_uri = model_uri
-        self.transformer_type = transformer_type
-        self.access_token = access_token
-        self.num_labels = num_labels
-
-
-class HuggingFaceDatasetParams:
-    def __init__(self, repo_id=None, access_token=None, split=None):
-        self.repo_id = repo_id
-        self.access_token = access_token
-        self.split = split
-
-
-class HuggingFaceTrainerParams:
-    def __init__(self, training_parameters=None, lora_config=None):
-        self.training_parameters = training_parameters
-        self.lora_config = lora_config
-
-
-class S3DatasetParams:
-    def __init__(
-        self,
-        endpoint_url=None,
-        bucket_name=None,
-        file_key=None,
-        region_name=None,
-        access_key=None,
-        secret_key=None,
-    ):
-        self.endpoint_url = endpoint_url
-        self.bucket_name = bucket_name
-        self.file_key = file_key
-        self.region_name = region_name
-        self.access_key = access_key
-        self.secret_key = secret_key
-
-
-class 
KubeflowOrgV1PyTorchJobSpec: - def __init__( - self, - elastic_policy=None, - nproc_per_node=None, - pytorch_replica_specs={}, - run_policy=None, - ): - self.elastic_policy = elastic_policy - self.nproc_per_node = nproc_per_node - self.pytorch_replica_specs = pytorch_replica_specs - self.run_policy = run_policy - - -class KubeflowOrgV1PyTorchJob: - def __init__( - self, - api_version=None, - kind=None, - metadata=None, - spec=KubeflowOrgV1PyTorchJobSpec, - status=None, - ): - self.api_version = api_version - self.kind = kind - self.metadata = metadata - self.spec = spec - self.status = status - - test_tune_data = [ ( "missing name", { "name": None, "objective": lambda x: x, - "parameters": {"param": "value"}, + "parameters": {"a": katib.search.int(min=10, max=100)}, }, ValueError, ), @@ -388,12 +315,16 @@ def __init__( { "name": "tune_test", "objective": lambda x: x, - "model_provider_parameters": HuggingFaceModelParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), }, ValueError, ), ( - "missing parameters", + "missing parameters - not setting custom objective tuning or external model tuning", { "name": "tune_test", }, @@ -411,7 +342,7 @@ def __init__( "missing parameters in custom objective tuning - lack objective", { "name": "tune_test", - "parameters": {"param": "value"}, + "parameters": {"a": katib.search.int(min=10, max=100)}, }, ValueError, ), @@ -420,7 +351,11 @@ def __init__( "and trainer_parameters", { "name": "tune_test", - "model_provider_parameters": HuggingFaceModelParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), }, ValueError, ), @@ -429,7 +364,10 @@ def __init__( "and trainer_parameters", { "name": "tune_test", - "dataset_provider_parameters": HuggingFaceDatasetParams(), + "dataset_provider_parameters": HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:3000]", + ), }, ValueError, ), @@ -438,7 +376,12 @@ def __init__( "and dataset_provider_parameters", { "name": "tune_test", - "trainer_parameters": HuggingFaceTrainerParams(), + "trainer_parameters": HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + ), + ), }, ValueError, ), @@ -447,7 +390,7 @@ def __init__( { "name": "tune_test", "objective": lambda x: x, - "parameters": {"param": "value"}, + "parameters": {"a": katib.search.int(min=10, max=100)}, "env_per_trial": "invalid", }, ValueError, @@ -457,8 +400,16 @@ def __init__( { "name": "tune_test", "model_provider_parameters": "invalid", - "dataset_provider_parameters": HuggingFaceDatasetParams(), - "trainer_parameters": HuggingFaceTrainerParams(), + "dataset_provider_parameters": HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:3000]", + ), + "trainer_parameters": HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + ), + ), }, ValueError, ), @@ -466,9 +417,18 @@ def __init__( "invalid dataset_provider_parameters", { "name": "tune_test", - "model_provider_parameters": HuggingFaceModelParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + 
transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), "dataset_provider_parameters": "invalid", - "trainer_parameters": HuggingFaceTrainerParams(), + "trainer_parameters": HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + ), + ), }, ValueError, ), @@ -476,8 +436,15 @@ def __init__( "invalid trainer_parameters", { "name": "tune_test", - "model_provider_parameters": HuggingFaceModelParams(), - "dataset_provider_parameters": HuggingFaceDatasetParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + "dataset_provider_parameters": HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:3000]", + ), "trainer_parameters": "invalid", }, ValueError, @@ -487,9 +454,21 @@ def __init__( { "name": "tune_test", "namespace": PVC_FAILED, - "model_provider_parameters": HuggingFaceModelParams(), - "dataset_provider_parameters": HuggingFaceDatasetParams(), - "trainer_parameters": HuggingFaceTrainerParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + "dataset_provider_parameters": HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:3000]", + ), + "trainer_parameters": HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + ), + ), }, RuntimeError, ), @@ -497,9 +476,8 @@ def __init__( "valid flow with custom objective tuning", { "name": "tune_test", - "namespace": "tune", "objective": lambda x: x, - "parameters": {"param": "value"}, + "parameters": {"a": katib.search.int(min=10, max=100)}, }, TEST_RESULT_SUCCESS, ), @@ -507,10 +485,21 @@ def __init__( "valid flow with external model tuning", { "name": "tune_test", - "namespace": "tune", - "model_provider_parameters": HuggingFaceModelParams(), - "dataset_provider_parameters": HuggingFaceDatasetParams(), - "trainer_parameters": HuggingFaceTrainerParams(), + "model_provider_parameters": HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + "dataset_provider_parameters": HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:3000]", + ), + "trainer_parameters": HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_tune_api", + learning_rate=katib.search.double(min=1e-05, max=5e-05), + ), + ), }, TEST_RESULT_SUCCESS, ), @@ -586,56 +575,69 @@ def test_tune(katib_client, test_name, kwargs, expected_output): """ print("\n\nExecuting test:", test_name) - PYTORCHJOB_KIND = "PyTorchJob" - JOB_PARAMETERS = { - "PyTorchJob": { - "model": "KubeflowOrgV1PyTorchJob", - "plural": "pytorchjobs", - "container": "pytorch", - "base_image": "docker.io/pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime", - } - } - - with patch.dict( - "sys.modules", - { - "kubeflow.storage_initializer": Mock(), - "kubeflow.storage_initializer.hugging_face": Mock(), - "kubeflow.storage_initializer.s3": Mock(), - "kubeflow.storage_initializer.constants": Mock(), - "kubeflow.training": MagicMock(), - "kubeflow.training.models": Mock(), - 
"kubeflow.training.utils": Mock(), - "kubeflow.training.constants": Mock(), - "kubeflow.training.constants.constants": Mock(), - }, - ), patch( - "kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams", - HuggingFaceModelParams, - ), patch( - "kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams", - HuggingFaceDatasetParams, - ), patch( - "kubeflow.storage_initializer.hugging_face.HuggingFaceTrainerParams", - HuggingFaceTrainerParams, - ), patch( - "kubeflow.storage_initializer.s3.S3DatasetParams", S3DatasetParams - ), patch( - "kubeflow.training.models.KubeflowOrgV1PyTorchJob", KubeflowOrgV1PyTorchJob - ), patch( - "kubeflow.training.constants.constants.JOB_PARAMETERS", JOB_PARAMETERS - ), patch( - "kubeflow.training.constants.constants.PYTORCHJOB_KIND", PYTORCHJOB_KIND - ), patch( - "kubeflow.katib.utils.utils.get_trial_substitutions_from_trainer", - return_value={"param": "value"}, - ), patch.object( + with patch.object( katib_client, "create_experiment", return_value=Mock() ) as mock_create_experiment: try: katib_client.tune(**kwargs) mock_create_experiment.assert_called_once() - assert expected_output == TEST_RESULT_SUCCESS + + if expected_output == TEST_RESULT_SUCCESS: + assert expected_output == TEST_RESULT_SUCCESS + call_args = mock_create_experiment.call_args + experiment = call_args[0][0] + + if test_name == "valid flow with custom objective tuning": + # Verify input_params + args_content = "".join( + experiment.spec.trial_template.trial_spec.spec.template.spec.containers[ + 0 + ].args + ) + assert "'a': '${trialParameters.a}'" in args_content + # Verify trial_params + assert experiment.spec.trial_template.trial_parameters == [ + V1beta1TrialParameterSpec(name="a", reference="a"), + ] + # Verify experiment_params + assert experiment.spec.parameters == [ + V1beta1ParameterSpec( + name="a", + parameter_type="int", + feasible_space=V1beta1FeasibleSpace(min="10", max="100"), + ), + ] + + elif test_name == "valid flow with external model tuning": + # Verify input_params + args_content = "".join( + experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[ + "Master" + ] + .template.spec.containers[0] + .args + ) + assert ( + '"learning_rate": "${trialParameters.learning_rate}"' + in args_content + ) + # Verify trial_params + assert experiment.spec.trial_template.trial_parameters == [ + V1beta1TrialParameterSpec( + name="learning_rate", reference="learning_rate" + ), + ] + # Verify experiment_params + assert experiment.spec.parameters == [ + V1beta1ParameterSpec( + name="learning_rate", + parameter_type="double", + feasible_space=V1beta1FeasibleSpace( + min="1e-05", max="5e-05" + ), + ), + ] + except Exception as e: assert type(e) is expected_output print("test execution complete") diff --git a/test/unit/v1beta1/requirements.txt b/test/unit/v1beta1/requirements.txt index 2aa91b337e3..74402202c17 100644 --- a/test/unit/v1beta1/requirements.txt +++ b/test/unit/v1beta1/requirements.txt @@ -1,2 +1,3 @@ grpcio-testing==1.41.1 pytest==7.2.0 +kubeflow-training[huggingface]==1.8.1 From b24b44fc3c54a58173d007f4cf9f6a1311e677d3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 21 Oct 2024 21:44:23 -0700 Subject: [PATCH 22/22] rerun tests Signed-off-by: helenxie-bit --- sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py index 8ebb1b9783c..229a1af3b4f 100644 --- 
From b24b44fc3c54a58173d007f4cf9f6a1311e677d3 Mon Sep 17 00:00:00 2001
From: helenxie-bit
Date: Mon, 21 Oct 2024 21:44:23 -0700
Subject: [PATCH 22/22] rerun tests

Signed-off-by: helenxie-bit
---
 sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
index 8ebb1b9783c..229a1af3b4f 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
@@ -324,7 +324,7 @@ def create_experiment(
         ValueError,
     ),
     (
-        "missing parameters - not setting custom objective tuning or external model tuning",
+        "missing parameters - no custom objective or external model tuning",
        {
             "name": "tune_test",
        },
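With the series applied, the new tests can be run on their own. A minimal sketch using pytest's Python entry point; the first patch's Makefile target runs the test/unit/v1beta1/tune-api directory, while later patches in the series (such as this one) keep the tune cases in katib_client_test.py, so pick the path matching the checked-out revision:

    import sys

    import pytest

    # Run only the tune-api unit tests added by this series; equivalent to the
    # `pytest ./test/unit/v1beta1/tune-api` line in the Makefile's pytest target.
    sys.exit(pytest.main(["-q", "test/unit/v1beta1/tune-api"]))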