-
Notifications
You must be signed in to change notification settings - Fork 455
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Resume experiment with extra trials from last checkpoint #952
Changes from all commits
0382ae6
eaebf42
93d3f47
0252d2b
421ad52
96f7f23
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,7 @@ type Client interface { | |
GetClient() client.Client | ||
GetExperimentList(namespace ...string) (*experimentsv1alpha3.ExperimentList, error) | ||
CreateExperiment(experiment *experimentsv1alpha3.Experiment, namespace ...string) error | ||
UpdateExperiment(experiment *experimentsv1alpha3.Experiment, namespace ...string) error | ||
DeleteExperiment(experiment *experimentsv1alpha3.Experiment, namespace ...string) error | ||
GetExperiment(name string, namespace ...string) (*experimentsv1alpha3.Experiment, error) | ||
GetConfigMap(name string, namespace ...string) (map[string]string, error) | ||
|
@@ -123,6 +124,14 @@ func (k *KatibClient) CreateExperiment(experiment *experimentsv1alpha3.Experimen | |
return nil | ||
} | ||
|
||
func (k *KatibClient) UpdateExperiment(experiment *experimentsv1alpha3.Experiment, namespace ...string) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this method used for the UI? And it seems… There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure about it. Shall we do this separately? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this method is not needed for the topic of your PR, I think we'd better remove it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, this method is currently used by the resume_e2e_experiment.go script in this PR to update the experiment. |
||
|
||
if err := k.client.Update(context.Background(), experiment); err != nil { | ||
return err | ||
} | ||
return nil | ||
} | ||
|
||
func (k *KatibClient) DeleteExperiment(experiment *experimentsv1alpha3.Experiment, namespace ...string) error { | ||
|
||
if err := k.client.Delete(context.Background(), experiment); err != nil { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package main | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
"io/ioutil" | ||
"log" | ||
"os" | ||
"time" | ||
|
||
k8syaml "k8s.io/apimachinery/pkg/util/yaml" | ||
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
|
||
commonv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/common/v1alpha3" | ||
experimentsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1alpha3" | ||
"github.com/kubeflow/katib/pkg/util/v1alpha3/katibclient" | ||
) | ||
|
||
const ( | ||
// timeout is the overall deadline main uses while polling the experiment: one deadline covers both the wait for it to start running and the wait for it to complete. | ||
timeout = 30 * time.Minute | ||
) | ||
|
||
func verifyResult(exp *experimentsv1alpha3.Experiment) (*float64, error) { | ||
if len(exp.Status.CurrentOptimalTrial.ParameterAssignments) == 0 { | ||
return nil, fmt.Errorf("Best parameter assignments not updated in status") | ||
} | ||
|
||
if len(exp.Status.CurrentOptimalTrial.Observation.Metrics) == 0 { | ||
return nil, fmt.Errorf("Best metrics not updated in status") | ||
} | ||
|
||
metric := exp.Status.CurrentOptimalTrial.Observation.Metrics[0] | ||
if metric.Name != exp.Spec.Objective.ObjectiveMetricName { | ||
return nil, fmt.Errorf("Best objective metric not updated in status") | ||
} | ||
return &metric.Value, nil | ||
} | ||
|
||
func main() { | ||
if len(os.Args) != 2 { | ||
log.Fatal("Experiment name is missing") | ||
} | ||
expName := os.Args[1] | ||
b, err := ioutil.ReadFile(expName) | ||
if err != nil { | ||
log.Fatal("Error in reading file ", err) | ||
} | ||
exp := &experimentsv1alpha3.Experiment{} | ||
buf := bytes.NewBufferString(string(b)) | ||
if err = k8syaml.NewYAMLOrJSONDecoder(buf, 1024).Decode(exp); err != nil { | ||
log.Fatal("Yaml decode error ", err) | ||
} | ||
kclient, err := katibclient.NewClient(client.Options{}) | ||
if err != nil { | ||
log.Fatal("NewClient for Katib failed: ", err) | ||
} | ||
exp, err = kclient.GetExperiment(exp.Name, exp.Namespace) | ||
if err != nil { | ||
log.Fatal("Get Experiment error. Experiment not created yet ", err) | ||
} | ||
if exp.Spec.Algorithm.AlgorithmName != "hyperband" { | ||
// Hyperband will validate the parallel trial count, | ||
// thus we should not change it. | ||
var maxtrials int32 = 7 | ||
var paralleltrials int32 = 3 | ||
exp.Spec.MaxTrialCount = &maxtrials | ||
exp.Spec.ParallelTrialCount = ¶lleltrials | ||
} | ||
err = kclient.UpdateExperiment(exp) | ||
if err != nil { | ||
log.Fatal("UpdateExperiment from YAML failed: ", err) | ||
} | ||
endTime := time.Now().Add(timeout) | ||
for time.Now().Before(endTime) { | ||
log.Printf("Waiting for Experiment %s to start running.", exp.Name) | ||
exp, err = kclient.GetExperiment(exp.Name, exp.Namespace) | ||
if err != nil { | ||
log.Fatal("Get Experiment error ", err) | ||
} | ||
if exp.IsRunning() { | ||
log.Printf("Experiment %v started running", exp.Name) | ||
break | ||
} | ||
time.Sleep(5 * time.Second) | ||
} | ||
|
||
for time.Now().Before(endTime) { | ||
exp, err = kclient.GetExperiment(exp.Name, exp.Namespace) | ||
if err != nil { | ||
log.Fatal("Get Experiment error ", err) | ||
} | ||
log.Printf("Waiting for Experiment %s to finish.", exp.Name) | ||
log.Printf(`Experiment %s's trials: %d trials, %d pending trials, | ||
%d running trials, %d killed trials, %d succeeded trials, %d failed trials.`, | ||
exp.Name, | ||
exp.Status.Trials, exp.Status.TrialsPending, exp.Status.TrialsRunning, | ||
exp.Status.TrialsKilled, exp.Status.TrialsSucceeded, exp.Status.TrialsFailed) | ||
log.Printf("Optimal Trial for Experiment %s: %v", exp.Name, | ||
exp.Status.CurrentOptimalTrial) | ||
log.Printf("Experiment %s's conditions: %v", exp.Name, exp.Status.Conditions) | ||
|
||
suggestion, err := kclient.GetSuggestion(exp.Name, exp.Namespace) | ||
if err != nil { | ||
log.Printf("Get Suggestion error: %v", err) | ||
} else { | ||
log.Printf("Suggestion %s's conditions: %v", suggestion.Name, | ||
suggestion.Status.Conditions) | ||
log.Printf("Suggestion %s's suggestions: %v", suggestion.Name, | ||
suggestion.Status.Suggestions) | ||
} | ||
if exp.IsCompleted() { | ||
log.Printf("Experiment %v finished", exp.Name) | ||
break | ||
} | ||
time.Sleep(20 * time.Second) | ||
} | ||
|
||
if !exp.IsCompleted() { | ||
log.Fatal("Experiment run timed out") | ||
} | ||
|
||
metricVal, err := verifyResult(exp) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
if metricVal == nil { | ||
log.Fatal("Metric value in CurrentOptimalTrial not populated") | ||
} | ||
|
||
objectiveType := exp.Spec.Objective.Type | ||
goal := *exp.Spec.Objective.Goal | ||
if (objectiveType == commonv1alpha3.ObjectiveTypeMinimize && *metricVal < goal) || | ||
(objectiveType == commonv1alpha3.ObjectiveTypeMaximize && *metricVal > goal) { | ||
log.Print("Objective Goal reached") | ||
} else { | ||
|
||
if exp.Status.Trials != *exp.Spec.MaxTrialCount { | ||
log.Fatal("All trials are not run in the experiment ", exp.Status.Trials, exp.Spec.MaxTrialCount) | ||
} | ||
|
||
if exp.Status.TrialsSucceeded != *exp.Spec.MaxTrialCount { | ||
log.Fatal("All trials are not successful ", exp.Status.TrialsSucceeded, *exp.Spec.MaxTrialCount) | ||
} | ||
} | ||
log.Printf("Experiment has recorded best current Optimal Trial %v", exp.Status.CurrentOptimalTrial) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we consider this case: instance.Spec.MaxTrialCount == 0, so the experiment is marked completed with instance.Status.Trials == 0, and then instance.Spec.MaxTrialCount is updated to nil?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is an invalid case. It doesn't make sense for MaxTrialCount to be zero when it is set. We should add validation for this separately.
Related: #768