diff --git a/.github/workflows/test-charmed-katib.yaml b/.github/workflows/test-charmed-katib.yaml index 117531cd4af..a31f18499f3 100644 --- a/.github/workflows/test-charmed-katib.yaml +++ b/.github/workflows/test-charmed-katib.yaml @@ -5,6 +5,25 @@ on: - pull_request jobs: + lint: + name: Lint + runs-on: ubuntu-latest + + steps: + - name: Check out code + uses: actions/checkout@v2 + + - name: Install dependencies + run: | + sudo apt-get install python3-setuptools + sudo pip3 install black flake8 + + - name: Check black + run: black --check operators + + - name: Check flake8 + run: cd operators && flake8 + build: name: Test runs-on: ubuntu-latest @@ -25,7 +44,7 @@ jobs: sudo snap install juju --classic sudo snap install juju-helpers --classic sudo snap install juju-wait --classic - sudo apt update + sudo pip3 install charmcraft - name: Build Docker images run: | @@ -47,21 +66,14 @@ jobs: git clone git://git.launchpad.net/canonical-osm cp -r canonical-osm/charms/interfaces/juju-relation-mysql mysql sg microk8s -c 'juju bootstrap microk8s uk8s' - juju add-model katib + juju add-model kubeflow juju bundle deploy -b bundle-edge.yaml --build juju wait -wvt 300 - name: Test Katib run: | set -eux - kubectl run \ - --rm \ - -i \ - --restart=Never \ - --image=ubuntu \ - katib-check \ - -- \ - bash -c "apt update && apt install -y curl && curl -f http://katib-ui.katib.svc.cluster.local:8080/katib/" + kubectl apply -f examples/v1beta1/random-example.yaml - name: Get pod statuses run: kubectl get all -A @@ -71,14 +83,26 @@ jobs: run: juju status if: failure() - - name: Get katib-controller logs - run: kubectl logs --tail 100 -nkatib -ljuju-app=katib-controller + - name: Get katib-controller workload logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-app=katib-controller + if: failure() + + - name: Get katib-controller operator logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-operator=katib-controller + if: failure() + + - name: Get katib-ui workload logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-app=katib-ui + if: failure() + + - name: Get katib-ui operator logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-operator=katib-ui if: failure() - - name: Get katib-ui logs - run: kubectl logs --tail 100 -nkatib -ljuju-app=katib-ui + - name: Get katib-db-manager workload logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-app=katib-db-manager if: failure() - - name: Get katib-manager logs - run: kubectl logs --tail 100 -nkatib -ljuju-app=katib-manager + - name: Get katib-db-manager operator logs + run: kubectl logs --tail 100 -nkubeflow -ljuju-operator=katib-db-manager if: failure() diff --git a/.gitignore b/.gitignore index c89d79eb2e7..ee2a690312a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ __pycache__/ .coverage .pytest_cache *.egg-info +build/ +*.charm # Project specific ignore files *.swp diff --git a/operators/.flake8 b/operators/.flake8 new file mode 100644 index 00000000000..7da1f9608ee --- /dev/null +++ b/operators/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 100 diff --git a/operators/bundle-edge.yaml b/operators/bundle-edge.yaml index 1b37cbf38bd..e017c748a0c 100644 --- a/operators/bundle-edge.yaml +++ b/operators/bundle-edge.yaml @@ -4,7 +4,7 @@ applications: charm: katib-controller scale: 1 resources: - oci-image: docker.io/kubeflowkatib/katib-controller:latest + oci-image: rocks.canonical.com:5000/kubeflow/katib-controller:edge katib-db: charm: cs:~charmed-osm/mariadb-k8s scale: 1 diff --git a/operators/bundle.yaml b/operators/bundle.yaml index a3be92988c3..5e336fff694 100644 --- a/operators/bundle.yaml +++ b/operators/bundle.yaml @@ -1,8 +1,8 @@ bundle: kubernetes applications: - katib-controller: { charm: katib-controller, scale: 1, annotations: { gui-x: '0', gui-y: '0' } } - katib-db: { charm: cs:~charmed-osm/mariadb-k8s, scale: 1, annotations: { gui-x: '0', gui-y: '300' }, options: { database: katib } } - katib-db-manager: { charm: katib-db-manager, scale: 1, annotations: { gui-x: '300', gui-y: '0' } } - katib-ui: { charm: katib-ui, scale: 1, annotations: { gui-x: '300', gui-y: '300' } } + katib-controller: { charm: katib-controller, scale: 1 } + katib-db: { charm: cs:~charmed-osm/mariadb-k8s, scale: 1, options: { database: katib } } + katib-db-manager: { charm: katib-db-manager, scale: 1 } + katib-ui: { charm: katib-ui, scale: 1 } relations: - [katib-db-manager, katib-db] diff --git a/operators/katib-controller/layer.yaml b/operators/katib-controller/layer.yaml deleted file mode 100644 index 36be00576d9..00000000000 --- a/operators/katib-controller/layer.yaml +++ /dev/null @@ -1,6 +0,0 @@ -repo: https://github.com/juju-solutions/bundle-kubeflow.git -includes: - - "layer:caas-base" - - "layer:status" - - "layer:docker-resource" - - "interface:http" diff --git a/operators/katib-controller/metadata.yaml b/operators/katib-controller/metadata.yaml index d2072edfee2..28a0e85d250 100755 --- a/operators/katib-controller/metadata.yaml +++ b/operators/katib-controller/metadata.yaml @@ -8,14 +8,16 @@ description: | hyperparameters of applications written in any language of the users’ choice and natively supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] -maintainers: [Kenneth Koski ] +maintainers: + - Dominik Fleischmann + - Kenneth Koski series: [kubernetes] resources: oci-image: type: oci-image description: Backing OCI image auto-fetch: true - upstream-source: docker.io/kubeflowkatib/katib-controller:v1beta1-a96ff59 + upstream-source: rocks.canonical.com:5000/kubeflow/katib-controller:edge provides: katib-controller: interface: http diff --git a/operators/katib-controller/reactive/katib_controller.py b/operators/katib-controller/reactive/katib_controller.py deleted file mode 100644 index 6a856e6476d..00000000000 --- a/operators/katib-controller/reactive/katib_controller.py +++ /dev/null @@ -1,393 +0,0 @@ -import json -import os -from base64 import b64encode -from pathlib import Path -from subprocess import check_call - -import yaml - -from charmhelpers.core import hookenv -from charms import layer -from charms.reactive import clear_flag, hook, set_flag, when, when_not - - -@hook("upgrade-charm") -def upgrade_charm(): - clear_flag("charm.started") - - -@when("charm.started") -def charm_ready(): - layer.status.active("") - - -@when("layer.docker-resource.oci-image.changed") -def update_image(): - clear_flag("charm.started") - - -def gen_certs(namespace, service_name): - if Path("/run/cert.pem").exists(): - hookenv.log("Found existing cert.pem, not generating new cert.") - return - - Path("/run/ssl.conf").write_text( - f"""[ req ] -default_bits = 2048 -prompt = no -default_md = sha256 -req_extensions = req_ext -distinguished_name = dn -[ dn ] -C = GB -ST = Canonical -L = Canonical -O = Canonical -OU = Canonical -CN = 127.0.0.1 -[ req_ext ] -subjectAltName = @alt_names -[ alt_names ] -DNS.1 = {service_name} -DNS.2 = {service_name}.{namespace} -DNS.3 = {service_name}.{namespace}.svc -DNS.4 = {service_name}.{namespace}.svc.cluster -DNS.5 = {service_name}.{namespace}.svc.cluster.local -IP.1 = 127.0.0.1 -[ v3_ext ] -authorityKeyIdentifier=keyid,issuer:always -basicConstraints=CA:FALSE -keyUsage=keyEncipherment,dataEncipherment,digitalSignature -extendedKeyUsage=serverAuth,clientAuth -subjectAltName=@alt_names""" - ) - - check_call(["openssl", "genrsa", "-out", "/run/ca.key", "2048"]) - check_call(["openssl", "genrsa", "-out", "/run/server.key", "2048"]) - check_call( - [ - "openssl", - "req", - "-x509", - "-new", - "-sha256", - "-nodes", - "-days", - "3650", - "-key", - "/run/ca.key", - "-subj", - "/CN=127.0.0.1", - "-out", - "/run/ca.crt", - ] - ) - check_call( - [ - "openssl", - "req", - "-new", - "-sha256", - "-key", - "/run/server.key", - "-out", - "/run/server.csr", - "-config", - "/run/ssl.conf", - ] - ) - check_call( - [ - "openssl", - "x509", - "-req", - "-sha256", - "-in", - "/run/server.csr", - "-CA", - "/run/ca.crt", - "-CAkey", - "/run/ca.key", - "-CAcreateserial", - "-out", - "/run/cert.pem", - "-days", - "365", - "-extensions", - "v3_ext", - "-extfile", - "/run/ssl.conf", - ] - ) - - -@when("layer.docker-resource.oci-image.available") -@when_not("charm.started") -def start_charm(): - if not hookenv.is_leader(): - hookenv.log("This unit is not a leader.") - return False - - layer.status.maintenance("configuring container") - - image_info = layer.docker_resource.get_info("oci-image") - namespace = os.environ["JUJU_MODEL_NAME"] - config = dict(hookenv.config()) - - gen_certs(namespace, hookenv.service_name()) - ca_bundle = b64encode(Path("/run/cert.pem").read_bytes()).decode("utf-8") - - layer.caas_base.pod_spec_set( - { - "version": 3, - "serviceAccount": { - "roles": [ - { - "global": True, - "rules": [ - { - "apiGroups": [""], - "resources": [ - "configmaps", - "serviceaccounts", - "services", - "secrets", - "events", - "namespaces", - "persistentvolumes", - "persistentvolumeclaims", - ], - "verbs": ["*"], - }, - { - "apiGroups": [""], - "resources": [ - "pods", - "pods/log", - "pods/status", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["apps"], - "resources": ["deployments"], - "verbs": ["*"], - }, - { - "apiGroups": ["batch"], - "resources": ["jobs", "cronjobs"], - "verbs": ["*"], - }, - { - "apiGroups": ["apiextensions.k8s.io"], - "resources": ["customresourcedefinitions"], - "verbs": ["create", "get"], - }, - { - "apiGroups": ["admissionregistration.k8s.io"], - "resources": [ - "validatingwebhookconfigurations", - "mutatingwebhookconfigurations", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["kubeflow.org"], - "resources": [ - "experiments", - "experiments/status", - "experiments/finalizers", - "trials", - "trials/status", - "trials/finalizers", - "suggestions", - "suggestions/status", - "suggestions/finalizers", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["kubeflow.org"], - "resources": [ - "tfjobs", - "pytorchjobs", - "mpijobs", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["tekton.dev"], - "resources": [ - "pipelineruns", - "taskruns", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["rbac.authorization.k8s.io"], - "resources": [ - "roles", - "rolebindings", - ], - "verbs": ["*"], - }, - ] - } - ] - }, - "containers": [ - { - "name": "katib-controller", - "command": ["./katib-controller"], - "args": [ - "--webhook-port", - str(config["webhook-port"]), - "--trial-resources=Job.v1.batch", - "--trial-resources=TFJob.v1.kubeflow.org", - "--trial-resources=PyTorchJob.v1.kubeflow.org", - "--trial-resources=MPIJob.v1.kubeflow.org", - "--trial-resources=PipelineRun.v1beta1.tekton.dev", - ], - "imageDetails": { - "imagePath": image_info.registry_path, - "username": image_info.username, - "password": image_info.password, - }, - "ports": [ - {"name": "webhook", "containerPort": config["webhook-port"]}, - {"name": "metrics", "containerPort": config["metrics-port"]}, - ], - "envConfig": { - "KATIB_CORE_NAMESPACE": os.environ["JUJU_MODEL_NAME"] - }, - "volumeConfig": [ - { - "name": "cert", - "mountPath": "/tmp/cert", - "files": [ - { - "path": "cert.pem", - "content": Path("/run/cert.pem").read_text(), - }, - { - "path": "key.pem", - "content": Path("/run/server.key").read_text(), - }, - ], - } - ], - "kubernetes": {"securityContext": {"runAsUser": 0}}, - } - ], - }, - k8s_resources={ - "kubernetesResources": { - "customResourceDefinitions": [ - {"name": crd["metadata"]["name"], "spec": crd["spec"]} - for crd in yaml.safe_load_all(Path("files/crds.yaml").read_text()) - ], - "mutatingWebhookConfigurations": [ - { - "name": "katib-mutating-webhook-config", - "webhooks": [ - { - "name": "mutating.experiment.katib.kubeflow.org", - "rules": [ - { - "apiGroups": ["kubeflow.org"], - "apiVersions": ["v1beta1"], - "operations": ["CREATE", "UPDATE"], - "resources": ["experiments"], - "scope": "*", - } - ], - "failurePolicy": "Fail", - "clientConfig": { - "service": { - "name": hookenv.service_name(), - "namespace": namespace, - "path": "/mutate-experiments", - "port": config["webhook-port"], - }, - "caBundle": ca_bundle, - }, - }, - { - "name": "mutating.pod.katib.kubeflow.org", - "rules": [ - { - "apiGroups": [""], - "apiVersions": ["v1"], - "operations": ["CREATE"], - "resources": ["pods"], - "scope": "*", - } - ], - "failurePolicy": "Ignore", - "clientConfig": { - "service": { - "name": hookenv.service_name(), - "namespace": namespace, - "path": "/mutate-pods", - "port": config["webhook-port"], - }, - "caBundle": ca_bundle, - }, - }, - ], - } - ], - "validatingWebhookConfigurations": [ - { - "name": "katib-validating-webhook-config", - "webhooks": [ - { - "name": "validating.experiment.katib.kubeflow.org", - "rules": [ - { - "apiGroups": ["kubeflow.org"], - "apiVersions": ["v1beta1"], - "operations": ["CREATE", "UPDATE"], - "resources": ["experiments"], - "scope": "*", - } - ], - "failurePolicy": "Fail", - "sideEffects": "Unknown", - "clientConfig": { - "service": { - "name": hookenv.service_name(), - "namespace": namespace, - "path": "/validate-experiments", - "port": config["webhook-port"], - }, - "caBundle": ca_bundle, - }, - } - ], - } - ], - }, - "configMaps": { - "katib-config": { - f: Path(f"files/{f}.json").read_text() - for f in ( - "metrics-collector-sidecar", - "suggestion", - "early-stopping", - ) - }, - "trial-template": { - f + suffix: Path(f"files/{f}.yaml").read_text() - for f, suffix in ( - ("defaultTrialTemplate", ".yaml"), - ("enasCPUTemplate", ""), - ("pytorchJobTemplate", ""), - ) - }, - }, - }, - ) - - layer.status.maintenance("creating container") - set_flag("charm.started") diff --git a/operators/katib-controller/requirements.txt b/operators/katib-controller/requirements.txt new file mode 100644 index 00000000000..32d2dd68b76 --- /dev/null +++ b/operators/katib-controller/requirements.txt @@ -0,0 +1,2 @@ +ops==1.0.1 +git+git://github.com/juju-solutions/resource-oci-image.git#egg=oci_image diff --git a/operators/katib-controller/src/charm.py b/operators/katib-controller/src/charm.py new file mode 100755 index 00000000000..cb0944b200c --- /dev/null +++ b/operators/katib-controller/src/charm.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 + +import logging +from pathlib import Path +from subprocess import check_call + +import yaml +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus, MaintenanceStatus +from ops.framework import StoredState + +from oci_image import OCIImageResource, OCIImageResourceError + +logger = logging.getLogger(__name__) + + +class Operator(CharmBase): + """Deploys the katib-controller service.""" + + _stored = StoredState() + + def __init__(self, framework): + super().__init__(framework) + + if not self.model.unit.is_leader(): + logger.info("Not a leader, skipping any work") + self.model.unit.status = ActiveStatus() + return + + self._stored.set_default(**self.gen_certs()) + self.image = OCIImageResource(self, "oci-image") + self.framework.observe(self.on.install, self.set_pod_spec) + self.framework.observe(self.on.upgrade_charm, self.set_pod_spec) + + def set_pod_spec(self, event): + self.model.unit.status = MaintenanceStatus("Setting pod spec") + + try: + image_details = self.image.fetch() + except OCIImageResourceError as e: + self.model.unit.status = e.status + return + + validating, mutating = yaml.safe_load_all(Path("src/webhooks.yaml").read_text()) + + self.model.pod.set_spec( + { + "version": 3, + "serviceAccount": { + "roles": [ + { + "global": True, + "rules": [ + { + "apiGroups": [""], + "resources": [ + "configmaps", + "serviceaccounts", + "services", + "events", + "namespaces", + "persistentvolumes", + "persistentvolumeclaims", + ], + "verbs": ["*"], + }, + { + "apiGroups": [""], + "resources": ["pods", "pods/log", "pods/status"], + "verbs": ["*"], + }, + { + "apiGroups": ["apps"], + "resources": ["deployments"], + "verbs": ["*"], + }, + { + "apiGroups": ["batch"], + "resources": ["jobs", "cronjobs"], + "verbs": ["*"], + }, + { + "apiGroups": ["apiextensions.k8s.io"], + "resources": ["customresourcedefinitions"], + "verbs": ["create", "get"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "experiments", + "experiments/status", + "experiments/finalizers", + "trials", + "trials/status", + "trials/finalizers", + "suggestions", + "suggestions/status", + "suggestions/finalizers", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": ["tfjobs", "pytorchjobs", "mpijobs"], + "verbs": ["*"], + }, + { + "apiGroups": ["tekton.dev"], + "resources": ["pipelineruns", "taskruns"], + "verbs": ["*"], + }, + { + "apiGroups": ["rbac.authorization.k8s.io"], + "resources": ["roles", "rolebindings"], + "verbs": ["*"], + }, + ], + } + ], + }, + "containers": [ + { + "name": "katib-controller", + "imageDetails": image_details, + "command": ["./katib-controller"], + "args": [ + f"--webhook-port={self.model.config['webhook-port']}", + "--trial-resources=Job.v1.batch", + "--trial-resources=TFJob.v1.kubeflow.org", + "--trial-resources=PyTorchJob.v1.kubeflow.org", + "--trial-resources=MPIJob.v1.kubeflow.org", + "--trial-resources=PipelineRun.v1beta1.tekton.dev", + ], + "ports": [ + { + "name": "webhook", + "containerPort": self.model.config["webhook-port"], + }, + { + "name": "metrics", + "containerPort": self.model.config["metrics-port"], + }, + ], + "envConfig": { + "KATIB_CORE_NAMESPACE": self.model.name, + }, + "volumeConfig": [ + { + "name": "certs", + "mountPath": "/tmp/cert", + "files": [ + { + "path": "tls.crt", + "content": self._stored.cert, + }, + { + "path": "tls.key", + "content": self._stored.key, + }, + ], + } + ], + } + ], + }, + k8s_resources={ + "kubernetesResources": { + "customResourceDefinitions": [ + {"name": crd["metadata"]["name"], "spec": crd["spec"]} + for crd in yaml.safe_load_all(Path("src/crds.yaml").read_text()) + ], + "mutatingWebhookConfigurations": [ + { + "name": mutating["metadata"]["name"], + "webhooks": mutating["webhooks"], + } + ], + "validatingWebhookConfigurations": [ + { + "name": validating["metadata"]["name"], + "webhooks": validating["webhooks"], + } + ], + }, + "configMaps": { + "katib-config": { + f: Path(f"src/{f}.json").read_text() + for f in ( + "metrics-collector-sidecar", + "suggestion", + "early-stopping", + ) + }, + "trial-template": { + f + suffix: Path(f"src/{f}.yaml").read_text() + for f, suffix in ( + ("defaultTrialTemplate", ".yaml"), + ("enasCPUTemplate", ""), + ("pytorchJobTemplate", ""), + ) + }, + }, + }, + ) + + self.model.unit.status = ActiveStatus() + + def gen_certs(self): + model = self.model.name + app = self.model.app.name + Path("/run/ssl.conf").write_text( + f"""[ req ] +default_bits = 2048 +prompt = no +default_md = sha256 +req_extensions = req_ext +distinguished_name = dn +[ dn ] +C = GB +ST = Canonical +L = Canonical +O = Canonical +OU = Canonical +CN = 127.0.0.1 +[ req_ext ] +subjectAltName = @alt_names +[ alt_names ] +DNS.1 = {app} +DNS.2 = {app}.{model} +DNS.3 = {app}.{model}.svc +DNS.4 = {app}.{model}.svc.cluster +DNS.5 = {app}.{model}.svc.cluster.local +IP.1 = 127.0.0.1 +[ v3_ext ] +authorityKeyIdentifier=keyid,issuer:always +basicConstraints=CA:FALSE +keyUsage=keyEncipherment,dataEncipherment,digitalSignature +extendedKeyUsage=serverAuth,clientAuth +subjectAltName=@alt_names""" + ) + + check_call(["openssl", "genrsa", "-out", "/run/ca.key", "2048"]) + check_call(["openssl", "genrsa", "-out", "/run/server.key", "2048"]) + check_call( + [ + "openssl", + "req", + "-x509", + "-new", + "-sha256", + "-nodes", + "-days", + "3650", + "-key", + "/run/ca.key", + "-subj", + "/CN=127.0.0.1", + "-out", + "/run/ca.crt", + ] + ) + check_call( + [ + "openssl", + "req", + "-new", + "-sha256", + "-key", + "/run/server.key", + "-out", + "/run/server.csr", + "-config", + "/run/ssl.conf", + ] + ) + check_call( + [ + "openssl", + "x509", + "-req", + "-sha256", + "-in", + "/run/server.csr", + "-CA", + "/run/ca.crt", + "-CAkey", + "/run/ca.key", + "-CAcreateserial", + "-out", + "/run/cert.pem", + "-days", + "365", + "-extensions", + "v3_ext", + "-extfile", + "/run/ssl.conf", + ] + ) + + return { + "cert": Path("/run/cert.pem").read_text(), + "key": Path("/run/server.key").read_text(), + "ca": Path("/run/ca.crt").read_text(), + } + + +if __name__ == "__main__": + main(Operator) diff --git a/operators/katib-controller/files/crds.yaml b/operators/katib-controller/src/crds.yaml similarity index 100% rename from operators/katib-controller/files/crds.yaml rename to operators/katib-controller/src/crds.yaml diff --git a/operators/katib-controller/files/defaultTrialTemplate.yaml b/operators/katib-controller/src/defaultTrialTemplate.yaml similarity index 100% rename from operators/katib-controller/files/defaultTrialTemplate.yaml rename to operators/katib-controller/src/defaultTrialTemplate.yaml diff --git a/operators/katib-controller/files/early-stopping.json b/operators/katib-controller/src/early-stopping.json similarity index 100% rename from operators/katib-controller/files/early-stopping.json rename to operators/katib-controller/src/early-stopping.json diff --git a/operators/katib-controller/files/enasCPUTemplate.yaml b/operators/katib-controller/src/enasCPUTemplate.yaml similarity index 100% rename from operators/katib-controller/files/enasCPUTemplate.yaml rename to operators/katib-controller/src/enasCPUTemplate.yaml diff --git a/operators/katib-controller/files/metrics-collector-sidecar.json b/operators/katib-controller/src/metrics-collector-sidecar.json similarity index 100% rename from operators/katib-controller/files/metrics-collector-sidecar.json rename to operators/katib-controller/src/metrics-collector-sidecar.json diff --git a/operators/katib-controller/files/pytorchJobTemplate.yaml b/operators/katib-controller/src/pytorchJobTemplate.yaml similarity index 100% rename from operators/katib-controller/files/pytorchJobTemplate.yaml rename to operators/katib-controller/src/pytorchJobTemplate.yaml diff --git a/operators/katib-controller/files/suggestion.json b/operators/katib-controller/src/suggestion.json similarity index 100% rename from operators/katib-controller/files/suggestion.json rename to operators/katib-controller/src/suggestion.json diff --git a/operators/katib-controller/src/webhooks.yaml b/operators/katib-controller/src/webhooks.yaml new file mode 100644 index 00000000000..868aee1746d --- /dev/null +++ b/operators/katib-controller/src/webhooks.yaml @@ -0,0 +1,77 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: katib.kubeflow.org +webhooks: + - name: validator.experiment.katib.kubeflow.org + sideEffects: None + failurePolicy: Ignore + # TODO (andreyvelich): Migrate to v1 ? + admissionReviewVersions: + - v1beta1 + clientConfig: + caBundle: Cg== + service: + name: katib-controller + namespace: kubeflow + path: /validate-experiment + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - experiments +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: katib.kubeflow.org +webhooks: + - name: defaulter.experiment.katib.kubeflow.org + sideEffects: None + failurePolicy: Ignore + admissionReviewVersions: + - v1beta1 + clientConfig: + caBundle: Cg== + service: + name: katib-controller + namespace: kubeflow + path: /mutate-experiment + rules: + - apiGroups: + - kubeflow.org + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - experiments + - name: mutator.pod.katib.kubeflow.org + sideEffects: None + failurePolicy: Ignore + admissionReviewVersions: + - v1beta1 + clientConfig: + caBundle: Cg== + service: + name: katib-controller + namespace: kubeflow + path: /mutate-pod + namespaceSelector: + matchLabels: + katib-metricscollector-injection: enabled + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods diff --git a/operators/katib-db-manager/layer.yaml b/operators/katib-db-manager/layer.yaml deleted file mode 100644 index e1f47d59ed2..00000000000 --- a/operators/katib-db-manager/layer.yaml +++ /dev/null @@ -1,7 +0,0 @@ -repo: https://github.com/juju-solutions/bundle-kubeflow.git -includes: - - "layer:caas-base" - - "layer:status" - - "layer:docker-resource" - - "interface:mysql" - - "interface:http" diff --git a/operators/katib-db-manager/metadata.yaml b/operators/katib-db-manager/metadata.yaml index 767b462d1d6..a90fa0a987b 100755 --- a/operators/katib-db-manager/metadata.yaml +++ b/operators/katib-db-manager/metadata.yaml @@ -8,7 +8,9 @@ description: | hyperparameters of applications written in any language of the users’ choice and natively supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] -maintainers: [Kenneth Koski ] +maintainers: + - Dominik Fleischmann + - Kenneth Koski series: [kubernetes] resources: oci-image: diff --git a/operators/katib-db-manager/reactive/katib_db_manager.py b/operators/katib-db-manager/reactive/katib_db_manager.py deleted file mode 100644 index deff7e47ac8..00000000000 --- a/operators/katib-db-manager/reactive/katib_db_manager.py +++ /dev/null @@ -1,88 +0,0 @@ -from charmhelpers.core import hookenv -from charms import layer -from charms.reactive import ( - hook, - set_flag, - clear_flag, - when, - when_any, - when_not, - endpoint_from_name, -) - - -@hook("upgrade-charm") -def upgrade_charm(): - clear_flag("charm.started") - - -@when("charm.started") -def charm_ready(): - layer.status.active("") - - -@when_any("layer.docker-resource.oci-image.changed", "config.changed", "mysql.changed") -def update_image(): - clear_flag("charm.started") - - -@when("layer.docker-resource.oci-image.available", "mysql.available") -@when_not("charm.started") -def start_charm(): - if not hookenv.is_leader(): - hookenv.log("This unit is not a leader.") - return False - - layer.status.maintenance("configuring container") - - image_info = layer.docker_resource.get_info("oci-image") - - mysql = endpoint_from_name("mysql") - - port = hookenv.config("port") - - layer.caas_base.pod_spec_set( - { - "version": 3, - "containers": [ - { - "name": "katib-db-manager", - "command": ["./katib-db-manager"], - "imageDetails": { - "imagePath": image_info.registry_path, - "username": image_info.username, - "password": image_info.password, - }, - "ports": [{"name": "api", "containerPort": port}], - "envConfig": { - "DB_NAME": "mysql", - "DB_USER": "root", - "DB_PASSWORD": mysql.root_password(), - "KATIB_MYSQL_DB_HOST": mysql.host(), - "KATIB_MYSQL_DB_PORT": mysql.port(), - "KATIB_MYSQL_DB_DATABASE": "katib", - }, - "kubernetes": { - "readinessProbe": { - "exec": { - "command": ["/bin/grpc_health_probe", f"-addr=:{port}"] - }, - "initialDelaySeconds": 5, - }, - "livenessProbe": { - "exec": { - "command": ["/bin/grpc_health_probe", f"-addr=:{port}"] - }, - "initialDelaySeconds": 10, - "periodSeconds": 60, - "failureThreshold": 5, - }, - }, - } - ], - }, - ) - - layer.status.maintenance("creating container") - clear_flag("mysql.changed") - set_flag("charm.started") diff --git a/operators/katib-db-manager/requirements.txt b/operators/katib-db-manager/requirements.txt new file mode 100644 index 00000000000..32d2dd68b76 --- /dev/null +++ b/operators/katib-db-manager/requirements.txt @@ -0,0 +1,2 @@ +ops==1.0.1 +git+git://github.com/juju-solutions/resource-oci-image.git#egg=oci_image diff --git a/operators/katib-db-manager/src/charm.py b/operators/katib-db-manager/src/charm.py new file mode 100755 index 00000000000..1771d52fb7f --- /dev/null +++ b/operators/katib-db-manager/src/charm.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 + +import logging + +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus + +from oci_image import OCIImageResource, OCIImageResourceError + +logger = logging.getLogger(__name__) + + +class Operator(CharmBase): + """Deploys the katib-db-manager service.""" + + def __init__(self, framework): + super().__init__(framework) + + if not self.model.unit.is_leader(): + logger.info("Not a leader, skipping any work") + self.model.unit.status = ActiveStatus() + return + + self.image = OCIImageResource(self, "oci-image") + self.framework.observe(self.on.install, self.set_pod_spec) + self.framework.observe(self.on.config_changed, self.set_pod_spec) + self.framework.observe(self.on.upgrade_charm, self.set_pod_spec) + self.framework.observe(self.on["mysql"].relation_joined, self.set_pod_spec) + + def set_pod_spec(self, event): + self.model.unit.status = MaintenanceStatus("Setting pod spec") + + try: + image_details = self.image.fetch() + except OCIImageResourceError as e: + self.model.unit.status = e.status + return + + try: + relation = self.model.relations["mysql"][0] + except IndexError: + self.model.unit.status = WaitingStatus("Waiting for mysql relation") + return + + unit = next(iter(relation.units)) + data = relation.data[unit] + + self.model.pod.set_spec( + { + "version": 3, + "serviceAccount": { + "roles": [ + { + "global": True, + "rules": [ + { + "apiGroups": [""], + "resources": [ + "configmaps", + "namespaces", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "experiments", + "trials", + "suggestions", + ], + "verbs": ["*"], + }, + ], + } + ] + }, + "containers": [ + { + "name": "katib-db-manager", + "command": ["./katib-db-manager"], + "imageDetails": image_details, + "ports": [ + { + "name": "api", + "containerPort": self.model.config["port"], + } + ], + "envConfig": { + "DB_NAME": "mysql", + "DB_USER": "root", + "DB_PASSWORD": data["root_password"], + "KATIB_MYSQL_DB_HOST": data["host"], + "KATIB_MYSQL_DB_PORT": data["port"], + "KATIB_MYSQL_DB_DATABASE": data["database"], + }, + "kubernetes": { + "readinessProbe": { + "exec": { + "command": [ + "/bin/grpc_health_probe", + f"-addr=:{self.model.config['port']}", + ] + }, + "initialDelaySeconds": 5, + }, + "livenessProbe": { + "exec": { + "command": [ + "/bin/grpc_health_probe", + f"-addr=:{self.model.config['port']}", + ] + }, + "initialDelaySeconds": 10, + "periodSeconds": 60, + "failureThreshold": 5, + }, + }, + } + ], + }, + ) + + self.model.unit.status = ActiveStatus() + + +if __name__ == "__main__": + main(Operator) diff --git a/operators/katib-ui/layer.yaml b/operators/katib-ui/layer.yaml deleted file mode 100644 index 1095b71ff1e..00000000000 --- a/operators/katib-ui/layer.yaml +++ /dev/null @@ -1,6 +0,0 @@ -repo: https://github.com/juju-solutions/bundle-kubeflow.git -includes: - - "layer:caas-base" - - "layer:status" - - "layer:docker-resource" - - "interface:service-mesh" diff --git a/operators/katib-ui/metadata.yaml b/operators/katib-ui/metadata.yaml index 005a8f6a586..dea4b952961 100755 --- a/operators/katib-ui/metadata.yaml +++ b/operators/katib-ui/metadata.yaml @@ -8,7 +8,9 @@ description: | hyperparameters of applications written in any language of the users’ choice and natively supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] -maintainers: [Kenneth Koski ] +maintainers: + - Dominik Fleischmann + - Kenneth Koski series: [kubernetes] resources: oci-image: diff --git a/operators/katib-ui/reactive/katib_ui.py b/operators/katib-ui/reactive/katib_ui.py deleted file mode 100644 index f3a12e66a08..00000000000 --- a/operators/katib-ui/reactive/katib_ui.py +++ /dev/null @@ -1,95 +0,0 @@ -import os - -from charmhelpers.core import hookenv -from charms import layer -from charms.reactive import ( - clear_flag, - endpoint_from_name, - hook, - set_flag, - when, - when_not, -) - - -@hook("upgrade-charm") -def upgrade_charm(): - clear_flag("charm.started") - - -@when("charm.started") -def charm_ready(): - layer.status.active("") - - -@when("layer.docker-resource.oci-image.changed") -def update_image(): - clear_flag("charm.started") - - -@when("endpoint.service-mesh.joined") -def configure_mesh(): - endpoint_from_name("service-mesh").add_route( - prefix="/katib/", service=hookenv.service_name(), port=hookenv.config("port") - ) - - -@when("layer.docker-resource.oci-image.available") -@when_not("charm.started") -def start_charm(): - if not hookenv.is_leader(): - hookenv.log("This unit is not a leader.") - return False - - layer.status.maintenance("configuring container") - - image_info = layer.docker_resource.get_info("oci-image") - - port = hookenv.config("port") - - layer.caas_base.pod_spec_set( - { - "version": 2, - "serviceAccount": { - "global": True, - "rules": [ - { - "apiGroups": [""], - "resources": [ - "configmaps", - "namespaces", - ], - "verbs": ["*"], - }, - { - "apiGroups": ["kubeflow.org"], - "resources": [ - "experiments", - "trials", - "suggestions", - ], - "verbs": ["*"], - }, - ] - }, - "containers": [ - { - "name": "katib-ui", - "command": ["./katib-ui"], - "args": [f"--port={port}"], - "imageDetails": { - "imagePath": image_info.registry_path, - "username": image_info.username, - "password": image_info.password, - }, - "ports": [{"name": "http", "containerPort": port}], - "config": { - "KATIB_CORE_NAMESPACE": os.environ["JUJU_MODEL_NAME"], - }, - } - ], - } - ) - - layer.status.maintenance("creating container") - set_flag("charm.started") diff --git a/operators/katib-ui/requirements.txt b/operators/katib-ui/requirements.txt new file mode 100644 index 00000000000..32d2dd68b76 --- /dev/null +++ b/operators/katib-ui/requirements.txt @@ -0,0 +1,2 @@ +ops==1.0.1 +git+git://github.com/juju-solutions/resource-oci-image.git#egg=oci_image diff --git a/operators/katib-ui/src/charm.py b/operators/katib-ui/src/charm.py new file mode 100755 index 00000000000..67fd8edc173 --- /dev/null +++ b/operators/katib-ui/src/charm.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import logging + +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus, MaintenanceStatus + +from oci_image import OCIImageResource, OCIImageResourceError + +logger = logging.getLogger(__name__) + + +class Operator(CharmBase): + """Deploys the katib-ui service.""" + + def __init__(self, framework): + super().__init__(framework) + + if not self.model.unit.is_leader(): + logger.info("Not a leader, skipping any work") + self.model.unit.status = ActiveStatus() + return + + self.image = OCIImageResource(self, "oci-image") + self.framework.observe(self.on.install, self.set_pod_spec) + self.framework.observe(self.on.upgrade_charm, self.set_pod_spec) + + def set_pod_spec(self, event): + self.model.unit.status = MaintenanceStatus("Setting pod spec") + + try: + image_details = self.image.fetch() + except OCIImageResourceError as e: + self.model.unit.status = e.status + return + + self.model.pod.set_spec( + { + "version": 3, + "serviceAccount": { + "roles": [ + { + "global": True, + "rules": [ + { + "apiGroups": [""], + "resources": [ + "configmaps", + "namespaces", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "experiments", + "trials", + "suggestions", + ], + "verbs": ["*"], + }, + ], + } + ] + }, + "containers": [ + { + "name": "katib-ui", + "command": ["./katib-ui"], + "args": [f"--port={self.model.config['port']}"], + "imageDetails": image_details, + "ports": [ + { + "name": "http", + "containerPort": self.model.config["port"], + } + ], + "envConfig": { + "KATIB_CORE_NAMESPACE": self.model.name, + }, + } + ], + }, + ) + + self.model.unit.status = ActiveStatus() + + +if __name__ == "__main__": + main(Operator)