From ff29b0916f4ba6f378b4db380676caa09897c3fd Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Thu, 5 Dec 2024 18:14:00 +0000 Subject: [PATCH 01/18] CASMINST-6711 - Command Mismatch When Setting up SNMP on Dell and Mellanox switches (#5574) * CASMINST-6711 - Command Mismatch When Setting up SNMP on Dell and Mellanox switches * Apply suggestions from code review Co-authored-by: Russell Bunch Signed-off-by: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> --------- Signed-off-by: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Co-authored-by: Russell Bunch --- .../management_network/dell/snmp-community.md | 54 +++++++++++++++---- .../mellanox/snmpv3_users.md | 2 +- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/operations/network/management_network/dell/snmp-community.md b/operations/network/management_network/dell/snmp-community.md index a35d13531e24..dde4764217fe 100644 --- a/operations/network/management_network/dell/snmp-community.md +++ b/operations/network/management_network/dell/snmp-community.md @@ -1,25 +1,59 @@ -# Configure SNMPv2c Community +# Configure SNMPv2c community -The switch supports SNMPv2c community-based security for read-only access. +The switch supports SNMPv2c community-based security for read-only and read-write access. -## Configuration Commands +## Configuration commands -Configure an SNMPv2c community name: +### Configure the SNMP community + +1. Enter configuration mode. + + ```console + configure terminal + ``` + +1. Configure the SNMPv2c community name + + ```console + snmp-server community community-name access-mode + ``` + + Parameters: + + | Parameter | Description | + |------------------|----------------------------------------------------------------------------------------------| + | `community-name` | The user defined name for this community. | + | `access-mode` | The access level for this community. Can be `ro` for read-only or `rw` for read-write access | + +### Example + +The following command configures a read-only SNMP community called "public". ```text -snmp-server community community-name +snmp-server community public ro ``` -Show commands to validate functionality: +When successful this command returns no output. -```text +### Show configured SNMP community + +The following command displays information about any SNMP community that may have been configured. + +```console show snmp community ``` +Example output: + +```text +Community : public +Access : read-only +``` + ## Expected Results -1. Administrators can configure the community name -2. Administrators can bind the SNMP server to the default VRF -3. Administrators can connect from the workstation using the community name +1. Administrators can configure the community name. +2. Administrators can bind the SNMP server to the default VRF. +3. Administrators can connect from the workstation using the community name. 
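To confirm the third expected result from a remote workstation, the standard `net-snmp` client tools can be used. The following is a minimal check; the switch IP address is a placeholder, and the community name must match the one configured above.

```console
snmpwalk -v2c -c public <switch-ip> 1.3.6.1.2.1.1
```

If the community is configured correctly, the command walks the switch's `system` MIB subtree and returns entries such as the system description and uptime.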
[Back to Index](../README.md) diff --git a/operations/network/management_network/mellanox/snmpv3_users.md b/operations/network/management_network/mellanox/snmpv3_users.md index 99db7e4cc542..ce879b1f9581 100644 --- a/operations/network/management_network/mellanox/snmpv3_users.md +++ b/operations/network/management_network/mellanox/snmpv3_users.md @@ -17,7 +17,7 @@ switch(config)# snmp-server user testuser v3 require-privacy Show Commands to Validate Functionality ```console -show snmp users +show snmp user ``` [Back to Index](../README.md) From 036ce1d848f33c472a3b14cbf3f83b07b624c900 Mon Sep 17 00:00:00 2001 From: Bo Quan <36549272+bo-quan@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:37:35 -0500 Subject: [PATCH 02/18] CASMTRIAGE-7545: handle idempotent annotation addition (#5586) --- upgrade/scripts/upgrade/util/update-customizations.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/upgrade/scripts/upgrade/util/update-customizations.sh b/upgrade/scripts/upgrade/util/update-customizations.sh index 7c779fea779d..fc96a8963a58 100755 --- a/upgrade/scripts/upgrade/util/update-customizations.sh +++ b/upgrade/scripts/upgrade/util/update-customizations.sh @@ -98,7 +98,9 @@ yq4 eval '.spec.kubernetes.services.cray-vault.ingress.host = "vault.cmn.{{ netw # cray-istio yq w -i "$c" 'spec.kubernetes.services.cray-istio.services.istio-ingressgateway-hmn.serviceAnnotations.[external-dns.alpha.kubernetes.io/hostname]' 'api.hmnlb.{{ network.dns.external }},auth.hmnlb.{{ network.dns.external }},hmcollector.hmnlb.{{ network.dns.external }}' yq w -i "$c" 'spec.kubernetes.services.cray-istio.certificate.dnsNames[+]' 'istio-ingressgateway-cmn.istio-system.svc.cluster.local' -yq4 eval '.spec.kubernetes.services.cray-istio.services.istio-ingressgateway-cmn.serviceAnnotations."external-dns.alpha.kubernetes.io/hostname" += ",vault.cmn.{{ network.dns.external }}"' -i "$c" +if [[ -z "$(yq4 eval '.spec.kubernetes.services.cray-istio.services.istio-ingressgateway-cmn.serviceAnnotations."external-dns.alpha.kubernetes.io/hostname" | select(. == "*vault.cmn*")' $c)" ]]; then + yq4 eval '.spec.kubernetes.services.cray-istio.services.istio-ingressgateway-cmn.serviceAnnotations."external-dns.alpha.kubernetes.io/hostname" += ",vault.cmn.{{ network.dns.external }}"' -i "$c" +fi # cray-keycloak if [[ -n "$(yq r "$c" "spec.kubernetes.services.cray-keycloak.keycloak.keycloak")" ]]; then From f4eb37b02bdf5da060b83c71112bd9ac91bed37c Mon Sep 17 00:00:00 2001 From: Jenkins Date: Thu, 5 Dec 2024 21:29:38 +0000 Subject: [PATCH 03/18] Automated API docs swagger to md conversion (https://jenkins.algol60.net/job/Cray-HPE/job/csm/job/v1.6.1-alpha.5/1/) --- api/cfs.md | 296 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 249 insertions(+), 47 deletions(-) diff --git a/api/cfs.md b/api/cfs.md index cc19803e5518..191504b04bdf 100644 --- a/api/cfs.md +++ b/api/cfs.md @@ -614,8 +614,8 @@ Retrieve the list of configuration service options. "additionalInventoryUrl": "https://api-gw-service-nmn.local/vcs/cray/inventory.git", "batcherMaxBackoff": 3600, "batcherDisable": true, - "batcherPendingTimeout": 0, - "loggingLevel": "string" + "batcherPendingTimeout": 1, + "loggingLevel": "DEBUG" } ``` @@ -715,8 +715,8 @@ Update one or more of the configuration service options. 
"additionalInventoryUrl": "https://api-gw-service-nmn.local/vcs/cray/inventory.git", "batcherMaxBackoff": 3600, "batcherDisable": true, - "batcherPendingTimeout": 0, - "loggingLevel": "string" + "batcherPendingTimeout": 1, + "loggingLevel": "DEBUG" } ``` @@ -743,8 +743,8 @@ Update one or more of the configuration service options. "additionalInventoryUrl": "https://api-gw-service-nmn.local/vcs/cray/inventory.git", "batcherMaxBackoff": 3600, "batcherDisable": true, - "batcherPendingTimeout": 0, - "loggingLevel": "string" + "batcherPendingTimeout": 1, + "loggingLevel": "DEBUG" } ``` @@ -2933,8 +2933,8 @@ Update the state for a collection of components in the cfs database > Body parameter ```json -{ - "patch": { +[ + { "id": "string", "state": [ { @@ -2958,22 +2958,15 @@ Update the state for a collection of components in the cfs database "property1": "string", "property2": "string" } - }, - "filters": { - "ids": "string", - "status": "unconfigured", - "enabled": true, - "configName": "string", - "tags": "string" } -} +] ```

Parameters

|Name|In|Type|Required|Description| |---|---|---|---|---| -|body|body|any|true|The configuration/state for an array of components| +|body|body|[V2ComponentStateArray](#schemav2componentstatearray)|true|The configuration/state for an array of components| > Example responses @@ -3950,8 +3943,8 @@ Update the state for a collection of components in the cfs database > Body parameter ```json -{ - "patch": { +[ + { "id": "string", "state": [ { @@ -3977,22 +3970,15 @@ Update the state for a collection of components in the cfs database "property1": "string", "property2": "string" } - }, - "filters": { - "ids": "string", - "status": "unconfigured", - "enabled": true, - "config_name": "string", - "tags": "string" } -} +] ```

Parameters

|Name|In|Type|Required|Description| |---|---|---|---|---| -|body|body|any|true|The configuration/state for an array of components| +|body|body|[V3ComponentDataArray](#schemav3componentdataarray)|true|The configuration/state for an array of components| > Example responses @@ -6387,6 +6373,134 @@ To perform this operation, you must be authenticated by means of one of the foll bearerAuth +## restore_source_v3 + + + +> Code samples + +```http +POST https://api-gw-service-nmn.local/apis/cfs/v3/sources/{source_id} HTTP/1.1 +Host: api-gw-service-nmn.local +Content-Type: application/json +Accept: application/json + +``` + +```shell +# You can also use wget +curl -X POST https://api-gw-service-nmn.local/apis/cfs/v3/sources/{source_id} \ + -H 'Content-Type: application/json' \ + -H 'Accept: application/json' \ + -H 'Authorization: Bearer {access-token}' + +``` + +```python +import requests +headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'Authorization': 'Bearer {access-token}' +} + +r = requests.post('https://api-gw-service-nmn.local/apis/cfs/v3/sources/{source_id}', headers = headers) + +print(r.json()) + +``` + +```go +package main + +import ( + "bytes" + "net/http" +) + +func main() { + + headers := map[string][]string{ + "Content-Type": []string{"application/json"}, + "Accept": []string{"application/json"}, + "Authorization": []string{"Bearer {access-token}"}, + } + + data := bytes.NewBuffer([]byte{jsonReq}) + req, err := http.NewRequest("POST", "https://api-gw-service-nmn.local/apis/cfs/v3/sources/{source_id}", data) + req.Header = headers + + client := &http.Client{} + resp, err := client.Do(req) + // ... +} + +``` + +`POST /v3/sources/{source_id}` + +*Restore a source* + +Restore a CFS source by providing the name of the Vault secret that contains the credentials. This does NOT create the secret in Vault, nor does it validate that the secret exists. This is intended to be used to restore CFS data in the case that it is lost or corrupted. This will NOT recover a source that has been deleted using CFS, because when CFS deletes a source, it also deletes its corresponding Vault secret. NOTE: This action is not available prior to CFS 1.23 + +> Body parameter + +```json +{ + "description": "string", + "clone_url": "string", + "credentials": { + "authentication_method": "password", + "secret_name": "string" + }, + "ca_cert": { + "configmap_name": "string", + "configmap_namespace": "string" + } +} +``` + +

Parameters

+ +|Name|In|Type|Required|Description| +|---|---|---|---|---| +|body|body|[V3SourceRestoreData](#schemav3sourcerestoredata)|true|A source| +|source_id|path|string|true|Name of the target source| + +> Example responses + +> 201 Response + +```json +{ + "name": "sample-source", + "description": "string", + "last_updated": "2019-07-28T03:26:00Z", + "clone_url": "string", + "credentials": { + "authentication_method": "password", + "secret_name": "string" + }, + "ca_cert": { + "configmap_name": "string", + "configmap_namespace": "string" + } +} +``` + +

Responses

+ +|Status|Meaning|Description|Schema| +|---|---|---|---| +|201|[Created](https://tools.ietf.org/html/rfc7231#section-6.3.2)|A single source|[V3SourceData](#schemav3sourcedata)| +|400|[Bad Request](https://tools.ietf.org/html/rfc7231#section-6.5.1)|Bad Request|[ProblemDetails](#schemaproblemdetails)| +|409|[Conflict](https://tools.ietf.org/html/rfc7231#section-6.5.8)|A source with the same name already exists|[ProblemDetails](#schemaproblemdetails)| + + + ## delete_source_v3 @@ -6582,8 +6696,8 @@ Information for requesting the next page of data "additionalInventoryUrl": "https://api-gw-service-nmn.local/vcs/cray/inventory.git", "batcherMaxBackoff": 3600, "batcherDisable": true, - "batcherPendingTimeout": 0, - "loggingLevel": "string" + "batcherPendingTimeout": 1, + "loggingLevel": "DEBUG" } ``` @@ -6594,19 +6708,19 @@ Configuration options for the configuration service. |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| -|hardwareSyncInterval|integer|false|none|How frequently the CFS hardware-sync-agent checks with the Hardware State Manager to update its known hardware (in seconds)| -|batcherCheckInterval|integer|false|none|How frequently the batcher checks the configuration states to see if work needs to be done (in seconds)| -|batchSize|integer|false|none|The maximum number of nodes the batcher will run a single CFS session against.| -|batchWindow|integer|false|none|The maximum number of seconds the batcher will wait to run a CFS session once a node has been detected that needs configuration.| -|defaultBatcherRetryPolicy|integer|false|none|The default maximum number retries per node when configuration fails.| +|hardwareSyncInterval|[V3Options/properties/hardware_sync_interval](#schemav3options/properties/hardware_sync_interval)|false|none|How frequently the CFS hardware-sync-agent checks with the Hardware State Manager to update its known hardware (in seconds)| +|batcherCheckInterval|[V3Options/properties/batcher_check_interval](#schemav3options/properties/batcher_check_interval)|false|none|How frequently the batcher checks the configuration states to see if work needs to be done (in seconds)| +|batchSize|[V3Options/properties/batch_size](#schemav3options/properties/batch_size)|false|none|The maximum number of nodes the batcher will run a single CFS session against.| +|batchWindow|[V3Options/properties/batch_window](#schemav3options/properties/batch_window)|false|none|The maximum number of seconds the batcher will wait to run a CFS session once a node has been detected that needs configuration.| +|defaultBatcherRetryPolicy|[V3Options/properties/default_batcher_retry_policy](#schemav3options/properties/default_batcher_retry_policy)|false|none|The default maximum number retries per node when configuration fails.| |defaultPlaybook|string|false|none|The default playbook to be used if not specified in a node's desired state.| -|defaultAnsibleConfig|string|false|none|The Kubernetes ConfigMap which holds the default ansible.cfg for a given CFS session. This ConfigMap must be present in the same Kubernetes namespace as the CFS service.| -|sessionTTL|string|false|none|A time-to-live applied to all completed CFS sessions. Specified in minutes, hours, days, or weeks. e.g. 3d or 24h. Set to an empty string to disable.| -|additionalInventoryUrl|string|false|none|The git clone URL of a repo with additional inventory files. 
All files in the repo will be copied into the hosts directory of CFS.| -|batcherMaxBackoff|integer|false|none|The maximum number of seconds that batcher will backoff from session creation if problems are detected.| -|batcherDisable|boolean|false|none|Disables cfs-batcher's automatic session creation if set to True.| -|batcherPendingTimeout|integer|false|none|How long cfs-batcher will wait on a pending session before deleting and recreating it (in seconds).| -|loggingLevel|string|false|none|The logging level for core CFS services. This does not affect the Ansible logging level.| +|defaultAnsibleConfig|[V3Options/properties/default_ansible_config](#schemav3options/properties/default_ansible_config)|false|none|The Kubernetes ConfigMap which holds the default ansible.cfg for a given CFS session. This ConfigMap must be present in the same Kubernetes namespace as the CFS service.| +|sessionTTL|[V3Options/properties/session_ttl](#schemav3options/properties/session_ttl)|false|none|A time-to-live applied to all completed CFS sessions. Specified in minutes, hours, days, or weeks. e.g. 3d or 24h. Set to an empty string to disable.| +|additionalInventoryUrl|[V3Options/properties/additional_inventory_url](#schemav3options/properties/additional_inventory_url)|false|none|The git clone URL of a repo with additional inventory files. All files in the repo will be copied into the hosts directory of CFS. This is mutually exclusive with the additional_inventory_source option and only one can be set.| +|batcherMaxBackoff|[V3Options/properties/batcher_max_backoff](#schemav3options/properties/batcher_max_backoff)|false|none|The maximum number of seconds that batcher will backoff from session creation if problems are detected.| +|batcherDisable|[V3Options/properties/batcher_disable](#schemav3options/properties/batcher_disable)|false|none|Disables cfs-batcher's automatic session creation if set to True.| +|batcherPendingTimeout|[V3Options/properties/batcher_pending_timeout](#schemav3options/properties/batcher_pending_timeout)|false|none|How long cfs-batcher will wait on a pending session before deleting and recreating it (in seconds).| +|loggingLevel|[V3Options/properties/logging_level](#schemav3options/properties/logging_level)|false|none|The logging level for core CFS services. This does not affect the Ansible logging level.|
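These options are set through the `PATCH /v2/options` operation described earlier in this document. As a minimal sketch (assuming `TOKEN` holds a valid API access token), the batcher pending timeout and logging level could be updated as follows:

```shell
# Any subset of the option fields may be sent; values here are illustrative.
curl -X PATCH https://api-gw-service-nmn.local/apis/cfs/v2/options \
  -H "Authorization: Bearer ${TOKEN}" \
  -H "Content-Type: application/json" \
  -d '{"batcherPendingTimeout": 1, "loggingLevel": "DEBUG"}'
```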

V3Options

@@ -7477,13 +7591,15 @@ An inventory reference to include in a set of configurations. ``` An inventory reference to include in a set of configurations. +Either clone_url or source must be specified -- it is required to specify one, +but they are mutually exclusive. ### Properties |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| |name|string|false|none|The name of the inventory layer.| -|clone_url|string|true|none|The clone URL of the configuration content repository.| +|clone_url|string|false|none|The clone URL of the configuration content repository.| |source|string|false|none|A CFS source with directions to the configuration content repository| |commit|string|false|none|The commit hash of the configuration repository when the state is set.| |branch|string|false|none|The repository branch to use. This will automatically set `commit` to master on the branch
when the configuration is added.| @@ -7546,6 +7662,8 @@ A single desired configuration state for a component. ``` A single desired configuration state for a component. +Either clone_url or source must be specified -- it is required to specify one, +but they are mutually exclusive. ### Properties @@ -7652,7 +7770,7 @@ A collection of ConfigurationLayers. |description|string|false|none|A user-defined description. This field is not used by CFS.| |last_updated|string(date-time)|false|read-only|The date/time when the state was last updated in RFC 3339 format.| |layers|[[V3ConfigurationLayer](#schemav3configurationlayer)]|false|none|A list of ConfigurationLayer(s).| -|additional_inventory|[V3AdditionalInventoryLayer](#schemav3additionalinventorylayer)|false|none|An inventory reference to include in a set of configurations.| +|additional_inventory|[V3AdditionalInventoryLayer](#schemav3additionalinventorylayer)|false|none|An inventory reference to include in a set of configurations.
Either clone_url or source must be specified -- it is required to specify one, but they are mutually exclusive.|
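To make the layer-level mutual exclusivity concrete, the following sketch shows a layer that references a CFS source instead of a clone URL; all names are illustrative.

```json
{
  "name": "example-layer",
  "source": "sample-source",
  "branch": "main",
  "playbook": "site.yml"
}
```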

V2ConfigurationArray

@@ -7751,6 +7869,26 @@ A collection of configuration data. |configurations|[[V3ConfigurationData](#schemav3configurationdata)]|false|none|[A collection of ConfigurationLayers.]| |next|[V3NextData](#schemav3nextdata)|false|none|Information for requesting the next page of data| +

ComponentId

+ + + + + + +```json +"string" + +``` + +The component's id. e.g. xname for hardware components + +### Properties + +|Name|Type|Required|Restrictions|Description| +|---|---|---|---|---| +|*anonymous*|string|false|none|The component's id. e.g. xname for hardware components| +

V2ComponentsFilter

@@ -7953,7 +8091,7 @@ The configuration state and desired state for a component. |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| -|id|string|false|none|The component's id. e.g. xname for hardware components| +|id|[ComponentId](#schemacomponentid)|false|none|The component's id. e.g. xname for hardware components| |state|[[V2ConfigurationStateLayer](#schemav2configurationstatelayer)]|false|none|Information about the desired config and status of the layers| |stateAppend|object|false|write-only|A single state that will be appended to the list of current states.| |» cloneUrl|string|false|none|The clone URL of the configuration content repository.| @@ -8035,7 +8173,7 @@ The configuration state and desired state for a component. |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| -|id|string|false|none|The component's id. e.g. xname for hardware components| +|id|[ComponentId](#schemacomponentid)|false|none|The component's id. e.g. xname for hardware components| |state|[[V3ConfigurationStateLayer](#schemav3configurationstatelayer)]|false|none|Information about the desired config and status of the layers| |state_append|object|false|write-only|A single state that will be appended to the list of current states.| |» clone_url|string|false|none|The clone URL of the configuration content repository.| @@ -8273,7 +8411,7 @@ A collection of component ids. |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| -|component_ids|[string]|false|none|none| +|component_ids|[[ComponentId](#schemacomponentid)]|false|none|[The component's id. e.g. xname for hardware components]|

V2ComponentsUpdate

@@ -8583,6 +8721,36 @@ Information for retrieving the git credentials |---|---| |authentication_method|password| +

V3SourceRestoreCredentials

+ + + + + + +```json +{ + "authentication_method": "password", + "secret_name": "string" +} + +``` + +Information on a secret containing the username and password for accessing the git content + +### Properties + +|Name|Type|Required|Restrictions|Description| +|---|---|---|---|---| +|authentication_method|string|true|none|The git authentication method used.| +|secret_name|string|true|none|The name of the credentials vault secret.| + +#### Enumerated Values + +|Property|Value| +|---|---| +|authentication_method|password| +

V3SourceCreateData

@@ -8620,6 +8788,40 @@ Information for retrieving git content from a source. |credentials|[V3SourceCreateCredentials](#schemav3sourcecreatecredentials)|true|none|Information for retrieving the git credentials| |ca_cert|[V3SourceCert](#schemav3sourcecert)|false|none|Information on a configmap containing a CA certificate for authenticating to git| +
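As an illustration only, a source matching this schema might be created with a request like the following sketch. The endpoint, names, and credential fields are assumptions based on the schema above and should be checked against the source operations documented elsewhere in this file; `TOKEN` is assumed to hold a valid API access token.

```shell
# Illustrative sketch; CFS stores the supplied credentials in Vault.
curl -X POST https://api-gw-service-nmn.local/apis/cfs/v3/sources \
  -H "Authorization: Bearer ${TOKEN}" \
  -H "Content-Type: application/json" \
  -d '{
        "name": "sample-source",
        "clone_url": "https://vcs.cmn.example.com/vcs/cray/example-config.git",
        "credentials": {
          "authentication_method": "password",
          "username": "crayvcs",
          "password": "REDACTED"
        }
      }'
```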

V3SourceRestoreData

+ + + + + + +```json +{ + "description": "string", + "clone_url": "string", + "credentials": { + "authentication_method": "password", + "secret_name": "string" + }, + "ca_cert": { + "configmap_name": "string", + "configmap_namespace": "string" + } +} + +``` + +Information for retrieving git content from a source. + +### Properties + +|Name|Type|Required|Restrictions|Description| +|---|---|---|---|---| +|description|string|false|none|A user-defined description. This field is not used by CFS.| +|clone_url|string|true|none|The url to access the git content| +|credentials|[V3SourceRestoreCredentials](#schemav3sourcerestorecredentials)|true|none|Information on a secret containing the username and password for accessing the git content| +|ca_cert|[V3SourceCert](#schemav3sourcecert)|false|none|Information on a configmap containing a CA certificate for authenticating to git| +
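Because this schema is the request body for the `POST /v3/sources/{source_id}` restore operation shown earlier, a minimal sketch of a restore call (placeholder names; `TOKEN` assumed to hold a valid API access token) looks like:

```shell
# The named Vault secret must already exist; restore does not create or validate it.
curl -X POST https://api-gw-service-nmn.local/apis/cfs/v3/sources/sample-source \
  -H "Authorization: Bearer ${TOKEN}" \
  -H "Content-Type: application/json" \
  -d '{
        "clone_url": "https://vcs.cmn.example.com/vcs/cray/example-config.git",
        "credentials": {
          "authentication_method": "password",
          "secret_name": "sample-source-credentials"
        }
      }'
```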

V3SourceUpdateData

From 0a051c37b114d7b96ef3d1f793a8746c92678ea7 Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen Date: Thu, 21 Nov 2024 09:49:34 -0600 Subject: [PATCH 04/18] CASMINST-7041 add manual NCN upgrade documentation for use in case of emergencies --- troubleshooting/README.md | 4 +- ...grade_Management_Nodes_and_CSM_Services.md | 4 +- upgrade/manual_ncn_upgrade.md | 256 ++++++++++++++++++ 3 files changed, 259 insertions(+), 5 deletions(-) create mode 100644 upgrade/manual_ncn_upgrade.md diff --git a/troubleshooting/README.md b/troubleshooting/README.md index b3d3a9eac334..cbbff8f9d4a1 100644 --- a/troubleshooting/README.md +++ b/troubleshooting/README.md @@ -71,7 +71,6 @@ to the exiting problem seen into the existing search. (The example searches for * [IMS image delete loses the `arch` information](known_issues/ims_image_delete_loses_arch.md) * [Spire pods stuck in `PodInitializing`](known_issues/spire_pod_initializing.md) * [CFS Component With Zero-Length ID](known_issues/CFS_Component_With_Zero_Length_ID.md) -* [IMS Remote Node Image Build Failure](known_issues/ims_remote_node_image_build_failure.md) ## Booting @@ -139,7 +138,6 @@ to the exiting problem seen into the existing search. (The example searches for * [Restore Postgres](../operations/kubernetes/Restore_Postgres.md) * [Disaster Recovery for Postgres](../operations/kubernetes/Disaster_Recovery_Postgres.md) * [Postgres Database is in Recovery](known_issues/postgres_database_recovery.md) -* [Kubernetes Pods Failing to Mount PVCs](kubernetes/Kubernetes_Pods_Failing_to_Mount_PVCs.md) ## MetalLB @@ -151,7 +149,7 @@ to the exiting problem seen into the existing search. (The example searches for * [Issues with Redfish Endpoint `DiscoveryCheck` for Redfish Events from Nodes](../operations/node_management/Troubleshoot_Issues_with_Redfish_Endpoint_Discovery.md) * [Interfaces with IP Address Issues](../operations/node_management/Troubleshoot_Interfaces_with_IP_Address_Issues.md) * [Loss of Console Connections and Logs on Gigabyte Nodes](../operations/node_management/Troubleshoot_Loss_of_Console_Connections_and_Logs_on_Gigabyte_Nodes.md) -* [Image projection inconsistent across nodes](image_projection_inconsistent_across_nodes.md) +* [Need to manually upgrade an NCN](../upgrade/manual_ncn_upgrade.md) ## Security and authentication diff --git a/upgrade/Upgrade_Management_Nodes_and_CSM_Services.md b/upgrade/Upgrade_Management_Nodes_and_CSM_Services.md index 70474d2698c2..7df5ae089d1d 100644 --- a/upgrade/Upgrade_Management_Nodes_and_CSM_Services.md +++ b/upgrade/Upgrade_Management_Nodes_and_CSM_Services.md @@ -32,10 +32,10 @@ For additional reference material on the upgrade processes and scripts mentioned The upgrade to CSM 1.6 is done through IUF. Follow one of the following two procedures: -1. [Upgrade only CSM](./Upgrade_Only_CSM_with_iuf.md) - 1. [Upgrade CSM and additional products with IUF](../operations/iuf/workflows/upgrade_csm_and_additional_products_with_iuf.md) +1. [Upgrade only CSM](./Upgrade_Only_CSM_with_iuf.md) + **Important:** Take note of the below content for troubleshooting purposes, in the event that issues are encountered during the upgrade process. 
## Relevant troubleshooting links for upgrade-related issues diff --git a/upgrade/manual_ncn_upgrade.md b/upgrade/manual_ncn_upgrade.md new file mode 100644 index 000000000000..7da6fda38a40 --- /dev/null +++ b/upgrade/manual_ncn_upgrade.md @@ -0,0 +1,256 @@ +# Manual NCN upgrade + +**This page should NOT be used for a normal CSM upgrade.** + +This page provides instructions for doing a manual upgrade of NCN nodes. +There is a section for upgrading worker nodes, storage nodes, and master nodes. +All NCN upgrades in CSM 1.6 and later are done through IUF. +This page provides instructions in case a manual NCN upgrade is needed for an unusual situation. +As we have removed all of our manual CSM upgrade documentation, it is important that we have documentation +describing this process in case of an emergency. + +If you are performing a regular CSM upgrade, this should be done through IUF. +Follow [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade. + +## Manually Upgrade NCNs + +- [Storage node manual upgrade](#storage-node-manual-upgrade) +- [Worker node manual upgrade](#worker-node-manual-upgrade) +- [Master node manual upgrade](#master-node-manual-upgrade) + +### Storage node manual upgrade + +**In CSM 1.6 and later, the storage node upgrades should be executed by IUF. +See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** + +Storage node upgrades are done using an IUF Argo workflow. See [using the Argo UI](../operations/argo/Using_the_Argo_UI.md) to access the UI and [using Argo workflows](../operations/argo/Using_Argo_Workflows.md) for more information about Argo workflows. + +1. (`ncn-m001#`) Set the storage node name for the node that is being upgraded. + + ```bash + storage_node=ncn-s00x + ``` + +1. (`ncn-m001#`) Execute the storage node upgrade. + + > **NOTE:** If `--image-id` and/or `--desired-cfs-conf` is not supplied, then the storage node will be upgraded to the image that is already set in BSS and the CFS configuration already set in CFS. + > Additionally, the `--image-id` and `--desired-cfs-conf` can be set manually in BSS and in CFS respectively. + > See [set the image ID and CFS configuration manually](#set-the-image-id-and-cfs-configuration-manually) for the manual process. + > If the manual process is used, then omit `--image-id` and `--desired-cfs-conf` from the command below. + + ```bash + /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-worker-storage-nodes.sh $storage_node --upgrade --image-id $image --desired-cfs-conf $configuration + ``` + +### Worker node manual upgrade + +**In CSM 1.6 and later, the worker node upgrades should be executed by IUF. +See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** + +Worker node upgrades are done using an IUF Argo workflow. See [using the Argo UI](../operations/argo/Using_the_Argo_UI.md) to access the UI and [using Argo workflows](../operations/argo/Using_Argo_Workflows.md) for more information about Argo workflows. + +1. (`ncn-m001#`) Set a worker node name for the node that is being upgraded. + + ```bash + worker_node=ncn-w00x + ``` + +1. (`ncn-m001#`) Execute a worker node upgrade. 
+ + > **NOTE:** If `--image-id` and/or `--desired-cfs-conf` is not supplied, then the worker node will be upgraded to the image that is already set in BSS and the CFS configuration already set in CFS. + > Additionally, the `--image-id` and `--desired-cfs-conf` can be set manually in BSS and in CFS respectively. + > See [set the image ID and CFS configuration manually](#set-the-image-id-and-cfs-configuration-manually) for the manual process. + > If the manual process is used, then omit `--image-id` and `--desired-cfs-conf` from the command below. + + ```bash + /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-worker-storage-nodes.sh $worker_node --image-id $image --desired-cfs-conf $configuration + ``` + +### Master node manual upgrade + +**In CSM 1.6 and later, the master node upgrades should be executed by IUF. +See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** + +A master node upgrade is not executed by Argo workflows, instead the master node upgrade is a bash script. +IUF can use Argo workflows to execute a master node upgrade but it does this by executing the master node upgrade script. +The script keeps tracks of steps that have been completed and prints them to a state file in the `/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/` directory on the node where the script is executed. +If the master node upgrade fails partway through, it is safe to re-execute the upgrade script because the state is being tracked and steps will not be re-executed if they have already run successfully. + +There are two different processes for upgrading master nodes depending on if `ncn-m001` is being upgraded or if `ncn-m002` or `ncn-m003` is being upgraded. + +Follow one of two procedures below. + +- [Manually upgrade `ncn-m002` or `ncn-m003`](#manually-upgrade-ncn-m002-or-ncn-m003) +- [Manually upgrade `ncn-m001`](#manually-upgrade-ncn-m001) + +#### Manually upgrade `ncn-m002` or `ncn-m003` + +> **`NOTE`** If Kubernetes encryption has been enabled via the [Kubernetes Encryption Documentation](../operations/kubernetes/encryption/README.md), +then backup the `/etc/cray/kubernetes/encryption` directory on the master node before upgrading and restore the directory after the node has been upgraded. + +1. (`ncn-m001#`) Start a typescript. + + ```bash + script -af /root/csm_upgrade.$(date +%Y%m%d_%H%M%S).upgrade-m0023.txt + export PS1='\u@\H \D{%Y-%m-%d} \t \w # ' + ``` + +1. [Set the image ID and CFS configuration manually.](#set-the-image-id-and-cfs-configuration-manually) + +1. (`ncn-m001#`) Set upgrade variables. + + ```bash + source /etc/cray/upgrade/csm/myenv + ``` + +1. (`ncn-m001#`) Set the master node name for the node that is being upgraded (`ncn-m002` or `ncn-m003`). + + ```bash + master_node=ncn-m00x + ``` + +1. (`ncn-m001#`) Run `ncn-upgrade-master-nodes.sh` for `ncn-m002` or `ncn-m003`. + + ```bash + /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh $master_node + ``` + + > **`NOTE`** The `root` user password for the node may need to be reset after it is rebooted. Additionally, the `/etc/cray/kubernetes/encryption` directory should be restored if it was backed up. + Once it is restored, the `kube-apiserver` on the rebuilt node should be restarted. + See [Kubernetes `kube-apiserver` Failing](../troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md) for details on how to restart the `kube-apiserver`. 
+ +#### Manually upgrade `ncn-m001` + +To manually upgrade `ncn-m001`, the CFS configuration and node image need to be set for `ncn-m001`, the artifacts on `ncn-m001` need to be backed up, `ncn-m002` needs to be prepared to execute the upgrade, and the `ncn-m001` upgrade needs to be executed. +Follow the steps below to upgrade `ncn-m001`. + +1. [Set the image ID and CFS configuration manually.](#set-the-image-id-and-cfs-configuration-manually) + +1. (`ncn-m001#`) Create an archive of the artifacts. + + ```bash + BACKUP_TARFILE="csm_upgrade.pre_m001_reboot_artifacts.$(date +%Y%m%d_%H%M%S).tgz" + ls -d \ + /root/apply_csm_configuration.* \ + /root/csm_upgrade.* \ + /root/output.log 2>/dev/null | + sed 's_^/__' | + xargs tar -C / -czvf "/root/${BACKUP_TARFILE}" + ``` + +1. (`ncn-m001#`) Upload the archive to S3 in the cluster. + + ```bash + cray artifacts create config-data "${BACKUP_TARFILE}" "/root/${BACKUP_TARFILE}" + ``` + +1. Log out of `ncn-m001`. + +1. Log in to `ncn-m002` from outside the cluster. + + > **`NOTE`** Very rarely, a password hash for the `root` user that works properly on a SLES SP2 NCN is + > not recognized on a SLES SP3 NCN. If password login fails, then log in to `ncn-m002` from + > `ncn-m001` and use the `passwd` command to reset the password. Then log in using the CMN IP address as directed + > below. Once `ncn-m001` has been upgraded, log in from `ncn-m002` and use the `passwd` command to reset + > the password. The other NCNs will have their passwords updated when NCN personalization is run in a + > subsequent step. + + `ssh` to the `bond0.cmn0`/CMN IP address of `ncn-m002`. + +1. (`ncn-m002#`) Start a typescript. + + ```bash + script -af /root/csm_upgrade.$(date +%Y%m%d_%H%M%S).upgrade-m001.txt + export PS1='\u@\H \D{%Y-%m-%d} \t \w # ' + ``` + +1. Authenticate with the Cray CLI on `ncn-m002`. + + See [Configure the Cray Command Line Interface](../operations/configure_cray_cli.md) for details on how to do this. + +1. (`ncn-m002#`) Set upgrade variables. + + ```bash + source /etc/cray/upgrade/csm/myenv + ``` + +1. (`ncn-m002#`) Copy artifacts from `ncn-m001`. + + > A later stage of the upgrade expects the `docs-csm` and `libcsm` RPMs to be located at `/root/` on `ncn-m002`; + > that is why this command copies them there. + + - Install `csi` and `docs-csm`. + + ```bash + scp ncn-m001:/root/csm_upgrade.pre_m001_reboot_artifacts.*.tgz /root + zypper --plus-repo="/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/tarball/csm-${CSM_RELEASE}/rpm/cray/csm/sle-$(awk -F= '/VERSION=/{gsub(/["-]/, "") ; print tolower($NF)}' /etc/os-release)" --no-gpg-checks install -y cray-site-init + scp ncn-m001:/root/*.noarch.rpm /root/ + rpm -Uvh --force /root/docs-csm-latest.noarch.rpm + ``` + + - Install `libcsm`. + + > ***NOTE*** Since `libcsm` depends on versions of Python relative to what is included in the SLES service packs, + > then in the event that `ncn-m002` is running a newer SLES distro a new `libcsm` must be downloaded. This will + > often be the case when jumping to a new CSM minor version (e.g. CSM 1.3 to CSM 1.4). + > e.g. if `ncn-m001` is running SLES15SP3, and `ncn-m002` is running SLES15SP4 then the SLES15SP4 `libcsm` is needed. + > Follow the [Check for latest documentation](../update_product_stream/README.md#check-for-latest-documentation) + > guide again, but from `ncn-m002`. + + ```bash + rpm -Uvh --force /root/libcsm-latest.noarch.rpm + ``` + +1. (`ncn-m002#`) Upgrade `ncn-m001`. 
+ + > **`NOTE`** If Kubernetes encryption has been enabled via the [Kubernetes Encryption Documentation](../operations/kubernetes/encryption/README.md), + then backup the `/etc/cray/kubernetes/encryption` directory on the master node before upgrading and restore the directory after the node has been upgraded. + + ```bash + /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh ncn-m001 + ``` + + > **`NOTE`** The `root` user password for the node may need to be reset after it is rebooted. + Additionally, the `/etc/cray/kubernetes/encryption` directory should be restored if it was backed up. + Once it is restored, the `kube-apiserver` on the rebuilt node should be restarted. + See [Kubernetes `kube-apiserver` Failing](../troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md) for details on how to restart the `kube-apiserver`. + +### Set the image ID and CFS configuration manually + +(`ncn-m001#`) Set `XNAME` to the xname of the node that is being upgraded. + +```bash +XNAME= +``` + +#### Set the image ID in BSS + +1. (`ncn-m001#`) Set `IMS_IMAGE_ID` to the image ID that should be upgraded to. + + ```bash + IMS_IMAGE_ID= + ``` + +1. (`ncn-m001#`) Set the image ID in BSS. + + ```bash + /usr/share/doc/csm/scripts/operations/node_management/assign-ncn-images.sh -p $IMS_IMAGE_ID $XNAME + ``` + +#### Set the CFS configuration in CFS + +The following steps will update the node's desired configuration but will leave it disabled. +It will automatically enable and be applied after the node is upgraded. + +1. (`ncn-m001#`) Set `CFS_CONFIG_NAME` to the configuration that should be used once the node has been upgraded. + + ```bash + CFS_CONFIG_NAME= + ``` + +1. (`ncn-m001#`) Set CFS configuration. + + ```bash + /usr/share/doc/csm/scripts/operations/configuration/apply_csm_configuration.sh \ + --no-config-change --no-enable --config-name $CFS_CONFIG_NAME --xnames $XNAME + ``` From a1cdf74fc54de958f09ef1246517e11558a3abc6 Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:26:05 -0600 Subject: [PATCH 05/18] CASMINST-7041 Update troubleshooting/README.md with correct file name for Manual NCN upgrade Co-authored-by: Russell Bunch Signed-off-by: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> --- troubleshooting/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/troubleshooting/README.md b/troubleshooting/README.md index cbbff8f9d4a1..4d0f04e921fd 100644 --- a/troubleshooting/README.md +++ b/troubleshooting/README.md @@ -149,7 +149,7 @@ to the exiting problem seen into the existing search. 
(The example searches for * [Issues with Redfish Endpoint `DiscoveryCheck` for Redfish Events from Nodes](../operations/node_management/Troubleshoot_Issues_with_Redfish_Endpoint_Discovery.md) * [Interfaces with IP Address Issues](../operations/node_management/Troubleshoot_Interfaces_with_IP_Address_Issues.md) * [Loss of Console Connections and Logs on Gigabyte Nodes](../operations/node_management/Troubleshoot_Loss_of_Console_Connections_and_Logs_on_Gigabyte_Nodes.md) -* [Need to manually upgrade an NCN](../upgrade/manual_ncn_upgrade.md) +* [Manual NCN Upgrade](../upgrade/manual_ncn_upgrade.md) ## Security and authentication From af223fa1b5be6c52af741392cc9228fffb9b2cbf Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:54:07 -0600 Subject: [PATCH 06/18] CASMINST-7041 Apply formatting improvement suggestions from code review Co-authored-by: Mitch Harding Signed-off-by: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> --- upgrade/manual_ncn_upgrade.md | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/upgrade/manual_ncn_upgrade.md b/upgrade/manual_ncn_upgrade.md index 7da6fda38a40..b8f3cfc848ee 100644 --- a/upgrade/manual_ncn_upgrade.md +++ b/upgrade/manual_ncn_upgrade.md @@ -12,13 +12,11 @@ describing this process in case of an emergency. If you are performing a regular CSM upgrade, this should be done through IUF. Follow [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade. -## Manually Upgrade NCNs - - [Storage node manual upgrade](#storage-node-manual-upgrade) - [Worker node manual upgrade](#worker-node-manual-upgrade) - [Master node manual upgrade](#master-node-manual-upgrade) -### Storage node manual upgrade +## Storage node manual upgrade **In CSM 1.6 and later, the storage node upgrades should be executed by IUF. See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** @@ -42,7 +40,7 @@ Storage node upgrades are done using an IUF Argo workflow. See [using the Argo U /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-worker-storage-nodes.sh $storage_node --upgrade --image-id $image --desired-cfs-conf $configuration ``` -### Worker node manual upgrade +## Worker node manual upgrade **In CSM 1.6 and later, the worker node upgrades should be executed by IUF. See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** @@ -66,14 +64,14 @@ Worker node upgrades are done using an IUF Argo workflow. See [using the Argo UI /usr/share/doc/csm/upgrade/scripts/upgrade/ncn-upgrade-worker-storage-nodes.sh $worker_node --image-id $image --desired-cfs-conf $configuration ``` -### Master node manual upgrade +## Master node manual upgrade **In CSM 1.6 and later, the master node upgrades should be executed by IUF. See [upgrade management nodes and CSM services](../upgrade/Upgrade_Management_Nodes_and_CSM_Services.md) to perform a normal CSM upgrade.** A master node upgrade is not executed by Argo workflows, instead the master node upgrade is a bash script. IUF can use Argo workflows to execute a master node upgrade but it does this by executing the master node upgrade script. 
-The script keeps tracks of steps that have been completed and prints them to a state file in the `/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/` directory on the node where the script is executed. +The script keeps track of steps that have been completed and prints them to a state file in the `/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/` directory on the node where the script is executed. If the master node upgrade fails partway through, it is safe to re-execute the upgrade script because the state is being tracked and steps will not be re-executed if they have already run successfully. There are two different processes for upgrading master nodes depending on if `ncn-m001` is being upgraded or if `ncn-m002` or `ncn-m003` is being upgraded. @@ -83,7 +81,7 @@ Follow one of two procedures below. - [Manually upgrade `ncn-m002` or `ncn-m003`](#manually-upgrade-ncn-m002-or-ncn-m003) - [Manually upgrade `ncn-m001`](#manually-upgrade-ncn-m001) -#### Manually upgrade `ncn-m002` or `ncn-m003` +### Manually upgrade `ncn-m002` or `ncn-m003` > **`NOTE`** If Kubernetes encryption has been enabled via the [Kubernetes Encryption Documentation](../operations/kubernetes/encryption/README.md), then backup the `/etc/cray/kubernetes/encryption` directory on the master node before upgrading and restore the directory after the node has been upgraded. @@ -119,7 +117,7 @@ then backup the `/etc/cray/kubernetes/encryption` directory on the master node b Once it is restored, the `kube-apiserver` on the rebuilt node should be restarted. See [Kubernetes `kube-apiserver` Failing](../troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md) for details on how to restart the `kube-apiserver`. -#### Manually upgrade `ncn-m001` +### Manually upgrade `ncn-m001` To manually upgrade `ncn-m001`, the CFS configuration and node image need to be set for `ncn-m001`, the artifacts on `ncn-m001` need to be backed up, `ncn-m002` needs to be prepared to execute the upgrade, and the `ncn-m001` upgrade needs to be executed. Follow the steps below to upgrade `ncn-m001`. @@ -215,7 +213,7 @@ Follow the steps below to upgrade `ncn-m001`. Once it is restored, the `kube-apiserver` on the rebuilt node should be restarted. See [Kubernetes `kube-apiserver` Failing](../troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md) for details on how to restart the `kube-apiserver`. -### Set the image ID and CFS configuration manually +## Set the image ID and CFS configuration manually (`ncn-m001#`) Set `XNAME` to the xname of the node that is being upgraded. @@ -223,7 +221,7 @@ Follow the steps below to upgrade `ncn-m001`. XNAME= ``` -#### Set the image ID in BSS +### Set the image ID in BSS 1. (`ncn-m001#`) Set `IMS_IMAGE_ID` to the image ID that should be upgraded to. @@ -237,7 +235,7 @@ XNAME= /usr/share/doc/csm/scripts/operations/node_management/assign-ncn-images.sh -p $IMS_IMAGE_ID $XNAME ``` -#### Set the CFS configuration in CFS +### Set the CFS configuration in CFS The following steps will update the node's desired configuration but will leave it disabled. It will automatically enable and be applied after the node is upgraded. 
From d9ea6b41240032f1f248caa5b626412799d74af3 Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:03:40 -0600 Subject: [PATCH 07/18] CASMINST-7088-release-1.6 small docs adjustment to IUF deliver product when installing USS 1.1 or higher (#5589) * CASMINST-7088-release-1.6 small docs adjustment to IUF deliver product when installing USS 1.1 or higher * syntax highlighting --------- Co-authored-by: Russell Bunch --- operations/iuf/workflows/admin_directory.md | 4 ++-- operations/iuf/workflows/product_delivery.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/operations/iuf/workflows/admin_directory.md b/operations/iuf/workflows/admin_directory.md index 8ad8f317c7f5..3f210bf494ff 100644 --- a/operations/iuf/workflows/admin_directory.md +++ b/operations/iuf/workflows/admin_directory.md @@ -98,7 +98,7 @@ the HPC CSM Software Recipe with the existing content in `${ADMIN_DIR}`. Example output: - ```text + ```yaml default: network_type: "cassini" suffix: "-test01" @@ -106,7 +106,7 @@ the HPC CSM Software Recipe with the existing content in `${ADMIN_DIR}`. site_domain: "my-site-domain.net" uss: deploy_slurm: true - deploy_pbs: true + deploy_pbs: false ``` 3. Ensure the expected files are present in the admin directory after performing the steps in this section. diff --git a/operations/iuf/workflows/product_delivery.md b/operations/iuf/workflows/product_delivery.md index 70b9001ee686..795d73a0bfb4 100644 --- a/operations/iuf/workflows/product_delivery.md +++ b/operations/iuf/workflows/product_delivery.md @@ -67,8 +67,8 @@ Refer to that table and any corresponding product documents before continuing to Additional arguments are available to control the behavior of the `deliver-product` stage (for example, `-rv`). See the [`deliver-product` stage documentation](../stages/deliver_product.md) for details and adjust the example below if necessary. - **`NOTE`** When installing USS 1.1 or higher, select either SLURM or PBS Pro Products to use on the system before running this stage. For more information, see the `deliver-product` stage - details in the "Install and Upgrade Framework" section of the _HPE Cray Supercomputing User Services Software Administration Guide: CSM on HPE Cray Supercomputing EX Systems (S-8063)_. + **`NOTE`** When installing USS 1.1 or higher, select either Slurm or PBS Pro Products to use on the system before running this stage. This should be specified in `site_vars.yaml`. + For more information, see the `deliver-product` stage details in the "Install and Upgrade Framework" section of the _HPE Cray Supercomputing User Services Software Administration Guide: CSM on HPE Cray Supercomputing EX Systems (S-8063)_. (`ncn-m001#`) Execute the `deliver-product` stage. Use site variables from the `site_vars.yaml` file found in `${ADMIN_DIR}` and recipe variables from the `product_vars.yaml` file found in `${ADMIN_DIR}`. 
From 98f490a0132797190b44b442238f5b8e91e58ff2 Mon Sep 17 00:00:00 2001 From: Nathan Rockershousen Date: Fri, 22 Nov 2024 08:07:32 -0600 Subject: [PATCH 08/18] TECHPUBS-4619: HPE Slingshot Network Operator docs --- operations/README.md | 1 + .../hpe_slingshot_network_operator.md | 14 ++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 operations/multi-tenancy/hpe_slingshot_network_operator.md diff --git a/operations/README.md b/operations/README.md index c40711c24bee..d451ab9dae64 100644 --- a/operations/README.md +++ b/operations/README.md @@ -835,6 +835,7 @@ these backups. - [Modifying a Tenant](multi-tenancy/Modify_a_Tenant.md) - [Removing a Tenant](multi-tenancy/Remove_a_Tenant.md) - [Slurm Operator](multi-tenancy/SlurmOperator.md) +- [HPE Slingshot Network Operator](multi-tenancy/hpe_slingshot_network_operator.md) - [Tenant and Partition Management System (TAPMS) Overview](multi-tenancy/Tapms.md) - [TAPMS Tenant Status API](../api/tapms-operator.md) - [Global Tenant Hooks](multi-tenancy/GlobalTenantHooks.md) diff --git a/operations/multi-tenancy/hpe_slingshot_network_operator.md b/operations/multi-tenancy/hpe_slingshot_network_operator.md new file mode 100644 index 000000000000..4b1564aebe4f --- /dev/null +++ b/operations/multi-tenancy/hpe_slingshot_network_operator.md @@ -0,0 +1,14 @@ +# HPE Slingshot Network Operator + +Starting in the HPE Slingshot 2.3.0 release, the HPE Slingshot Network Operator is installed as part of the Fabric Manager install. +It is a Kubernetes operator that is designed to support multi-tenancy in CSM 1.6 and later releases. + +For more information on the HPE Slingshot Network Operator, see the "HPE Slingshot Network Operator for CSM Multi-Tenancy" section in the _HPE Slingshot Administration Guide_. + +The HPE Slingshot documentation outlines several critical tasks, including: + +- Enabling the HPE Slingshot Network Operator +- Creating HPE Slingshot tenants +- Modifying HPE Slingshot tenants +- Updating VNI and tenant node component names (xnames) +- Removing HPE Slingshot tenants From 96e0d2a88208254815a18623be16f87997201b2c Mon Sep 17 00:00:00 2001 From: Nathan Rockershousen Date: Mon, 2 Dec 2024 13:00:43 -0600 Subject: [PATCH 09/18] TECHPUBS-4619: added link to HPESC --- operations/multi-tenancy/hpe_slingshot_network_operator.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/multi-tenancy/hpe_slingshot_network_operator.md b/operations/multi-tenancy/hpe_slingshot_network_operator.md index 4b1564aebe4f..ca0add7cf4e5 100644 --- a/operations/multi-tenancy/hpe_slingshot_network_operator.md +++ b/operations/multi-tenancy/hpe_slingshot_network_operator.md @@ -3,7 +3,7 @@ Starting in the HPE Slingshot 2.3.0 release, the HPE Slingshot Network Operator is installed as part of the Fabric Manager install. It is a Kubernetes operator that is designed to support multi-tenancy in CSM 1.6 and later releases. -For more information on the HPE Slingshot Network Operator, see the "HPE Slingshot Network Operator for CSM Multi-Tenancy" section in the _HPE Slingshot Administration Guide_. +For more information on the HPE Slingshot Network Operator, see the "HPE Slingshot Network Operator for CSM Multi-Tenancy" section in the _HPE Slingshot Administration Guide_. Search for this document on the [HPE Support Center](https://support.hpe.com/hpesc/public/home). 
The HPE Slingshot documentation outlines several critical tasks, including: From f158f868c9980e0613855a119f4107111a058111 Mon Sep 17 00:00:00 2001 From: David Laine <77020169+dlaine-hpe@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:08:02 -0600 Subject: [PATCH 10/18] CASMCMS-9226 - fix misspelling. (#5587) --- operations/image_management/Configure_IMS_to_Use_DKMS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/image_management/Configure_IMS_to_Use_DKMS.md b/operations/image_management/Configure_IMS_to_Use_DKMS.md index 52698a2040e3..5f6ea43cd1df 100644 --- a/operations/image_management/Configure_IMS_to_Use_DKMS.md +++ b/operations/image_management/Configure_IMS_to_Use_DKMS.md @@ -7,7 +7,7 @@ tool. This allows kernel modules to be built for the specific kernel used in the access to the running kernel that is not usually allowed by the Image Management Service (IMS). In order to safely allow the expanded access, the IMS configuration must be modified to enable the feature. -## Requirements of DMKS +## Requirements of DKMS Many DKMS build and install scripts require access to the system `/proc`, `/dev`, and `/sys` directories which allows access to running processes and system services. The IMS jobs run as an administrator user since preparing From 524e529668e6bda9a3ff28cec7915f703fc535d3 Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:05:06 +0000 Subject: [PATCH 11/18] CASMNET-2238 - Add switch firmware upgrade step to CSM upgrade procedure (#5577) --- .../iuf/workflows/management_rollout.md | 64 ++++++++++++------- .../update_management_network_firmware.md | 28 ++++++++ 2 files changed, 68 insertions(+), 24 deletions(-) diff --git a/operations/iuf/workflows/management_rollout.md b/operations/iuf/workflows/management_rollout.md index c341654ce199..1a1fbea33709 100644 --- a/operations/iuf/workflows/management_rollout.md +++ b/operations/iuf/workflows/management_rollout.md @@ -3,14 +3,15 @@ This section updates the software running on management NCNs. - [1. Perform Slingshot switch firmware updates](#1-perform-slingshot-switch-firmware-updates) -- [2. Update management host firmware (FAS)](#2-update-management-host-firmware-fas) -- [3. Execute the IUF `management-nodes-rollout` stage](#3-execute-the-iuf-management-nodes-rollout-stage) - - [3.1 `management-nodes-rollout` with CSM upgrade](#31-management-nodes-rollout-with-csm-upgrade) - - [3.2 `management-nodes-rollout` without CSM upgrade](#32-management-nodes-rollout-without-csm-upgrade) - - [3.3 NCN worker nodes](#33-ncn-worker-nodes) -- [4. Restart `goss-servers` on all NCNs](#4-restart-goss-servers-on-all-ncns) -- [5. Update management host Slingshot NIC firmware](#5-update-management-host-slingshot-nic-firmware) -- [6. Next steps](#6-next-steps) +- [2. Perform management network switch firmware updates](#2-perform-management-network-switch-firmware-updates) +- [3. Update management host firmware (FAS)](#3-update-management-host-firmware-fas) +- [4. Execute the IUF `management-nodes-rollout` stage](#4-execute-the-iuf-management-nodes-rollout-stage) + - [4.1 `management-nodes-rollout` with CSM upgrade](#41-management-nodes-rollout-with-csm-upgrade) + - [4.2 `management-nodes-rollout` without CSM upgrade](#42-management-nodes-rollout-without-csm-upgrade) + - [4.3 NCN worker nodes](#43-ncn-worker-nodes) +- [5. Restart `goss-servers` on all NCNs](#5-restart-goss-servers-on-all-ncns) +- [6. 
Update management host Slingshot NIC firmware](#6-update-management-host-slingshot-nic-firmware) +- [7. Next steps](#7-next-steps) ## 1. Perform Slingshot switch firmware updates @@ -22,7 +23,22 @@ Once this step has completed: - Slingshot switch firmware has been updated -## 2. Update management host firmware (FAS) +## 2. Perform management network switch firmware updates + +**`NOTE`** This section is optional and can be skipped or deferred unless network configuration that requires updated firmware is being applied to the system. + +Management network switch firmware is shipped in the HPC Firmware Pack (HFP) product tarball. + +Refer to [Update Management Network Firmware](../../network/management_network/firmware/update_management_network_firmware.md) for instructions on performing the switch firmware update. + +**`NOTE`** The firmware on spine, leaf, and CDU switches can be updated without disruption. Air-cooled compute nodes, their BMCs, and other air-cooled devices +such as Slingshot switches will experience a loss of connectivity while the leaf-bmc switch the device is connected to restarts. + +Once this step has been completed: + +- Management network switch firmware has been updated + +## 3. Update management host firmware (FAS) **`NOTE`** This subsection is optional and can be skipped if upgrading only CSM through IUF. @@ -32,7 +48,7 @@ Once this step has completed: - Host firmware has been updated on management nodes -## 3. Execute the IUF `management-nodes-rollout` stage +## 4. Execute the IUF `management-nodes-rollout` stage This section describes how to update software on management nodes. It describes how to test a new image and CFS configuration on a single node first to ensure they work as expected before rolling the changes out to the other management nodes. This initial test node is referred to as the "canary node". Modify the procedure as necessary to accommodate site preferences for rebuilding management nodes. The images and CFS configurations used are created by the @@ -49,10 +65,10 @@ being upgraded, then NCN storage nodes and NCN master nodes will not be upgraded upgraded, the NCN storage nodes and NCN master nodes will be upgraded with new images and the new CFS configuration. Both procedures use the same steps for rebuilding/upgrading NCN worker nodes. Select **one** of the following procedures based on whether or not CSM is being upgraded: -- [`management-nodes-rollout` with CSM upgrade](#31-management-nodes-rollout-with-csm-upgrade) -- [`management-nodes-rollout` without CSM upgrade](#32-management-nodes-rollout-without-csm-upgrade) +- [`management-nodes-rollout` with CSM upgrade](#41-management-nodes-rollout-with-csm-upgrade) +- [`management-nodes-rollout` without CSM upgrade](#42-management-nodes-rollout-without-csm-upgrade) -### 3.1 `management-nodes-rollout` with CSM upgrade +### 4.1 `management-nodes-rollout` with CSM upgrade All management nodes will be upgraded to a new image because CSM itself is being upgraded. This section describes how to test a new image and CFS configuration on a single canary node first before rolling it out to the other management nodes of the same management type. @@ -152,7 +168,7 @@ Refer to that table and any corresponding product documents before continuing to cray cfs components describe "${XNAME}" ``` -1. Perform the NCN worker node upgrade. To upgrade worker nodes, follow the procedure in section [3.3 NCN worker nodes](#33-ncn-worker-nodes) and then return to this procedure to complete the next step. +1. 
Perform the NCN worker node upgrade. To upgrade worker nodes, follow the procedure in section [4.3 NCN worker nodes](#43-ncn-worker-nodes) and then return to this procedure to complete the next step.
 
 1. Perform the NCN master node upgrade of `ncn-m001`.
 
@@ -200,9 +216,9 @@ Refer to that table and any corresponding product documents before continuing to
 
 - All management NCNs have been upgraded to the image and CFS configuration created in the previous steps of this workflow
 - Per-stage product hooks have executed for the `management-nodes-rollout` stage
 
-Continue to the next section [4. Restart `goss-servers` on all NCNs](#4-restart-goss-servers-on-all-ncns).
+Continue to the next section [5. Restart `goss-servers` on all NCNs](#5-restart-goss-servers-on-all-ncns).
 
-### 3.2 `management-nodes-rollout` without CSM upgrade
+### 4.2 `management-nodes-rollout` without CSM upgrade
 
 This is the procedure to rollout management nodes if CSM is not being upgraded. NCN worker node images contain kernel module content from non-CSM products and need to be rebuilt as part of the workflow. Unlike NCN worker nodes, NCN master nodes and storage nodes do not contain kernel module content from non-CSM products. However, user-space non-CSM product content is still provided on NCN master nodes and storage nodes and thus the `prepare-images` and `update-cfs-config`
@@ -215,7 +231,7 @@ Follow the following steps to complete the `management-nodes-rollout` stage.
 section of the _HPE Cray EX System Software Stack Installation and Upgrade Guide for CSM (S-8052)_ provides a table that summarizes which product documents contain information or actions for the `management-nodes-rollout` stage. Refer to that table and any corresponding product documents before continuing to the next step.
 
-1. Rebuild the NCN worker nodes. Follow the procedure in section [3.3 NCN worker nodes](#33-ncn-worker-nodes) and then return to this procedure to complete the next step.
+1. Rebuild the NCN worker nodes. Follow the procedure in section [4.3 NCN worker nodes](#43-ncn-worker-nodes) and then return to this procedure to complete the next step.
 
 1. Configure NCN master nodes.
 
@@ -339,9 +355,9 @@ Once this step has completed:
 
 - Management NCN storage and NCN master nodes have been updated with the CFS configuration created in the previous steps of this workflow.
 - Per-stage product hooks have executed for the `management-nodes-rollout` stage
 
-Continue to the next section [4. Restart `goss-servers` on all NCNs](#4-restart-goss-servers-on-all-ncns).
+Continue to the next section [5. Restart `goss-servers` on all NCNs](#5-restart-goss-servers-on-all-ncns).
 
-### 3.3 NCN worker nodes
+### 4.3 NCN worker nodes
 
 NCN worker node images contain kernel module content from non-CSM products and need to be rebuilt as part of the workflow. This section describes how to test a new image and CFS configuration on a single canary node (`ncn-w001`) first before rolling it out to the other NCN worker nodes. Modify the procedure as necessary to accommodate site preferences for rebuilding NCN worker nodes.
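
For reference, the canary-first flow described above maps onto the IUF CLI roughly as sketched below. The activity name is a placeholder, and the exact `--limit-management-rollout` arguments should be taken from the IUF stage documentation rather than from this sketch.

```bash
# Rebuild only the canary worker node first (ACTIVITY_NAME is hypothetical).
iuf -a "${ACTIVITY_NAME}" run -r management-nodes-rollout --limit-management-rollout ncn-w001

# Once the canary node is validated, roll out the remaining worker nodes.
iuf -a "${ACTIVITY_NAME}" run -r management-nodes-rollout --limit-management-rollout Management_Worker
```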
@@ -432,10 +448,10 @@ Once this step has completed: - Management NCN worker nodes have been rebuilt with the image and CFS configuration created in previous steps of this workflow - Per-stage product hooks have executed for the `management-nodes-rollout` stage -Return to the procedure that was being followed for `management-nodes-rollout` to complete the next step, either [Management-nodes-rollout with CSM upgrade](#31-management-nodes-rollout-with-csm-upgrade) or -[Management-nodes-rollout without CSM upgrade](#32-management-nodes-rollout-without-csm-upgrade). +Return to the procedure that was being followed for `management-nodes-rollout` to complete the next step, either [Management-nodes-rollout with CSM upgrade](#41-management-nodes-rollout-with-csm-upgrade) or +[Management-nodes-rollout without CSM upgrade](#42-management-nodes-rollout-without-csm-upgrade). -## 4. Restart `goss-servers` on all NCNs +## 5. Restart `goss-servers` on all NCNs **`NOTE`** Skip this step if the CSM version is 1.6.1 or above. This step will cause no harm if done on CSM 1.6.1 or higher, but it is unnecessary. @@ -449,7 +465,7 @@ ncn_nodes=${ncn_nodes%,} pdsh -S -b -w $ncn_nodes 'systemctl restart goss-servers' ``` -## 5. Update management host Slingshot NIC firmware +## 6. Update management host Slingshot NIC firmware **`NOTE`** This subsection is optional and can be skipped if upgrading only CSM through IUF. @@ -464,7 +480,7 @@ Once this step has completed: - Service checks have been run to verify product microservices are executing as expected - Per-stage product hooks have executed for the `deploy-product` and `post-install-service-check` stages -## 6. Next steps +## 7. Next steps - If performing an initial install or an upgrade of non-CSM products only, return to the [Install or upgrade additional products with IUF](install_or_upgrade_additional_products_with_iuf.md) diff --git a/operations/network/management_network/firmware/update_management_network_firmware.md b/operations/network/management_network/firmware/update_management_network_firmware.md index c1e6de483304..d5ed7a329714 100644 --- a/operations/network/management_network/firmware/update_management_network_firmware.md +++ b/operations/network/management_network/firmware/update_management_network_firmware.md @@ -25,6 +25,34 @@ Dell and Mellanox firmware must be downloaded from the manufacturer. | Dell S4148T-ON Switch Series | `10.5.1.4` | | Dell S4148F-ON Switch Series | `10.5.1.4` | +## Verify current switch firmware levels + +The CANU utility can be used to report the current management switch firmware levels. The example output shows that the firmware +is not at the recommended version and needs to be updated. + +(`ncn-m#`) Run `canu` to report the firmware level of all switches. The switch admin user password should be supplied when prompted. 
+ +```bash +canu report network firmware --csm 1.6 --ips $(awk '/sw-/{ printf "%s%s", sep, $1; sep="," }' /etc/hosts) +``` + +Example output: + +```text +------------------------------------------------------------------ + STATUS IP HOSTNAME FIRMWARE +------------------------------------------------------------------ + ❌ Fail 10.254.0.2 sw-spine-001 LL.10.11.1010 Firmware should be in range ['LL.10.13.1040'] + ❌ Fail 10.254.0.3 sw-spine-002 LL.10.11.1010 Firmware should be in range ['LL.10.13.1040'] + ❌ Fail 10.254.0.4 sw-leaf-bmc-001 FL.10.11.1010 Firmware should be in range ['FL.10.13.1040'] + +Summary +------------------------------------------------------------------ +❌ Fail - 3 switches +LL.10.11.1010 - 2 switches +FL.10.11.1010 - 1 switches +``` + ## Aruba Firmware Best Practices Aruba software version number explained: From 315e53689ece194bec389d433215d3798a15b918 Mon Sep 17 00:00:00 2001 From: Don Bahls <114519367+don-bahls-hpe@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:06:10 -0600 Subject: [PATCH 12/18] SSI-14310 Update docs to reflect that USS now contains multiple component areas (#5581) * Update docs to reflect that USS now contains multiple component areas * Update style issues * More style cleanup * More style cleanup * Update operations/iuf/workflows/configuration.md Co-authored-by: Dean Roe Signed-off-by: Don Bahls <114519367+don-bahls-hpe@users.noreply.github.com> * Remove references to UAS service config for WLM since UAS has been removed --------- Signed-off-by: Don Bahls <114519367+don-bahls-hpe@users.noreply.github.com> Co-authored-by: Dean Roe --- operations/iuf/workflows/configuration.md | 37 +++++++++++------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/operations/iuf/workflows/configuration.md b/operations/iuf/workflows/configuration.md index 7f9f24e28b25..c9c1dc68c93e 100644 --- a/operations/iuf/workflows/configuration.md +++ b/operations/iuf/workflows/configuration.md @@ -91,14 +91,23 @@ The following highlights some of the areas that require manual configuration cha required for initial installation scenarios. - USS - - Configure DVS and LNet with appropriate Slingshot settings - - Configure DVS and LNet for use on application nodes - - Enable site-specific file system mounts - - Set the USS root password in HashiCorp Vault -- UAN - - Enable CAN, LDAP, and set MOTD - - Move DVS and LNet settings to USS branch - - Set the UAN root password in HashiCorp Vault + - Compute Configuration + - Configure DVS and LNet with appropriate Slingshot settings + - Configure DVS and LNet for use on application nodes + - Enable site-specific file system mounts + - Set the USS root password in HashiCorp Vault + - UAN Configuration + - Enable CAN, LDAP, and set MOTD + - Move DVS and LNet settings to USS branch + - Set the UAN root password in HashiCorp Vault + - Enable UAIs on UAN + - SLURM Configuration + - CSM Diags + - Update CSM Diags network attachment definition + - PBS Pro Configuration + - CSM Diags + - Update CSM Diags network attachment definition + - SHS - Update release information in `group_vars` (done for each product release) - CPE @@ -109,18 +118,6 @@ required for initial installation scenarios. 
  - Configure SAT authentication via `sat auth`
   - Generate SAT S3 credentials
   - Configure system revision information via `sat setrev`
-- SLURM
-  - UAS
-    - Configure UAS network settings
-      - The network settings for UAS must match the SLURM WLM to allow job submission from UAIs
-  - CSM Diags
-    - Update CSM Diags network attachment definition
-- PBS Pro
-  - UAS
-    - Configure UAS network settings
-      - The network settings for UAS must match the PBS Pro WLM to allow job submission from UAIs
-  - CSM Diags
-    - Update CSM Diags network attachment definition
 
 Once this step has completed:
 
From f4cd39c80beb782969491c9f0773d0cd66b8d9a0 Mon Sep 17 00:00:00 2001
From: studenym-hpe
Date: Tue, 17 Dec 2024 14:04:27 -0600
Subject: [PATCH 13/18] CASMTRIAGE-7607: Grab docker-kubectl image from
 cacheImages for nexus upgrade (#5598)

CASMTRIAGE-7607: Grab docker-kubectl image from cacheImages for nexus
upgrade.
---
 upgrade/scripts/upgrade/prerequisites.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh
index 2de2132f9644..fad9131115f4 100755
--- a/upgrade/scripts/upgrade/prerequisites.sh
+++ b/upgrade/scripts/upgrade/prerequisites.sh
@@ -532,7 +532,8 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then
 
   # Skopeo image is stored as "skopeo:csm-${CSM_RELEASE}", which may resolve to docker.io/library/skopeo or quay.io/skopeo, depending on configured shortcuts
   SKOPEO_IMAGE=$(podman load -q -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" 2> /dev/null | sed -e 's/^.*: //')
-  nexus_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( . | contains("nexus"))')
+  # Grab nexus and docker-kubectl images from cacheImages list, remove duplicates.
+  nexus_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( . | contains("nexus", "docker-kubectl"))' | sort | uniq)
   worker_nodes=$(grep -oP "(ncn-w\d+)" /etc/hosts | sort -u)
   while read -r nexus_image; do
     echo "Uploading $nexus_image into Nexus ..."
@@ -605,8 +606,9 @@ state_name="PRECACHE_ISTIO_IMAGES"
 state_recorded=$(is_state_recorded "${state_name}" "$(hostname)")
 if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then
   echo "====> ${state_name} ..." | tee -a "${LOG_FILE}"
+  # Grab istio and docker-kubectl images from cacheImages list, remove duplicates.
   {
-    istio_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( . | (contains("istio") or contains("docker-kubectl")))')
+    istio_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( .
| (contains("istio", "docker-kubectl")))' | sort | uniq)
     worker_nodes=$(grep -oP "(ncn-w\d+)" /etc/hosts | sort -u)
     while read -r istio_image; do
       while read -r worker_node; do
From 237547a53452766122c055b0ea459c5f3c5e333a Mon Sep 17 00:00:00 2001
From: studenym-hpe
Date: Tue, 17 Dec 2024 14:06:07 -0600
Subject: [PATCH 14/18] CASMTRIAGE-7504: Fix kubectl examples to use
 cray-console-data-postgres-0 (#5602)

CASMTRIAGE-7504: Fix kubectl examples to use cray-console-data-postgres-0
rather than keycloak-postgres-0
---
 operations/kubernetes/Troubleshoot_Postgres_Database.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/operations/kubernetes/Troubleshoot_Postgres_Database.md b/operations/kubernetes/Troubleshoot_Postgres_Database.md
index 9b94bc5e5b33..eac157145b07 100644
--- a/operations/kubernetes/Troubleshoot_Postgres_Database.md
+++ b/operations/kubernetes/Troubleshoot_Postgres_Database.md
@@ -306,7 +306,7 @@ For example:
     Re-run the following command until it succeeds and reports that the leader pod is `running`.
 
     ```bash
-    kubectl exec keycloak-postgres-0 -c postgres -n services -it -- patronictl list
+    kubectl exec cray-console-data-postgres-0 -c postgres -n services -it -- patronictl list
     ```
 
     Example output:
@@ -328,7 +328,7 @@ For example:
 1. (`ncn-mw#`) Determine which pods are reporting lag.
 
     ```bash
-    kubectl exec cray-console-postgres-0 -c postgres -n services -it -- patronictl list
+    kubectl exec cray-console-data-postgres-0 -c postgres -n services -it -- patronictl list
     ```
 
     Example output:
@@ -352,7 +352,7 @@ For example:
 1. (`ncn-mw#`) Once the pods restart, verify that the lag has resolved.
 
     ```bash
-    kubectl exec cray-console-postgres-0 -c postgres -n services -it -- patronictl list
+    kubectl exec cray-console-data-postgres-0 -c postgres -n services -it -- patronictl list
     ```
 
     Example output:
From ffc55b8e18d4fe39a9d6b72f077e054cdc42a491 Mon Sep 17 00:00:00 2001
From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com>
Date: Tue, 17 Dec 2024 14:07:08 -0600
Subject: [PATCH 15/18] CASMTRIAGE-7616 make certmanager upgrade more robust,
 redo upgrade if issuers chart deploy fails (#5603)

CASMTRIAGE-7616 make certmanager upgrade more robust, should redo upgrade if
issuers chart deploy fails
---
 upgrade/scripts/upgrade/prerequisites.sh | 32 ++++++++++++++++++++----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh
index fad9131115f4..4083b5e4d33c 100755
--- a/upgrade/scripts/upgrade/prerequisites.sh
+++ b/upgrade/scripts/upgrade/prerequisites.sh
@@ -732,6 +732,14 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then
     fi
   fi
 
+  # check if the cray-certmanager-issuers chart failed to deploy
+  # this will be entered if the certmanager upgrade failed on or before
+  # the certmanager-issuer chart install
+  if ! helm history -n cert-manager cray-certmanager-issuers > /dev/null 2>&1; then
+    printf "note: no helm install exists for cert-manager-issuers. Cert-manager upgrade is needed to install cert-manager-issuers\n"
+    ((needs_upgrade += 1))
+  fi
+
   # cert-manager will need to be upgraded if cray-drydock version is less than 2.18.4.
   # This will only be the case in some CSM 1.6 to CSM 1.6 upgrades.
   # It only needs to be checked if cert-manager is not already being upgraded.
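
The guard above follows a probe-before-act pattern: query the current state and only count an upgrade as needed when that state is missing, so re-running `prerequisites.sh` stays idempotent. A minimal standalone sketch of the pattern, with the release and namespace names taken from this patch and the surrounding harness hypothetical:

```bash
#!/usr/bin/env bash
# Treat a missing cray-certmanager-issuers Helm release as a signal that the
# cert-manager upgrade must (re)run; helm history exits non-zero when the
# release has never been installed.
needs_upgrade=0

if ! helm history -n cert-manager cray-certmanager-issuers > /dev/null 2>&1; then
  echo "note: no helm install exists for cray-certmanager-issuers"
  ((needs_upgrade += 1))
fi

echo "needs_upgrade=${needs_upgrade}"
```

Because the check is read-only, rerunning the script after a partial failure is safe.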
@@ -759,13 +767,13 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then
     fi
   fi
 
+  # make this name unique for CSM 1.6 in case CSM 1.5 secret still exists
+  backup_secret="cm-restore-data-16"
+
   # Only run if we need to and detected not 1.12.9 or ""
   if [ "${needs_upgrade}" -gt 0 ]; then
     cmns="cert-manager"
 
-    # make this name unique for CSM 1.6 in case CSM 1.5 secret still exists
-    backup_secret="cm-restore-data-16"
-
     # We need to backup before any helm uninstalls.
     needs_backup=0
 
@@ -884,9 +892,23 @@ EOF
   # The warning statement above needs to stay a warning. It does not exit 0 because Issuers should already exist.
   # 5 is an arbitrary number, expect ~21 certificates
   if [[ $(kubectl get certificates -A | wc -l) -lt 5 ]]; then
-    echo "ERROR: certificates were not restored after certmanager upgrade. 'kubectl get certificates -A' does not show certificates."
+    echo "WARNING: certificates were not restored after certmanager upgrade. 'kubectl get certificates -A' does not show certificates."
     echo "Certificates should have been restored from backup: 'kubectl get secret ${backup_secret?}'"
-    exit 1
+    if helm history -n cert-manager cray-certmanager-issuers > /dev/null 2>&1 && helm history -n cert-manager cray-certmanager > /dev/null 2>&1; then
+      echo "cray-certmanager and cray-certmanager-issuers have been installed. Attempting to restore cert-manager backup"
+      if kubectl get secret "${backup_secret?}" > /dev/null 2>&1; then
+        kubectl get secret "${backup_secret?}" -o jsonpath='{.data.data}' | base64 -d | kubectl apply -f -
+      fi
+      if [[ $(kubectl get certificates -A | wc -l) -lt 5 ]]; then
+        echo "ERROR: certificates failed to restore. 'kubectl get certificates -A' does not show certificates."
+        exit 1
+      else
+        echo "Certificates were successfully restored"
+      fi
+    else
+      echo "ERROR: cray-certmanager and/or cray-certmanager-issuers charts failed to deploy"
+      exit 1
+    fi
   fi
   # delete CSM 1.5 cert-manager backup if it exists
   backup_secret_csm_15="cm-restore-data"
From 9c15d641075fd91ed88bdeaefaf1f6d04a264271 Mon Sep 17 00:00:00 2001
From: Jason Davis <106175251+jpdavis-prof@users.noreply.github.com>
Date: Tue, 17 Dec 2024 13:09:00 -0700
Subject: [PATCH 16/18] CASMINST-7039 cleanup old images and overlayFS on
 upgrade (#5605)

CASMINST-7039
---
 .../scripts/upgrade/ncn-upgrade-ceph-nodes.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/upgrade/scripts/upgrade/ncn-upgrade-ceph-nodes.sh b/upgrade/scripts/upgrade/ncn-upgrade-ceph-nodes.sh
index 4b6f28ef3941..042e2324cd70 100755
--- a/upgrade/scripts/upgrade/ncn-upgrade-ceph-nodes.sh
+++ b/upgrade/scripts/upgrade/ncn-upgrade-ceph-nodes.sh
@@ -81,6 +81,22 @@ else
   echo "====> ${state_name} has been completed"
 fi
 
+state_name="CLEANUP_LIVE_IMAGES"
+state_recorded=$(is_state_recorded "${state_name}" ${target_ncn})
+if [[ $state_recorded == "0" ]]; then
+  echo "====> ${state_name} ..."
+ { + if [[ $ssh_keys_done == "0" ]]; then + ssh_keygen_keyscan "${target_ncn}" + ssh_keys_done=1 + fi + ssh ${target_ncn} "/srv/cray/scripts/metal/cleanup-live-images.sh -y" + } >> ${LOG_FILE} 2>&1 + record_state "${state_name}" ${target_ncn} +else + echo "====> ${state_name} has been completed" +fi + ${basedir}/../common/ncn-rebuild-common.sh $target_ncn state_name="INSTALL_TARGET_SCRIPT" From b0ebab68ed6988135c1c3031216184bb67fbcd73 Mon Sep 17 00:00:00 2001 From: Nick Davidson <86747615+ndavidson-hpe@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:22:02 -0700 Subject: [PATCH 17/18] CASMTRIAGE-7577: Update to add cray-spire-jwks (#5593) * CASMTRIAGE-7577: Update to add cray-spire-jwks * Update example to better reflect real life --- ..._On_and_Start_the_Management_Kubernetes_Cluster.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/operations/power_management/Power_On_and_Start_the_Management_Kubernetes_Cluster.md b/operations/power_management/Power_On_and_Start_the_Management_Kubernetes_Cluster.md index cd2c74b64a35..44ed1b4efbca 100644 --- a/operations/power_management/Power_On_and_Start_the_Management_Kubernetes_Cluster.md +++ b/operations/power_management/Power_On_and_Start_the_Management_Kubernetes_Cluster.md @@ -330,7 +330,7 @@ Some systems are configured with lazy mounts that do not have this requirement f To resolve the space issue, see [Troubleshoot Ceph OSDs Reporting Full](../utility_storage/Troubleshoot_Ceph_OSDs_Reporting_Full.md). -1. (`ncn-m001#`) Check that `spire` pods have started. +1. (`ncn-m001#`) Check that `spire` and `cray-spire` pods have started. Monitor the status of the `spire-jwks` pods to ensure they restart and enter the `Running` state. @@ -341,6 +341,9 @@ Some systems are configured with lazy mounts that do not have this requirement f Example output: ```text + cray-spire-jwks-57bbb4f5c7-57j5k 2/3 CrashLoopBackOff 9 23h 10.44.0.31 ncn-w002 + cray-spire-jwks-57bbb4f5c7-crb2m 2/3 CrashLoopBackOff 9 23h 10.36.0.34 ncn-w003 + cray-spire-jwks-57bbb4f5c7-lq9ar 2/3 CrashLoopBackOff 9 23h 10.39.0.5 ncn-w001 spire-jwks-6b97457548-gc7td 2/3 CrashLoopBackOff 9 23h 10.44.0.117 ncn-w002 spire-jwks-6b97457548-jd7bd 2/3 CrashLoopBackOff 9 23h 10.36.0.123 ncn-w003 spire-jwks-6b97457548-lvqmf 2/3 CrashLoopBackOff 9 23h 10.39.0.79 ncn-w001 @@ -352,6 +355,12 @@ Some systems are configured with lazy mounts that do not have this requirement f kubectl rollout restart -n spire deployment spire-jwks ``` + 1. (`ncn-m001#`) If the `cray-spire-jwks` pods indicate `CrashLoopBackOff`, then restart the Cray Spire deployment. + + ```bash + kubectl rollout restart -n spire deployment cray-spire-jwks + ``` + 1. (`ncn-m001#`) Rejoin Spire on the worker and master NCNs, to avoid issues with Spire tokens. 
```bash From b99c82670e02bb78dcd0a58eb12b7f93e8285a01 Mon Sep 17 00:00:00 2001 From: Srinivas-Anand-HPE Date: Wed, 18 Dec 2024 15:56:20 +0530 Subject: [PATCH 18/18] CASMINST-5657 Add common WorkflowTemplate to sync secret to Argo namespace --- .../cleanup-nexus-admin-credential.yaml | 57 ++++++++ .../nexus-docker-upload-template.yaml | 135 +----------------- .../nexus-get-prerequisites-template.yaml | 124 ++++++++++++++++ .../nexus-helm-upload-template.yaml | 135 +----------------- .../nexus-rpm-upload-template.yaml | 133 +---------------- .../nexus-setup/nexus-setup-template.yaml | 130 +---------------- 6 files changed, 208 insertions(+), 506 deletions(-) create mode 100644 workflows/iuf/operations/nexus-setup/cleanup-nexus-admin-credential.yaml create mode 100644 workflows/iuf/operations/nexus-setup/nexus-get-prerequisites-template.yaml diff --git a/workflows/iuf/operations/nexus-setup/cleanup-nexus-admin-credential.yaml b/workflows/iuf/operations/nexus-setup/cleanup-nexus-admin-credential.yaml new file mode 100644 index 000000000000..49109ee825c1 --- /dev/null +++ b/workflows/iuf/operations/nexus-setup/cleanup-nexus-admin-credential.yaml @@ -0,0 +1,57 @@ +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: cleanup-nexus-admin-credential-template + namespace: argo + annotations: + sidecar.istio.io/inject: "false" +spec: + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + entrypoint: cleanup-nexus-admin-credential + arguments: + parameters: + - name: nexus_admin_credential_secret_name + templates: + - name: cleanup-nexus-admin-credential + inputs: + parameters: + - name: nexus_admin_credential_secret_name + script: + image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64 + command: [bash] + source: | + nexus_secret_name={{inputs.parameters.nexus_admin_credential_secret_name}} + echo "DEBUG Deleting secret $nexus_secret_name from argo workspace" + result=$(kubectl -n argo delete secret/$nexus_secret_name 2>&1) + if [ $? 
-ne 0 ]; then + result=$(echo "$result" | sed -e 's/^/DEBUG /') + echo "ERROR Deleting secret $nexus_secret_name failed in argo namespace" + echo -e "DEBUG failed with\n\n$result" + fi diff --git a/workflows/iuf/operations/nexus-setup/nexus-docker-upload-template.yaml b/workflows/iuf/operations/nexus-setup/nexus-docker-upload-template.yaml index f91f48ac7f4d..7747e6ae6286 100644 --- a/workflows/iuf/operations/nexus-setup/nexus-docker-upload-template.yaml +++ b/workflows/iuf/operations/nexus-setup/nexus-docker-upload-template.yaml @@ -60,7 +60,9 @@ spec: name: workflow-template-record-time-template template: record-time-template - - name: nexus-get-prerequisites - template: nexus-get-prerequisites-template + templateRef: + name: nexus-get-prerequisites-template + template: nexus-get-prerequisites arguments: parameters: - name: global_params @@ -68,8 +70,10 @@ spec: - - name: nexus-docker-load template: nexus-docker-load-template hooks: - exit: - template: cleanup-template + exit: + templateRef: + name: cleanup-nexus-admin-credential-template + template: cleanup-nexus-admin-credential arguments: parameters: - name: nexus_admin_credential_secret_name @@ -152,103 +156,6 @@ spec: command: [sh, -c] args: ["DIFF_TIME=$(expr {{inputs.parameters.opend}} - {{inputs.parameters.opstart}}); echo $DIFF_TIME; echo $DIFF_TIME > /tmp/diff_time.txt"] ### Templates ### -## nexus-get-prerequisites-template ## - - name: nexus-get-prerequisites-template - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane - effect: NoSchedule - metadata: - annotations: - sidecar.istio.io/inject: "false" - inputs: - parameters: - - name: global_params - value: "{{workflow.parameters.global_params}}" - outputs: - parameters: - - name: secret_name - valueFrom: - path: /tmp/secret_name - - name: current_product_manifest - valueFrom: - path: /tmp/current_product_manifest - - name: product_directory - valueFrom: - path: /tmp/product_directory - retryStrategy: - limit: "2" - retryPolicy: "Always" - backoff: - duration: "10s" # Must be a string. Default unit is seconds. Could also be a Duration, e.g.: "2m", "6h", "1d" - factor: "2" - maxDuration: "1m" - script: - # TBD: This is a repeated function. Can this change to a reference? - image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64 - command: [bash] - source: | - function sync_item() { - item_name="$1" - source_ns="$2" - destination_name="$3-$RANDOM" - destination_ns="$4" - result=$(kubectl get $item_name -n $source_ns 2>&1) - if [ $? -eq 0 ]; then - echo "DEBUG Syncing $item_name from $source_ns to $destination_ns as $destination_name" - kubectl get $item_name -n $source_ns -o json | \ - jq 'del(.metadata.namespace)' | \ - jq 'del(.metadata.creationTimestamp)' | \ - jq 'del(.metadata.resourceVersion)' | \ - jq 'del(.metadata.selfLink)' | \ - jq 'del(.metadata.uid)' | \ - jq 'del(.metadata.ownerReferences)' | \ - jq 'del(.metadata.name)' | \ - jq '.metadata |= . + {"name":"'$destination_name'"}' | \ - kubectl apply -n $destination_ns -f - - rc=$? 
-            if [ $rc -ne 0 ]; then
-              echo "ERROR Failed to create secret $destination_name in $destination_ns namespace from $item_name in $source_ns namespace"
-            fi
-            return $rc
-          else
-            echo "ERROR $item_name not found in $source_ns namespace"
-            result=$(echo "$result" | sed -e 's/^/DEBUG /')
-            echo -e "DEBUG failed with\n\n$result"
-            return 1
-          fi
-        }
-        err=0
-        sync_item secret/nexus-admin-credential nexus nexus-admin-credential-argo argo
-        if [ $? -ne 0 ]; then
-          err=1
-        fi
-        echo $destination_name > /tmp/secret_name
-
-        # Retrieve the content of the current product manifest as json.
-        product_name={{inputs.parameters.global_params}} | jq -r '.product_manifest.current_product.name'
-        echo "DEBUG Fetching the product manifest for $product_name"
-        cat <<EOF > /tmp/current_product_manifest
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          err=1
-          echo "ERROR Failed to fetch product manifest for $product_name. Rerun with valid product tarball from 'process-media' stage"
-        fi
-
-        # Retrieve the product directory.
-        echo "DEBUG Retrieving the product directory for $product_name"
-        cat <<EOF > /tmp/product_directory
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          echo "ERROR Failed to fetch product directory for $product_name. Rerun with valid product tarball from 'process-media' stage"
-          err=1
-        fi
-
-        # Exit with an error if we had any.
-        exit $err
 ## nexus-docker-load-template ##
   - name: nexus-docker-load-template
     inputs:
@@ -294,31 +201,3 @@ spec:
       - name: image
         hostPath:
           path: "{{inputs.parameters.product_directory}}"
-## cleanup-template ##
-## Remove the secret created earlier.
-# TBD: This is a repeated function. Can this change to a reference?
-  - name: cleanup-template
-    inputs:
-      parameters:
-        - name: nexus_admin_credential_secret_name
-          value: "{{steps.nexus-get-prerequisites.outputs.parameters.secret_name}}"
-    tolerations:
-      - key: node-role.kubernetes.io/master
-        effect: NoSchedule
-      - key: node-role.kubernetes.io/control-plane
-        effect: NoSchedule
-    metadata:
-      annotations:
-        sidecar.istio.io/inject: "false"
-    script:
-      image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64
-      command: [bash]
-      source: |
-        nexus_secret_name={{inputs.parameters.nexus_admin_credential_secret_name}}
-        echo "DEBUG Deleting secret $nexus_secret_name from argo workspace"
-        result=$(kubectl -n argo delete secret/$nexus_secret_name 2>&1)
-        if [ $?
-ne 0 ]; then - result=$(echo "$result" | sed -e 's/^/DEBUG /') - echo "ERROR Deleting secret $nexus_secret_name failed in argo namespace" - echo -e "DEBUG failed with\n\n$result" - fi diff --git a/workflows/iuf/operations/nexus-setup/nexus-get-prerequisites-template.yaml b/workflows/iuf/operations/nexus-setup/nexus-get-prerequisites-template.yaml new file mode 100644 index 000000000000..b41717226d97 --- /dev/null +++ b/workflows/iuf/operations/nexus-setup/nexus-get-prerequisites-template.yaml @@ -0,0 +1,124 @@ +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: nexus-get-prerequisites-template + namespace: argo + annotations: + sidecar.istio.io/inject: "false" +spec: + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + entrypoint: nexus-get-prerequisites + templates: + - name: nexus-get-prerequisites + inputs: + parameters: + - name: global_params + outputs: + parameters: + - name: secret_name + valueFrom: + path: /tmp/secret_name + - name: current_product_manifest + valueFrom: + path: /tmp/current_product_manifest + - name: product_directory + valueFrom: + path: /tmp/product_directory + retryStrategy: + limit: "2" + retryPolicy: "Always" + backoff: + duration: "10s" + factor: "2" + maxDuration: "1m" + script: + image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64 + command: [bash] + source: | + function sync_item() { + item_name="$1" + source_ns="$2" + destination_name="$3-$RANDOM" + destination_ns="$4" + result=$(kubectl get $item_name -n $source_ns 2>&1) + if [ $? -eq 0 ]; then + echo "DEBUG Syncing $item_name from $source_ns to $destination_ns as $destination_name" + kubectl get $item_name -n $source_ns -o json | \ + jq 'del(.metadata.namespace)' | \ + jq 'del(.metadata.creationTimestamp)' | \ + jq 'del(.metadata.resourceVersion)' | \ + jq 'del(.metadata.selfLink)' | \ + jq 'del(.metadata.uid)' | \ + jq 'del(.metadata.ownerReferences)' | \ + jq 'del(.metadata.name)' | \ + jq '.metadata |= . + {"name":"'$destination_name'"}' | \ + kubectl apply -n $destination_ns -f - + rc=$? 
+              if [ $rc -ne 0 ]; then
+                echo "ERROR Failed to create secret $destination_name in $destination_ns namespace from $item_name in $source_ns namespace"
+              fi
+              return $rc
+            else
+              echo "ERROR $item_name not found in $source_ns namespace"
+              result=$(echo "$result" | sed -e 's/^/DEBUG /')
+              echo -e "DEBUG failed with\n\n$result"
+              return 1
+            fi
+          }
+          err=0
+          sync_item secret/nexus-admin-credential nexus nexus-admin-credential-argo argo
+          if [ $? -ne 0 ]; then
+            err=1
+          fi
+          echo $destination_name > /tmp/secret_name
+
+          # Retrieve the content of the current product manifest as json.
+          product_name={{inputs.parameters.global_params}} | jq -r '.product_manifest.current_product.name'
+          echo "DEBUG Fetching the product manifest for $product_name"
+          cat <<EOF > /tmp/current_product_manifest
+          {{inputs.parameters.global_params}}
+          EOF
+          if [ $? -ne 0 ]; then
+            err=1
+            echo "ERROR Failed to fetch product manifest for $product_name. Rerun with valid product tarball from 'process-media' stage"
+          fi
+
+          # Retrieve the product directory.
+          echo "DEBUG Retrieving the product directory for $product_name"
+          cat <<EOF > /tmp/product_directory
+          {{inputs.parameters.global_params}}
+          EOF
+          if [ $? -ne 0 ]; then
+            echo "ERROR Failed to fetch product directory for $product_name. Rerun with valid product tarball from 'process-media' stage"
+            err=1
+          fi
+
+          # Exit with an error if we had any.
+          exit $err
diff --git a/workflows/iuf/operations/nexus-setup/nexus-helm-upload-template.yaml b/workflows/iuf/operations/nexus-setup/nexus-helm-upload-template.yaml
index 9d6b20555aee..5cdf5485d0cc 100644
--- a/workflows/iuf/operations/nexus-setup/nexus-helm-upload-template.yaml
+++ b/workflows/iuf/operations/nexus-setup/nexus-helm-upload-template.yaml
@@ -60,7 +60,9 @@ spec:
         name: workflow-template-record-time-template
         template: record-time-template
     - - name: nexus-get-prerequisites
-        template: nexus-get-prerequisites-template
+        templateRef:
+          name: nexus-get-prerequisites-template
+          template: nexus-get-prerequisites
       arguments:
         parameters:
           - name: global_params
@@ -68,8 +70,10 @@ spec:
     - - name: nexus-helm-load
         template: nexus-helm-load-template
         hooks:
-          exit:
-            template: cleanup-template
+          exit:
+            templateRef:
+              name: cleanup-nexus-admin-credential-template
+              template: cleanup-nexus-admin-credential
           arguments:
             parameters:
               - name: nexus_admin_credential_secret_name
@@ -151,103 +155,6 @@ spec:
       command: [sh, -c]
       args: ["DIFF_TIME=$(expr {{inputs.parameters.opend}} - {{inputs.parameters.opstart}}); echo $DIFF_TIME; echo $DIFF_TIME > /tmp/diff_time.txt"]
 ### Templates ###
-## nexus-get-prerequisites-template ##
-  - name: nexus-get-prerequisites-template
-    tolerations:
-      - key: node-role.kubernetes.io/master
-        effect: NoSchedule
-      - key: node-role.kubernetes.io/control-plane
-        effect: NoSchedule
-    metadata:
-      annotations:
-        sidecar.istio.io/inject: "false"
-    inputs:
-      parameters:
-        - name: global_params
-          value: "{{workflow.parameters.global_params}}"
-    outputs:
-      parameters:
-        - name: secret_name
-          valueFrom:
-            path: /tmp/secret_name
-        - name: current_product_manifest
-          valueFrom:
-            path: /tmp/current_product_manifest
-        - name: product_directory
-          valueFrom:
-            path: /tmp/product_directory
-    retryStrategy:
-      limit: "2"
-      retryPolicy: "Always"
-      backoff:
-        duration: "10s" # Must be a string. Default unit is seconds. Could also be a Duration, e.g.: "2m", "6h", "1d"
-        factor: "2"
-        maxDuration: "1m"
-    script:
-      # TBD: This is a repeated function. Can this change to a reference?
-      image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64
-      command: [bash]
-      source: |
-        function sync_item() {
-          item_name="$1"
-          source_ns="$2"
-          destination_name="$3-$RANDOM"
-          destination_ns="$4"
-          result=$(kubectl get $item_name -n $source_ns 2>&1)
-          if [ $? -eq 0 ]; then
-            echo "DEBUG Syncing $item_name from $source_ns to $destination_ns as $destination_name"
-            kubectl get $item_name -n $source_ns -o json | \
-            jq 'del(.metadata.namespace)' | \
-            jq 'del(.metadata.creationTimestamp)' | \
-            jq 'del(.metadata.resourceVersion)' | \
-            jq 'del(.metadata.selfLink)' | \
-            jq 'del(.metadata.uid)' | \
-            jq 'del(.metadata.ownerReferences)' | \
-            jq 'del(.metadata.name)' | \
-            jq '.metadata |= . + {"name":"'$destination_name'"}' | \
-            kubectl apply -n $destination_ns -f -
-            rc=$?
-            if [ $rc -ne 0 ]; then
-              echo "ERROR Failed to create secret $destination_name in $destination_ns namespace from $item_name in $source_ns namespace"
-            fi
-            return $rc
-          else
-            echo "ERROR $item_name not found in $source_ns namespace"
-            result=$(echo "$result" | sed -e 's/^/DEBUG /')
-            echo -e "DEBUG failed with\n\n$result"
-            return 1
-          fi
-        }
-        err=0
-        sync_item secret/nexus-admin-credential nexus nexus-admin-credential-argo argo
-        if [ $? -ne 0 ]; then
-          err=1
-        fi
-        echo $destination_name > /tmp/secret_name
-
-        # Retrieve the content of the current product manifest as json.
-        product_name={{inputs.parameters.global_params}} | jq -r '.product_manifest.current_product.name'
-        echo "DEBUG Fetching the product manifest for $product_name"
-        cat <<EOF > /tmp/current_product_manifest
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          err=1
-          echo "ERROR Failed to fetch product manifest for $product_name. Rerun with valid product tarball from 'process-media' stage"
-        fi
-
-        # Retrieve the product directory.
-        echo "DEBUG Retrieving the product directory for $product_name"
-        cat <<EOF > /tmp/product_directory
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          echo "ERROR Failed to fetch product directory for $product_name. Rerun with valid product tarball from 'process-media' stage"
-          err=1
-        fi
-
-        # Exit with an error if we had any.
-        exit $err
 ## nexus-helm-load-template ##
   - name: nexus-helm-load-template
     inputs:
@@ -295,31 +202,3 @@ spec:
       - name: product
         hostPath:
          path: "{{inputs.parameters.product_directory}}"
-## cleanup-template ##
-## Remove the secret created earlier.
-# TBD: This is a repeated function. Can this change to a reference?
-  - name: cleanup-template
-    inputs:
-      parameters:
-        - name: nexus_admin_credential_secret_name
-          value: "{{steps.nexus-get-prerequisites.outputs.parameters.secret_name}}"
-    tolerations:
-      - key: node-role.kubernetes.io/master
-        effect: NoSchedule
-      - key: node-role.kubernetes.io/control-plane
-        effect: NoSchedule
-    metadata:
-      annotations:
-        sidecar.istio.io/inject: "false"
-    script:
-      image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64
-      command: [bash]
-      source: |
-        nexus_secret_name={{inputs.parameters.nexus_admin_credential_secret_name}}
-        echo "DEBUG Deleting secret $nexus_secret_name from argo workspace"
-        result=$(kubectl -n argo delete secret/$nexus_secret_name 2>&1)
-        if [ $?
-ne 0 ]; then - result=$(echo "$result" | sed -e 's/^/DEBUG /') - echo "ERROR Deleting secret $nexus_secret_name failed in argo namespace" - echo -e "DEBUG failed with\n\n$result" - fi diff --git a/workflows/iuf/operations/nexus-setup/nexus-rpm-upload-template.yaml b/workflows/iuf/operations/nexus-setup/nexus-rpm-upload-template.yaml index 44cdc94a9023..3e857d7ab5e6 100644 --- a/workflows/iuf/operations/nexus-setup/nexus-rpm-upload-template.yaml +++ b/workflows/iuf/operations/nexus-setup/nexus-rpm-upload-template.yaml @@ -60,7 +60,9 @@ spec: name: workflow-template-record-time-template template: record-time-template - - name: nexus-get-prerequisites - template: nexus-get-prerequisites-template + templateRef: + name: nexus-get-prerequisites-template + template: nexus-get-prerequisites arguments: parameters: - name: global_params @@ -68,8 +70,10 @@ spec: - - name: nexus-rpm-load template: nexus-rpm-load-template hooks: - exit: - template: cleanup-template + exit: + templateRef: + name: cleanup-nexus-admin-credential-template + template: cleanup-nexus-admin-credential arguments: parameters: - name: nexus_admin_credential_secret_name @@ -140,102 +144,6 @@ spec: args: ["DIFF_TIME=$(expr {{inputs.parameters.opend}} - {{inputs.parameters.opstart}}); echo $DIFF_TIME; echo $DIFF_TIME > /tmp/diff_time.txt"] ### Templates ### -## nexus-get-prerequisites-template ## - - name: nexus-get-prerequisites-template - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane - effect: NoSchedule - metadata: - annotations: - sidecar.istio.io/inject: "false" - inputs: - parameters: - - name: global_params - outputs: - parameters: - - name: secret_name - valueFrom: - path: /tmp/secret_name - - name: current_product_manifest - valueFrom: - path: /tmp/current_product_manifest - - name: product_directory - valueFrom: - path: /tmp/product_directory - retryStrategy: - limit: "2" - retryPolicy: "Always" - backoff: - duration: "10s" # Must be a string. Default unit is seconds. Could also be a Duration, e.g.: "2m", "6h", "1d" - factor: "2" - maxDuration: "1m" - script: - # TBD: This is a repeated function. Can this change to a reference? - image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64 - command: [bash] - source: | - function sync_item() { - item_name="$1" - source_ns="$2" - destination_name="$3-$RANDOM" - destination_ns="$4" - result=$(kubectl get $item_name -n $source_ns 2>&1) - if [ $? -eq 0 ]; then - echo "DEBUG Syncing $item_name from $source_ns to $destination_ns as $destination_name" - kubectl get $item_name -n $source_ns -o json | \ - jq 'del(.metadata.namespace)' | \ - jq 'del(.metadata.creationTimestamp)' | \ - jq 'del(.metadata.resourceVersion)' | \ - jq 'del(.metadata.selfLink)' | \ - jq 'del(.metadata.uid)' | \ - jq 'del(.metadata.ownerReferences)' | \ - jq 'del(.metadata.name)' | \ - jq '.metadata |= . + {"name":"'$destination_name'"}' | \ - kubectl apply -n $destination_ns -f - - rc=$? - if [ $rc -ne 0 ]; then - echo "ERROR Failed to create secret $destination_name in $destination_ns namespace from $item_name in $source_ns namespace" - fi - return $rc - else - echo "ERROR $item_name not found in $source_ns namespace" - result=$(echo "$result" | sed -e 's/^/DEBUG /') - echo -e "DEBUG failed with\n\n$result" - return 1 - fi - } - err=0 - sync_item secret/nexus-admin-credential nexus nexus-admin-credential-argo argo - if [ $? 
-ne 0 ]; then
-          err=1
-        fi
-        echo $destination_name > /tmp/secret_name
-
-        # Retrieve the content of the current product manifest as json.
-        product_name={{inputs.parameters.global_params}} | jq -r '.product_manifest.current_product.name'
-        echo "DEBUG Fetching the product manifest for $product_name"
-        cat <<EOF > /tmp/current_product_manifest
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          err=1
-          echo "ERROR Failed to fetch product manifest for $product_name. Rerun with valid product tarball from 'process-media' stage"
-        fi
-
-        # Retrieve the product directory.
-        echo "DEBUG Retrieving the product directory for $product_name"
-        cat <<EOF > /tmp/product_directory
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          echo "ERROR Failed to fetch product directory for $product_name. Rerun with valid product tarball from 'process-media' stage"
-          err=1
-        fi
-
-        # Exit with an error if we had any.
-        exit $err
 ## nexus-rpm-load-template ##
   - name: nexus-rpm-load-template
     inputs:
@@ -277,30 +185,3 @@ spec:
       - name: product
         hostPath:
           path: "{{inputs.parameters.product_directory}}"
-## cleanup-template ##
-## Remove the secret created earlier.
-# TBD: This is a repeated function. Can this change to a reference?
-  - name: cleanup-template
-    inputs:
-      parameters:
-        - name: nexus_admin_credential_secret_name
-    tolerations:
-      - key: node-role.kubernetes.io/master
-        effect: NoSchedule
-      - key: node-role.kubernetes.io/control-plane
-        effect: NoSchedule
-    metadata:
-      annotations:
-        sidecar.istio.io/inject: "false"
-    script:
-      image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64
-      command: [bash]
-      source: |
-        nexus_secret_name={{inputs.parameters.nexus_admin_credential_secret_name}}
-        echo "DEBUG Deleting secret $nexus_secret_name from argo workspace"
-        result=$(kubectl -n argo delete secret/$nexus_secret_name 2>&1)
-        if [ $?
-ne 0 ]; then - result=$(echo "$result" | sed -e 's/^/DEBUG /') - echo "ERROR Deleting secret $nexus_secret_name failed in argo namespace" - echo -e "DEBUG failed with\n\n$result" - fi diff --git a/workflows/iuf/operations/nexus-setup/nexus-setup-template.yaml b/workflows/iuf/operations/nexus-setup/nexus-setup-template.yaml index 8c828f6598b0..5340c25d2981 100644 --- a/workflows/iuf/operations/nexus-setup/nexus-setup-template.yaml +++ b/workflows/iuf/operations/nexus-setup/nexus-setup-template.yaml @@ -60,7 +60,9 @@ spec: name: workflow-template-record-time-template template: record-time-template - - name: nexus-get-prerequisites - template: nexus-get-prerequisites-template + templateRef: + name: nexus-get-prerequisites-template + template: nexus-get-prerequisites arguments: parameters: - name: global_params @@ -69,7 +71,9 @@ spec: template: nexus-setup-template hooks: exit: - template: cleanup-template + templateRef: + name: cleanup-nexus-admin-credential-template + template: cleanup-nexus-admin-credential arguments: parameters: - name: nexus_admin_credential_secret_name @@ -153,102 +157,6 @@ spec: args: ["DIFF_TIME=$(expr {{inputs.parameters.opend}} - {{inputs.parameters.opstart}}); echo $DIFF_TIME; echo $DIFF_TIME > /tmp/diff_time.txt"] ### Templates ### -## nexus-get-prerequisites-template ## - - name: nexus-get-prerequisites-template - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane - effect: NoSchedule - metadata: - annotations: - sidecar.istio.io/inject: "false" - inputs: - parameters: - - name: global_params - outputs: - parameters: - - name: secret_name - valueFrom: - path: /tmp/secret_name - - name: current_product_manifest - valueFrom: - path: /tmp/current_product_manifest - - name: product_directory - valueFrom: - path: /tmp/product_directory - retryStrategy: - limit: "2" - retryPolicy: "Always" - backoff: - duration: "10s" # Must be a string. Default unit is seconds. Could also be a Duration, e.g.: "2m", "6h", "1d" - factor: "2" - maxDuration: "1m" - script: - # TBD: This is a repeated function. Can this change to a reference? - image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64 - command: [bash] - source: | - function sync_item() { - item_name="$1" - source_ns="$2" - destination_name="$3-$RANDOM" - destination_ns="$4" - result=$(kubectl get $item_name -n $source_ns 2>&1) - if [ $? -eq 0 ]; then - echo "DEBUG Syncing $item_name from $source_ns to $destination_ns as $destination_name" - kubectl get $item_name -n $source_ns -o json | \ - jq 'del(.metadata.namespace)' | \ - jq 'del(.metadata.creationTimestamp)' | \ - jq 'del(.metadata.resourceVersion)' | \ - jq 'del(.metadata.selfLink)' | \ - jq 'del(.metadata.uid)' | \ - jq 'del(.metadata.ownerReferences)' | \ - jq 'del(.metadata.name)' | \ - jq '.metadata |= . + {"name":"'$destination_name'"}' | \ - kubectl apply -n $destination_ns -f - - rc=$? - if [ $rc -ne 0 ]; then - echo "ERROR Failed to create secret $destination_name in $destination_ns namespace from $item_name in $source_ns namespace" - fi - return $rc - else - echo "ERROR $item_name not found in $source_ns namespace" - result=$(echo "$result" | sed -e 's/^/DEBUG /') - echo -e "DEBUG failed with\n\n$result" - return 1 - fi - } - err=0 - sync_item secret/nexus-admin-credential nexus nexus-admin-credential-argo argo - if [ $? 
-ne 0 ]; then
-          err=1
-        fi
-        echo $destination_name > /tmp/secret_name
-
-        # Retrieve the content of the current product manifest as json.
-        product_name={{inputs.parameters.global_params}} | jq -r '.product_manifest.current_product.name'
-        echo "DEBUG Fetching the product manifest for $product_name"
-        cat <<EOF > /tmp/current_product_manifest
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          err=1
-          echo "ERROR Failed to fetch product manifest for $product_name. Rerun with valid product tarball from 'process-media' stage"
-        fi
-
-        # Retrieve the product directory.
-        echo "DEBUG Retrieving the product directory for $product_name"
-        cat <<EOF > /tmp/product_directory
-        {{inputs.parameters.global_params}}
-        EOF
-        if [ $? -ne 0 ]; then
-          echo "ERROR Failed to fetch product directory for $product_name. Rerun with valid product tarball from 'process-media' stage"
-          err=1
-        fi
-
-        # Exit with an error if we had any.
-        exit $err
 ## nexus-setup-template ##
   - name: nexus-setup-template
     inputs:
@@ -296,29 +204,3 @@ spec:
       - name: products
         hostPath:
           path: "{{inputs.parameters.product_directory}}"
-## cleanup-template ##
-## Remove the secret created earlier.
-  - name: cleanup-template
-    inputs:
-      parameters:
-        - name: nexus_admin_credential_secret_name
-    tolerations:
-      - key: node-role.kubernetes.io/master
-        effect: NoSchedule
-      - key: node-role.kubernetes.io/control-plane
-        effect: NoSchedule
-    metadata:
-      annotations:
-        sidecar.istio.io/inject: "false"
-    script:
-      image: artifactory.algol60.net/csm-docker/stable/docker.io/portainer/kubectl-shell:latest-v1.21.1-amd64
-      command: [bash]
-      source: |
-        nexus_secret_name={{inputs.parameters.nexus_admin_credential_secret_name}}
-        echo "DEBUG Deleting secret $nexus_secret_name from argo workspace"
-        result=$(kubectl -n argo delete secret/$nexus_secret_name 2>&1)
-        if [ $? -ne 0 ]; then
-          result=$(echo "$result" | sed -e 's/^/DEBUG /')
-          echo "ERROR Deleting secret $nexus_secret_name failed in argo namespace"
-          echo -e "DEBUG failed with\n\n$result"
-        fi
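
As a quick post-apply check (not part of the patch, and assuming `kubectl` access to a cluster where the Argo CRDs are installed), the two shared templates introduced above should appear as `WorkflowTemplate` resources in the `argo` namespace:

```bash
# Both template names come from the manifests in this patch.
kubectl get workflowtemplates -n argo \
  | grep -E 'nexus-get-prerequisites-template|cleanup-nexus-admin-credential-template'
```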