Skip to content

Commit

Permalink
draft
Browse files Browse the repository at this point in the history
  • Loading branch information
mharding-hpe committed Aug 29, 2024
1 parent 6aaa6ff commit a1846f9
Show file tree
Hide file tree
Showing 5 changed files with 364 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Changed
- Improve server code that validates incoming data
- Sanitize session templates during migration to this BOS version, to ensure they comply with the API specification
- Do not delete the migration job after it completes; instead, set a TTL value for it, to allow time for its logs to be
  collected.

### Fixed
- Added missing required Python modules to `requirements.txt`
Expand Down
18 changes: 17 additions & 1 deletion kubernetes/cray-bos/templates/post-upgrade-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ metadata:
annotations:
"helm.sh/hook": post-upgrade
"helm.sh/hook-weight": "-1"
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation


spec:
ttlSecondsAfterFinished: 86400
template:
metadata:
name: "{{ .Release.Name }}"
Expand Down Expand Up @@ -67,6 +67,22 @@ spec:
env:
- name: APP_VERSION
value: 0.0.0-app-version
- name: S3_CREDENTIALS
value: "bos-s3-credentials"
- name: S3_PROTOCOL
value: "https"
- name: S3_GATEWAY
value: "rgw-vip"
- name: S3_ACCESS_KEY
valueFrom:
secretKeyRef:
name: "bos-s3-credentials"
key: access_key
- name: S3_SECRET_KEY
valueFrom:
secretKeyRef:
name: "bos-s3-credentials"
key: secret_key
{{ range (index .Values "cray-service" "containers" "cray-bos" "ports") -}}
{{if eq .name "http" }}
- name: BOS_CONTAINER_PORT
Expand Down
145 changes: 145 additions & 0 deletions src/bos/server/backup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#
# MIT License
#
# (C) Copyright 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

from contextlib import contextmanager
import datetime
import hashlib
import json
import logging
import os
import tarfile
import tempfile
import time

from bos.operators.utils.clients.s3 import s3_client
import bos.server.redis_db_utils as dbutils

LOGGER = logging.getLogger('bos.server.backup')
BOS_S3_BUCKET = 'bos-data'

# https://dev.to/teckert/changing-directory-with-a-python-context-manager-2bj8
@contextmanager
def set_directory(path: str):
    """
    Context manager that temporarily changes the process working directory.

    Args:
        path: Directory to switch into for the duration of the context.

    Yields:
        None. On exit (normal or via exception) the original working
        directory is restored.
    """
    original_dir = os.getcwd()
    try:
        os.chdir(path)
        yield
    finally:
        # Always restore the starting directory, even if chdir or the body failed
        os.chdir(original_dir)


def remove_file(filepath):
    """Best-effort deletion of a file: failures are logged as warnings, never raised."""
    try:
        os.unlink(filepath)
    except OSError as err:
        LOGGER.warning("Error deleting temporary file '%s': %s", filepath, err)


def remove_dir(dirpath):
    """Best-effort removal of an (empty) directory: failures are logged, never raised."""
    try:
        os.rmdir(dirpath)
    except OSError as err:
        LOGGER.warning("Error deleting temporary directory '%s': %s", dirpath, err)


def create_tarfile(bos_data: dict, bos_data_basename: str, tmpdir_path: str) -> str:
    """
    Serialize bos_data to JSON and package it into a gzipped tar archive.

    Args:
        bos_data: Mapping of BOS database contents to back up.
        bos_data_basename: Base name (no extension) for the JSON and tar files.
        tmpdir_path: Existing directory in which both files are created.

    Returns:
        The name (not the full path) of the <basename>.tar.gz file created
        inside tmpdir_path. The intermediate JSON file is removed before
        returning.
    """
    jsonfile_name = f"{bos_data_basename}.json"
    jsonfile_path = os.path.join(tmpdir_path, jsonfile_name)
    LOGGER.debug("Writing BOS data as JSON data to '%s'", jsonfile_path)
    # Specify the encoding explicitly instead of relying on the locale default
    with open(jsonfile_path, "wt", encoding="utf-8") as outfile:
        json.dump(bos_data, outfile)
    tarfile_name = f"{bos_data_basename}.tar.gz"
    tarfile_path = os.path.join(tmpdir_path, tarfile_name)
    LOGGER.debug("Creating compressed tar archive of BOS data: '%s'", tarfile_path)
    # Use arcname to store the member under its bare name rather than changing
    # the process-wide working directory (cwd changes are not thread-safe)
    with tarfile.open(tarfile_path, mode='w:gz') as tfile:
        tfile.add(jsonfile_path, arcname=jsonfile_name)
    remove_file(jsonfile_path)
    return tarfile_name


def md5sum(filepath: str):
    """
    Return the hex md5 digest of a file, read in 4 KiB chunks.
    Borrowed from ims-python-helper
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as infile:
        while True:
            chunk = infile.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def upload_to_s3(filename: str, filedir: str) -> None:
    """
    Upload filedir/filename to the BOS S3 bucket, creating the bucket if needed.

    The upload is retried with a linearly-increasing backoff (sleep time equal
    to the attempt number, in seconds) for up to max_attempts attempts before
    giving up and re-raising the final error.

    Args:
        filename: Name of the file to upload; also used as the S3 object key.
        filedir: Directory containing the file.

    Raises:
        Whatever the S3 client raises on bucket creation failure, or on the
        final failed upload attempt.
    """
    max_attempts = 60
    s3_key = filename
    s3_path = f"s3://{BOS_S3_BUCKET}/{s3_key}"
    filepath = os.path.join(filedir, filename)
    # Record an md5 checksum in the object metadata so the backup's integrity
    # can be verified later
    extra_args = {'Metadata': {'md5sum': md5sum(filepath)}}
    attempt = 0
    client = s3_client()
    LOGGER.info("Creating/retrieving S3 bucket '%s'", BOS_S3_BUCKET)
    try:
        response = client.create_bucket(ACL='authenticated-read', Bucket=BOS_S3_BUCKET)
    except Exception as err:  # pylint: disable=broad-except
        LOGGER.error("Error creating S3 bucket '%s': %s", BOS_S3_BUCKET, err)
        raise
    LOGGER.debug("create_bucket response = %s", response)
    LOGGER.info("Uploading %s", s3_path)
    while True:
        attempt += 1
        try:
            client.upload_file(filepath, BOS_S3_BUCKET, s3_key, ExtraArgs=extra_args)
            break
        except Exception as err:  # pylint: disable=broad-except
            if attempt <= max_attempts:
                LOGGER.warning("Error uploading %s: %s; Re-attempting in %s seconds...",
                               s3_path, err, attempt)
                time.sleep(attempt)
                continue
            LOGGER.error("Error uploading %s: %s; Giving up", s3_path, err)
            raise
    LOGGER.debug("%s upload completed successfully", s3_path)


def backup_bos_data(label: str):
    """
    Dump the contents of every BOS database to a compressed tar archive and
    upload it to S3.

    Args:
        label: Context string (e.g. "pre-migration") embedded in the archive
               name and in the log messages.
    """
    LOGGER.info("Performing backup of BOS data (context: %s)", label)
    bos_data = {}
    for dblabel in dbutils.DATABASES:
        bos_data[dblabel] = dbutils.get_wrapper(db=dblabel).get_all_as_dict()
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S.%f")
    basename = f"bos_data_{label}_{timestamp}"
    workdir = tempfile.mkdtemp(prefix=f"{basename}-", dir="/tmp")
    archive_name = create_tarfile(bos_data, basename, workdir)
    upload_to_s3(archive_name, workdir)
    # Clean up the local copy once the upload has succeeded
    remove_file(os.path.join(workdir, archive_name))
    remove_dir(workdir)
    LOGGER.debug("BOS data backup completed (context: %s)", label)
183 changes: 179 additions & 4 deletions src/bos/server/migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,192 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

import copy
import logging
import os
import pkgutil

from bos.common.tenant_utils import get_tenant_aware_key
from bos.common.utils import exc_type_msg
from bos.server.backup import backup_bos_data
from bos.server.controllers.v2.components import del_timestamp, get_v2_components_data
import bos.server.controllers.v2.options as options
from bos.server.controllers.v2.sessiontemplates import validate_sanitize_session_template
import bos.server.redis_db_utils as dbutils
from bos.server.schema import validator
from bos.server.utils import ParsingException

LOGGER = logging.getLogger('bos.server.migration')


def create_sanitized_session_template(data):
    """
    Validate and sanitize a single session template record.

    If there are any problems with the template, raise a ParsingException.
    Otherwise, return a tuple of (session template DB key, sanitized template).
    I cannot envision a scenario where we end up with a session template in the database where
    the key doesn't line up with the name and tenant, but it does no harm to make sure
    """
    try:
        st_name = data["name"]
    except KeyError as exc:
        # In the unlikely event that this template has no name field, we don't want to keep it
        raise ParsingException("Missing required 'name' field") from exc

    sanitized = copy.deepcopy(data)
    try:
        # Validate that the session template follows the API schema, and sanitize it
        validate_sanitize_session_template(st_name, sanitized)
        validator.validate_session_template(sanitized)
    except Exception as exc:
        raise ParsingException(f"Validation failure: {exc_type_msg(exc)}") from exc

    db_key = get_tenant_aware_key(st_name, sanitized.get("tenant", None)).encode()
    return db_key, sanitized


def sanitize_session_templates():
    """
    Validate every session template in the database, fixing or removing any
    that do not conform to the API specification.

    Templates that cannot be parsed/validated at all are deleted. Templates
    whose contents change under sanitization are updated in place. Templates
    whose stored DB key does not match the key derived from their name and
    tenant are deleted, since such records cannot be looked up correctly.
    """
    LOGGER.info("Sanitizing session templates")
    db = dbutils.get_wrapper(db='session_templates')
    response = db.get_keys()
    for st_key in response:
        data = db.get(st_key)
        try:
            new_key, new_data = create_sanitized_session_template(data)
        except ParsingException as exc:
            LOGGER.warning("Deleting session template (reason: %s): %s", exc, data)
            # Actually perform the deletion that the warning above reports
            db.delete(st_key)
            continue

        if new_key == st_key and new_data == data:
            # No changes for this template
            continue

        # Either the key has changed, the data has changed, or both

        # If the template data has been modified but not the key, just update it
        if new_key == st_key:
            LOGGER.warning("Modifying session template. Before: %s After: %s", data, new_data)
            db.put(st_key, new_data)
            continue

        # This means that the DB key changed.
        # I don't anticipate this happening, but better to be sure that our keys are correct,
        # since in the past our patching code did not have a lot of safeguards.
        # Essentially this means that the name and/or tenant inside the template record generate
        # a hash key that does not match the one we are using. This is not good.
        LOGGER.warning("Deleting session template. Reason: db key should be '%s' but actually is "
                       "'%s'. Template = %s", new_key, st_key, data)
        db.delete(st_key)
    LOGGER.info("Done sanitizing session templates")


def sanitize_options():
    """
    Remove any options from the database that are unknown or that fail schema
    validation, writing back the cleaned-up option set if anything changed.
    """
    LOGGER.info("Sanitizing options")
    db = dbutils.get_wrapper(db='options')
    options_data = db.get(options.OPTIONS_KEY)
    new_options_data = {}
    for opt_name, opt_value in options_data.items():
        if opt_name not in options.DEFAULTS:
            LOGGER.warning("Removing unknown option '%s' with value '%s'", opt_name, opt_value)
            continue
        try:
            # Validate each option individually, so that one bad option does
            # not invalidate the rest
            validator.validate_options({ opt_name: opt_value })
        except Exception as exc:  # pylint: disable=broad-except
            LOGGER.warning("Deleting option '%s' with value '%s'; reason: %s",
                           opt_name, opt_value, exc)
            continue
        if opt_name == 'logging_level':
            # Apply the validated logging level to this migration job itself
            set_log_level(opt_value)
        new_options_data[opt_name] = opt_value
    if options_data != new_options_data:
        LOGGER.info("Updating options. Old: %s; new: %s", options_data, new_options_data)
        # Persist the sanitized options (otherwise the cleanup reported above
        # would never take effect)
        db.put(options.OPTIONS_KEY, new_options_data)
    LOGGER.info("Done sanitizing options")


def sanitize_sessions():
    """
    Validate all sessions and session statuses in the database.

    Sessions that fail validation, or whose stored DB key does not match the
    key derived from their name and tenant, are deleted along with their
    status records. Invalid status records for otherwise-valid sessions are
    logged but left in place.
    """
    LOGGER.info("Sanitizing sessions")
    db = dbutils.get_wrapper(db='sessions')
    statusdb = dbutils.get_wrapper(db='session_status')
    response = db.get_keys()
    for st_key in response:
        data = db.get(st_key)
        try:
            validator.validate_session(data)
        except Exception as exc:  # pylint: disable=broad-except
            LOGGER.warning("Deleting session (reason: %s): %s", exc, data)
            db.delete(st_key)
            if st_key in statusdb:
                LOGGER.warning("Deleting session status %s", st_key)
                statusdb.delete(st_key)
            continue

        new_key = get_tenant_aware_key(data['name'], data.get("tenant", None)).encode()
        if new_key != st_key:
            LOGGER.warning("Deleting session. Reason: db key should be '%s' but actually is "
                           "'%s'. Session = %s", new_key, st_key, data)
            db.delete(st_key)
            if st_key in statusdb:
                LOGGER.warning("Deleting session status %s", st_key)
                statusdb.delete(st_key)
            # The session is gone, so there is no status left to validate
            continue

        # Validate session status
        if st_key in statusdb:
            status_data = statusdb.get(st_key)
            try:
                validator.validate_extended_session_status(status_data)
            except Exception:  # pylint: disable=broad-except
                LOGGER.warning("key = %s, status_data = %s", st_key, status_data)
                LOGGER.exception("Error with session status")

    LOGGER.info("Done sanitizing sessions")


def sanitize_components():
    """
    Validate every component record, logging (but not modifying or deleting)
    any that fail schema validation.
    """
    LOGGER.info("Sanitizing components")
    for data in get_v2_components_data():
        # NOTE(review): presumably del_timestamp strips a server-generated
        # timestamp field that is not part of the stored schema -- confirm
        del_timestamp(data)
        try:
            validator.validate_component(data)
        except Exception:  # pylint: disable=broad-except
            LOGGER.warning("data = %s", data)
            LOGGER.exception("Error with component")
    LOGGER.info("Done sanitizing components")


def perform_migrations():
    """Run all data migration/sanitization steps for this BOS version."""
    # Sanitize options first, so that a valid logging_level option takes
    # effect for the remaining sanitization steps
    sanitize_options()
    sanitize_session_templates()
    sanitize_sessions()
    sanitize_components()


def set_log_level(new_level_str):
    """
    Set this module logger's level to the specified level name (e.g. "debug").

    Errors (including unrecognized level names) are logged as warnings, not
    raised.
    """
    # Initialize so the handler below cannot hit a NameError if .upper() raises
    new_level = None
    try:
        # For an unrecognized name, getLevelName returns the string
        # "Level <name>", which makes setLevel raise a ValueError caught below
        new_level = logging.getLevelName(new_level_str.upper())
        LOGGER.setLevel(new_level)
    except Exception as exc:  # pylint: disable=broad-except
        # Use %s rather than %d here: new_level may be a string (or None)
        # rather than an int when this path is reached
        LOGGER.warning("Error setting log level to '%s' (%s): %s", new_level_str, new_level,
                       exc_type_msg(exc))


def update_log_level():
    """Refresh the module logger level from the 'logging_level' option in the options DB."""
    try:
        opts = options.get_v2_options_data()
        set_log_level(opts['logging_level'])
    except Exception as exc:  # any failure here is non-fatal; just report it
        LOGGER.warning("Error checking log level: %s", exc_type_msg(exc))


if __name__ == "__main__":
    # Start with the log level from the environment (default INFO), then
    # refine it from the options database once that is readable
    starting_log_level = os.environ.get('BOS_LOG_LEVEL', 'INFO')
    log_level = logging.getLevelName(starting_log_level.upper())
    logging.basicConfig(level=log_level)

    update_log_level()
    # Back up both before and after migrating, so the pre-migration data can
    # be recovered if the migration mangles anything
    backup_bos_data("pre-migration")
    perform_migrations()
    backup_bos_data("post-migration")
Loading

0 comments on commit a1846f9

Please sign in to comment.