From 27012fd71d66521739da2fe69e334034ced4fcb4 Mon Sep 17 00:00:00 2001 From: jlashner Date: Tue, 24 Sep 2024 09:55:36 -0700 Subject: [PATCH 1/3] Clear data script --- scripts/clear_old_data.py | 179 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 scripts/clear_old_data.py diff --git a/scripts/clear_old_data.py b/scripts/clear_old_data.py new file mode 100644 index 00000000..cf59638f --- /dev/null +++ b/scripts/clear_old_data.py @@ -0,0 +1,179 @@ +from dataclasses import dataclass +import os +import datetime +from typing import List, Optional +from enum import Enum, auto +import yaml +import shutil +import argparse + + +@dataclass +class Config: + dry: bool = True + delete_smurf_data_after_days: int = 31 + delete_timestream_data_after_days: int = 365 // 2 + delete_core_dumps_after_days: int = 365 + delete_logs_after_days: int = 365 + verbose: bool = True + + @classmethod + def from_yaml(cls, path) -> "Config": + with open(path, 'r') as f: + return cls(**yaml.safe_load(f)) + + @classmethod + def from_args(cls, args_list: Optional[List[str]]=None) -> "Config": + parser = argparse.ArgumentParser() + parser.add_argument('--dry', action='store_true') + parser.add_argument('--verbose', action='store_true') + args = parser.parse_args(args_list) + return cls( + verbose=args.verbose, + dry=args.dry + ) + + +class FileType(Enum): + SmurfData = auto() + TimestreamData = auto() + LogData = auto() + CoreDump = auto() + +@dataclass +class FileInfo: + path: str + dt: datetime.datetime + file_type: FileType + + +def create_smurf_date_dir(cfg: Config, path: str) -> Optional[FileInfo]: + dirname = os.path.split(path)[1] + try: + year = int(dirname[:4]) + month = int(dirname[4:6]) + day = int(dirname[6:]) + dt = datetime.datetime(year=year, month=month, day=day) + except Exception: + if cfg.verbose: + print(f"Could not parse datetime: {dirname}") + return None + return FileInfo(path=path, dt=dt, file_type=FileType.SmurfData) + + +def scan_smurf_data(cfg: Config) -> List[FileInfo]: + date_dirs: List[FileInfo] = [] + base_dir = '/data/smurf_data' + now = datetime.datetime.now() + max_time_delta = datetime.timedelta(days=cfg.delete_smurf_data_after_days) + for d in os.listdir(base_dir): + result = create_smurf_date_dir(cfg, os.path.join(base_dir, d)) + if result is not None: + if now - result.dt > max_time_delta: + date_dirs.append(result) + + return sorted(date_dirs, key=lambda f:f.dt) + + +def scan_timestream_dirs(cfg: Config) -> List[FileInfo]: + timestream_dirs: List[FileInfo] = [] + base_dir = '/data/so/timestreams' + now = datetime.datetime.now() + max_time_delta = datetime.timedelta(days=cfg.delete_timestream_data_after_days) + for d in os.listdir(base_dir): # d is 5-digit ctime code + path = os.path.join(base_dir, d) + try: + # Give one day buffer for timezone effects, etc. + timestamp = (int(d) + 1)* 1e5 + dt = datetime.datetime.fromtimestamp(timestamp) + except ValueError: + if cfg.verbose: + print(f"Could not parse datetime: {path}") + continue + file = FileInfo( + path=path, + dt=dt, + file_type=FileType.TimestreamData + ) + if now - file.dt > max_time_delta: + timestream_dirs.append(file) + return sorted(timestream_dirs, key=lambda f:f.dt) + +def scan_core_dumps(cfg: Config) -> List[FileInfo]: + core_dump_dir = '/data/cores' + files: List[FileInfo] = [] + now = datetime.datetime.now() + max_time_delta = datetime.timedelta( + days=cfg.delete_core_dumps_after_days + ) + for f in os.listdir(core_dump_dir): + path = os.path.join(core_dump_dir, f) + ts = os.path.getctime(path) + dt = datetime.datetime.fromtimestamp(ts) + if now - dt > max_time_delta: + files.append(FileInfo( + path=path, dt=dt, file_type=FileType.CoreDump + )) + return files + +def scan_log_dirs(cfg: Config) -> List[FileInfo]: + log_dir = '/data/logs' + files: List[FileInfo] = [] + now = datetime.datetime.now() + max_time_delta = datetime.timedelta( + days=cfg.delete_logs_after_days + ) + for f in os.listdir(log_dir): + path = os.path.join(log_dir, f) + ts = os.path.getmtime(path) + dt = datetime.datetime.fromtimestamp(ts) + if now - dt > max_time_delta: + files.append(FileInfo( + path=path, dt=dt, file_type=FileType.CoreDump + )) + return files + + +def remove_file(cfg: Config, file: FileInfo): + if cfg.dry: + if os.path.isdir(file.path): + print(f"dry mode: rm -rf {file.path}") + else: + print(f"rm {file.path}") + else: + if os.path.isdir(file.path): + shutil.rmtree(file.path) + else: + os.remove(file.path) + + +def main(cfg: Config) -> None: + now = datetime.datetime.now() + files_to_delete: List[FileInfo] = [] + files_to_delete += scan_smurf_data(cfg) + files_to_delete += scan_timestream_dirs(cfg) + files_to_delete += scan_core_dumps(cfg) + files_to_delete += scan_log_dirs(cfg) + + print(f"{len(files_to_delete)} files to delete:") + for f in files_to_delete: + days_old = (now - f.dt).days + print(f' - {f.path} ({days_old} days old)') + + if len(files_to_delete) == 0: + print("No files to delete") + return + + resp = input("Proceed with deletion? [y/n] ") + if resp.strip().lower() != 'y': + print("Not proceed with deletion") + return + + print("Deleting files") + for f in files_to_delete: + remove_file(cfg, f) + +if __name__ == '__main__': + cfg = Config.from_args() + main(cfg) + From bf760b22cc7efd8fb6b1106182b0a436b234f7b8 Mon Sep 17 00:00:00 2001 From: jlashner Date: Tue, 1 Oct 2024 13:33:42 -0700 Subject: [PATCH 2/3] Adds more docs, and --config-file option to argparse --- scripts/clear_old_data.py | 59 ++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/scripts/clear_old_data.py b/scripts/clear_old_data.py index cf59638f..2b69198f 100644 --- a/scripts/clear_old_data.py +++ b/scripts/clear_old_data.py @@ -1,3 +1,17 @@ +""" +Script for deleting old data on the smurf-server. + +Configuration parameters can be set via a config file: +``` +python3 clear_old_data.py --config-file clear_data_cfg.yaml +``` +where the entries in the config file map onto the Config class below. + +Specific configuration settings can be set directly from the command line, such as: +``` +python3 clear_old_data.py --dry +``` +""" from dataclasses import dataclass import os import datetime @@ -10,28 +24,59 @@ @dataclass class Config: - dry: bool = True + """ + Configuration object to control the behavior of the data deletion script. + + Args + ----- + dry: bool + If true, will do a dry-run of the data-deletion without deleting any + files. Logs will be printed for all of the files that would be deleted. + verbose: bool + If true, logs will be more verbose. + delete_smurf_data_after_days: int + Days after which smurf data will be deleted. + delete_timestream_data_after_days: int + Days after which timestream data will be deleted. + delete_core_dumps_after_days: int + Days after which core-dumps will be deleted. + delete_logs_after_days: int + Days after which log directories will be deleted. + """ + dry: bool = False + verbose: bool = False delete_smurf_data_after_days: int = 31 delete_timestream_data_after_days: int = 365 // 2 delete_core_dumps_after_days: int = 365 - delete_logs_after_days: int = 365 - verbose: bool = True + delete_logs_after_days: int = 365 * 5 @classmethod def from_yaml(cls, path) -> "Config": + """ + Creates a Config object based on a yaml file. Key names in the file must + match config dataclass fields. + """ with open(path, 'r') as f: return cls(**yaml.safe_load(f)) @classmethod def from_args(cls, args_list: Optional[List[str]]=None) -> "Config": parser = argparse.ArgumentParser() + parser.add_argument('--config-file', type=str, default=None) parser.add_argument('--dry', action='store_true') parser.add_argument('--verbose', action='store_true') args = parser.parse_args(args_list) - return cls( - verbose=args.verbose, - dry=args.dry - ) + + if args.config_file: + cfg = cls.from_yaml(args.config_file) + else: + cfg = cls() + + if args.dry: + cfg.dry = args.dry + if args.verbose: + cfg.verbose = args.verbose + return cfg class FileType(Enum): From 9943d807243dd679f6b48173422459c38f5b5797 Mon Sep 17 00:00:00 2001 From: simonscryo Date: Wed, 2 Oct 2024 14:47:58 -0700 Subject: [PATCH 3/3] Adds logging functionality and log_file option --- scripts/clear_old_data.py | 77 ++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/scripts/clear_old_data.py b/scripts/clear_old_data.py index 2b69198f..d4efb6d3 100644 --- a/scripts/clear_old_data.py +++ b/scripts/clear_old_data.py @@ -15,12 +15,24 @@ from dataclasses import dataclass import os import datetime -from typing import List, Optional +from typing import List, Optional, Dict, Any from enum import Enum, auto import yaml import shutil import argparse +import logging +from copy import deepcopy +logger = logging.getLogger() + +_nameToLogLevel = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARN': logging.WARNING, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL, +} @dataclass class Config: @@ -44,11 +56,21 @@ class Config: Days after which log directories will be deleted. """ dry: bool = False - verbose: bool = False delete_smurf_data_after_days: int = 31 delete_timestream_data_after_days: int = 365 // 2 delete_core_dumps_after_days: int = 365 delete_logs_after_days: int = 365 * 5 + log_level: int = logging.INFO + log_file: Optional[str] = '/data/logs/clear_data.log' + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Config": + _data = deepcopy(data) + if 'log_level' in data: + if isinstance(data['log_level'], str): + _data['log_level'] = _nameToLogLevel[data['log_level'].upper()] + return cls(**_data) + @classmethod def from_yaml(cls, path) -> "Config": @@ -57,14 +79,13 @@ def from_yaml(cls, path) -> "Config": match config dataclass fields. """ with open(path, 'r') as f: - return cls(**yaml.safe_load(f)) + return cls.from_dict(yaml.safe_load(f)) @classmethod def from_args(cls, args_list: Optional[List[str]]=None) -> "Config": parser = argparse.ArgumentParser() parser.add_argument('--config-file', type=str, default=None) parser.add_argument('--dry', action='store_true') - parser.add_argument('--verbose', action='store_true') args = parser.parse_args(args_list) if args.config_file: @@ -74,10 +95,28 @@ def from_args(cls, args_list: Optional[List[str]]=None) -> "Config": if args.dry: cfg.dry = args.dry - if args.verbose: - cfg.verbose = args.verbose return cfg +def setup_logger(cfg: Config) -> None: + logger.setLevel(cfg.log_level) + if len(logger.handlers) > 0: + logger.error("Logger has already been configured! Doing nothing") + return + + formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') + + # create console handler and set level to debug + ch = logging.StreamHandler() + ch.setLevel(cfg.log_level) + ch.setFormatter(formatter) + logger.addHandler(ch) + + if cfg.log_file is not None: + file_handler = logging.FileHandler(cfg.log_file) + file_handler.setLevel(cfg.log_level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + class FileType(Enum): SmurfData = auto() @@ -100,8 +139,7 @@ def create_smurf_date_dir(cfg: Config, path: str) -> Optional[FileInfo]: day = int(dirname[6:]) dt = datetime.datetime(year=year, month=month, day=day) except Exception: - if cfg.verbose: - print(f"Could not parse datetime: {dirname}") + logger.debug(f"Could not parse datetime: {dirname}") return None return FileInfo(path=path, dt=dt, file_type=FileType.SmurfData) @@ -132,8 +170,7 @@ def scan_timestream_dirs(cfg: Config) -> List[FileInfo]: timestamp = (int(d) + 1)* 1e5 dt = datetime.datetime.fromtimestamp(timestamp) except ValueError: - if cfg.verbose: - print(f"Could not parse datetime: {path}") + logger.debug(f"Could not parse datetime: {path}") continue file = FileInfo( path=path, @@ -182,9 +219,9 @@ def scan_log_dirs(cfg: Config) -> List[FileInfo]: def remove_file(cfg: Config, file: FileInfo): if cfg.dry: if os.path.isdir(file.path): - print(f"dry mode: rm -rf {file.path}") + logger.info(f"dry mode: rm -rf {file.path}") else: - print(f"rm {file.path}") + logger.info(f"rm {file.path}") else: if os.path.isdir(file.path): shutil.rmtree(file.path) @@ -193,6 +230,10 @@ def remove_file(cfg: Config, file: FileInfo): def main(cfg: Config) -> None: + setup_logger(cfg) + logger.info('-'*80) + + logger.info(cfg) now = datetime.datetime.now() files_to_delete: List[FileInfo] = [] files_to_delete += scan_smurf_data(cfg) @@ -200,21 +241,21 @@ def main(cfg: Config) -> None: files_to_delete += scan_core_dumps(cfg) files_to_delete += scan_log_dirs(cfg) - print(f"{len(files_to_delete)} files to delete:") + logger.info(f"{len(files_to_delete)} files to delete:") for f in files_to_delete: days_old = (now - f.dt).days - print(f' - {f.path} ({days_old} days old)') + logger.info(f' - {f.path} ({days_old} days old)') if len(files_to_delete) == 0: - print("No files to delete") + logger.info("No files to delete") return - resp = input("Proceed with deletion? [y/n] ") + resp = input(f"Proceed with deletion (dry={cfg.dry})? [y/n] ") if resp.strip().lower() != 'y': - print("Not proceed with deletion") + logger.info("Not proceed with deletion") return - print("Deleting files") + logger.info("Deleting files") for f in files_to_delete: remove_file(cfg, f)