-
Notifications
You must be signed in to change notification settings - Fork 45
feat(trainer): add structured and configurable logging support #110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -44,15 +44,21 @@ def __init__( | |||||||
| ValueError: Invalid backend configuration. | ||||||||
|
|
||||||||
| """ | ||||||||
| logger.debug("Initializing TrainerClient with backend_config=%s", backend_config) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How you propagate your loggers to the clients ? Should it be part of the TrainerClient() property ? |
||||||||
|
|
||||||||
| # initialize training backend | ||||||||
| if not backend_config: | ||||||||
| backend_config = KubernetesBackendConfig() | ||||||||
| logger.debug("Using default KubernetesBackendConfig") | ||||||||
|
|
||||||||
| if isinstance(backend_config, KubernetesBackendConfig): | ||||||||
| self.backend = KubernetesBackend(backend_config) | ||||||||
| logger.debug("Initialized Kubernetes backend") | ||||||||
| elif isinstance(backend_config, LocalProcessBackendConfig): | ||||||||
| self.backend = LocalProcessBackend(backend_config) | ||||||||
| logger.debug("Initialized LocalProcess backend") | ||||||||
| else: | ||||||||
| logger.error("Invalid backend config type: %s", type(backend_config)) | ||||||||
| raise ValueError(f"Invalid backend config '{backend_config}'") | ||||||||
|
|
||||||||
| def list_runtimes(self) -> list[types.Runtime]: | ||||||||
|
|
@@ -119,7 +125,17 @@ def train( | |||||||
| TimeoutError: Timeout to create TrainJobs. | ||||||||
| RuntimeError: Failed to create TrainJobs. | ||||||||
| """ | ||||||||
| return self.backend.train(runtime=runtime, initializer=initializer, trainer=trainer) | ||||||||
| logger.debug( | ||||||||
| "Creating TrainJob with runtime=%s, initializer=%s, trainer=%s", | ||||||||
| runtime, | ||||||||
| initializer, | ||||||||
| trainer, | ||||||||
| ) | ||||||||
|
|
||||||||
| job_id = self.backend.train(runtime=runtime, initializer=initializer, trainer=trainer) | ||||||||
| logger.debug("Successfully created TrainJob with ID: %s", job_id) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move all of the debug statements out of backend, and place them into sdk/kubeflow/trainer/backends/kubernetes/backend.py Lines 254 to 256 in 9642b41
@szaher @kramaranya Does it sound good ? |
||||||||
|
|
||||||||
| return job_id | ||||||||
|
|
||||||||
| def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]: | ||||||||
| """List of the created TrainJobs. If a runtime is specified, only TrainJobs associated with | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| # Copyright 2025 The Kubeflow Authors. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Kubeflow SDK logging module. | ||
|
|
||
| This module provides structured and configurable logging support for the Kubeflow SDK. | ||
| It includes centralized logger configuration, structured log messages, and context-aware logging. | ||
| """ | ||
|
|
||
| from .config import get_logger, setup_logging | ||
| from .formatters import StructuredFormatter | ||
|
|
||
| __all__ = ["get_logger", "setup_logging", "StructuredFormatter"] |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,123 @@ | ||||||
| # Copyright 2025 The Kubeflow Authors. | ||||||
| # | ||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| # you may not use this file except in compliance with the License. | ||||||
| # You may obtain a copy of the License at | ||||||
| # | ||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| # | ||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
| # See the License for the specific language governing permissions and | ||||||
| # limitations under the License. | ||||||
|
|
||||||
| """Logging configuration for Kubeflow SDK.""" | ||||||
|
|
||||||
| import logging | ||||||
| import logging.config | ||||||
| import os | ||||||
| from typing import Optional, Union | ||||||
|
|
||||||
|
|
||||||
| def setup_logging( | ||||||
| level: Union[str, int] = "INFO", | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we discussed here, do we need to have INFO level for our logger ? #85 (comment) |
||||||
| format_type: str = "console", | ||||||
| log_file: Optional[str] = None, | ||||||
| ) -> None: | ||||||
| """Setup logging configuration for Kubeflow SDK. | ||||||
| Args: | ||||||
| level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) | ||||||
| format_type: Output format type ('console', 'json', 'detailed') | ||||||
| log_file: Optional log file path for file output | ||||||
| """ | ||||||
| # Convert string level to logging constant | ||||||
| if isinstance(level, str): | ||||||
| level = getattr(logging, level.upper(), logging.INFO) | ||||||
|
|
||||||
| # Base configuration | ||||||
| config = { | ||||||
| "version": 1, | ||||||
| "disable_existing_loggers": False, | ||||||
| "formatters": { | ||||||
| "console": { | ||||||
| "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", | ||||||
| "datefmt": "%Y-%m-%d %H:%M:%S", | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe it is better to make fmt ISO compliant ?
Suggested change
|
||||||
| }, | ||||||
| "detailed": { | ||||||
| "format": ( | ||||||
| "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s" | ||||||
| ), | ||||||
| "datefmt": "%Y-%m-%d %H:%M:%S", | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| }, | ||||||
| "json": { | ||||||
| "()": "kubeflow.trainer.logging.formatters.StructuredFormatter", | ||||||
| }, | ||||||
| }, | ||||||
| "handlers": { | ||||||
| "console": { | ||||||
| "class": "logging.StreamHandler", | ||||||
| "level": level, | ||||||
| "formatter": format_type, | ||||||
| "stream": "ext://sys.stdout", | ||||||
| }, | ||||||
| }, | ||||||
| "loggers": { | ||||||
| "kubeflow": { | ||||||
| "level": level, | ||||||
| "handlers": ["console"], | ||||||
| "propagate": False, | ||||||
| }, | ||||||
| }, | ||||||
| "root": { | ||||||
| "level": level, | ||||||
| "handlers": ["console"], | ||||||
| }, | ||||||
| } | ||||||
|
|
||||||
| # Add file handler if log_file is specified | ||||||
| if log_file: | ||||||
| config["handlers"]["file"] = { | ||||||
| "class": "logging.FileHandler", | ||||||
| "level": level, | ||||||
| "formatter": format_type, | ||||||
| "filename": log_file, | ||||||
| "mode": "a", | ||||||
| } | ||||||
| config["loggers"]["kubeflow"]["handlers"].append("file") | ||||||
| config["root"]["handlers"].append("file") | ||||||
|
|
||||||
| # Apply configuration | ||||||
| logging.config.dictConfig(config) | ||||||
|
|
||||||
|
|
||||||
| def get_logger(name: str) -> logging.Logger: | ||||||
| """Get a logger instance for the given name. | ||||||
| Args: | ||||||
| name: Logger name, typically __name__ of the calling module | ||||||
| Returns: | ||||||
| Logger instance configured for Kubeflow SDK | ||||||
| """ | ||||||
| # Ensure the logger name starts with 'kubeflow' | ||||||
| if not name.startswith("kubeflow"): | ||||||
| name = f"kubeflow.{name}" | ||||||
|
|
||||||
| return logging.getLogger(name) | ||||||
|
|
||||||
|
|
||||||
| def configure_from_env() -> None: | ||||||
| """Configure logging from environment variables. | ||||||
| Environment variables: | ||||||
| KUBEFLOW_LOG_LEVEL: Logging level (default: INFO) | ||||||
| KUBEFLOW_LOG_FORMAT: Output format (default: console) | ||||||
| KUBEFLOW_LOG_FILE: Log file path (optional) | ||||||
| """ | ||||||
| level = os.getenv("KUBEFLOW_LOG_LEVEL", "INFO") | ||||||
| format_type = os.getenv("KUBEFLOW_LOG_FORMAT", "console") | ||||||
| log_file = os.getenv("KUBEFLOW_LOG_FILE") | ||||||
|
|
||||||
| setup_logging(level=level, format_type=format_type, log_file=log_file) | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| # Copyright 2025 The Kubeflow Authors. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Custom log formatters for Kubeflow SDK.""" | ||
|
|
||
| from datetime import datetime, timezone | ||
| import json | ||
| import logging | ||
|
|
||
|
|
||
| class StructuredFormatter(logging.Formatter): | ||
| """JSON structured formatter for Kubeflow SDK logs. | ||
|
|
||
| This formatter outputs logs in JSON format, making them suitable for | ||
| log aggregation systems like ELK stack, Fluentd, etc. | ||
| """ | ||
|
|
||
| def format(self, record: logging.LogRecord) -> str: | ||
| """Format log record as JSON. | ||
|
|
||
| Args: | ||
| record: Log record to format | ||
|
|
||
| Returns: | ||
| JSON formatted log string | ||
| """ | ||
| log_entry = { | ||
| "timestamp": datetime.now(timezone.utc).isoformat(), | ||
| "level": record.levelname, | ||
| "logger": record.name, | ||
| "message": record.getMessage(), | ||
| "module": record.module, | ||
| "function": record.funcName, | ||
| "line": record.lineno, | ||
| } | ||
|
|
||
| # Add exception info if present | ||
| if record.exc_info: | ||
| log_entry["exception"] = self.formatException(record.exc_info) | ||
|
|
||
| return json.dumps(log_entry, ensure_ascii=False) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shall we move this to a separate package (e.g. kubeflow/logging)