Skip to content

Commit

Permalink
[SDK] Add information about TrainingClient logging (#1973)
Browse files Browse the repository at this point in the history
* [SDK] Use logging to print Job logs

* Update func description

* Add info to configure logging in debug mode

* Update sdk/python/kubeflow/training/api/training_client.py

Co-authored-by: Alex <mythicalsunlight@gmail.com>

* Return logs if follow is True

---------

Co-authored-by: Alex <mythicalsunlight@gmail.com>
  • Loading branch information
andreyvelich and droctothorpe authored Jan 5, 2024
1 parent 78be11c commit ed3168a
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
23 changes: 18 additions & 5 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,13 @@ def __init__(
namespace: str = utils.get_default_target_namespace(),
job_kind: str = constants.PYTORCHJOB_KIND,
):
"""TrainingClient constructor.
"""TrainingClient constructor. Configure logging in your application
as follows to see detailed information from the TrainingClient APIs:
.. code-block:: python
import logging
logging.basicConfig()
log = logging.getLogger("kubeflow.training.api.training_client")
log.setLevel(logging.DEBUG)
Args:
config_file: Path to the kube-config file. Defaults to ~/.kube/config.
Expand Down Expand Up @@ -771,8 +777,8 @@ def get_job_logs(
follow: bool = False,
timeout: int = constants.DEFAULT_TIMEOUT,
) -> Dict[str, str]:
"""Print the training logs for the Job. By default it returns logs from
the `master` pod.
"""Get the logs for every Training Job pod. By default it returns logs from
the `master` pod. Logs are returned in this format: { "pod-name": "Log data" }.
Args:
name: Name for the Job.
Expand All @@ -796,7 +802,7 @@ def get_job_logs(
For PaddleJob one of `master` or `worker`.
replica_index: Optional, index for the Job replica.
container: Pod container to get the logs.
follow: Whether to follow the log stream of the pod.
follow: Whether to follow the log stream of the pod and print logs to StdOut.
timeout: Optional, Kubernetes API server timeout in seconds
to execute the request.
Expand Down Expand Up @@ -843,7 +849,7 @@ def get_job_logs(
while True:
for index, log_queue in enumerate(log_queue_pool):
if all(finished):
return
return logs_dict
if finished[index]:
continue
# grouping the every 50 log lines of the same pod
Expand All @@ -853,7 +859,14 @@ def get_job_logs(
if logline is None:
finished[index] = True
break

# Print logs to the StdOut
print(f"[Pod {pods[index]}]: {logline}")
# Add logs to the results dict.
if pods[index] not in logs_dict:
logs_dict[pods[index]] = logline
else:
logs_dict[pods[index]] += logline
except queue.Empty:
break
elif pods:
Expand Down
4 changes: 2 additions & 2 deletions sdk/python/kubeflow/training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def get_pytorchjob_template(
),
)

if num_procs_per_worker > 0:
if num_procs_per_worker:
pytorchjob.spec.nproc_per_node = num_procs_per_worker
if elastic_policy:
pytorchjob.spec.elastic_policy = elastic_policy
Expand All @@ -334,7 +334,7 @@ def get_pytorchjob_template(
template=master_pod_template_spec,
)

if num_worker_replicas >= 1:
if num_worker_replicas:
pytorchjob.spec.pytorch_replica_specs[
constants.REPLICA_TYPE_WORKER
] = models.KubeflowOrgV1ReplicaSpec(
Expand Down

0 comments on commit ed3168a

Please sign in to comment.