Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SDK] Add information about TrainingClient logging #1973

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,13 @@ def __init__(
namespace: str = utils.get_default_target_namespace(),
job_kind: str = constants.PYTORCHJOB_KIND,
):
"""TrainingClient constructor.
"""TrainingClient constructor. Configure logging in your application
as follows to see detailed information from the TrainingClient APIs:
.. code-block:: python
import logging
logging.basicConfig()
log = logging.getLogger("kubeflow.training.api.training_client")
log.setLevel(logging.DEBUG)

Args:
config_file: Path to the kube-config file. Defaults to ~/.kube/config.
Expand Down Expand Up @@ -771,8 +777,8 @@ def get_job_logs(
follow: bool = False,
timeout: int = constants.DEFAULT_TIMEOUT,
) -> Dict[str, str]:
"""Print the training logs for the Job. By default it returns logs from
the `master` pod.
"""Get the logs for every Training Job pod. By default it returns logs from
the `master` pod. Logs are returned in this format: { "pod-name": "Log data" }.

Args:
name: Name for the Job.
Expand All @@ -796,7 +802,7 @@ def get_job_logs(
For PaddleJob one of `master` or `worker`.
replica_index: Optional, index for the Job replica.
container: Pod container to get the logs.
follow: Whether to follow the log stream of the pod.
follow: Whether to follow the log stream of the pod and print logs to StdOut.
timeout: Optional, Kubernetes API server timeout in seconds
to execute the request.

Expand Down Expand Up @@ -843,7 +849,7 @@ def get_job_logs(
while True:
for index, log_queue in enumerate(log_queue_pool):
if all(finished):
return
return logs_dict
if finished[index]:
continue
# grouping the every 50 log lines of the same pod
Expand All @@ -853,7 +859,14 @@ def get_job_logs(
if logline is None:
finished[index] = True
break

# Print logs to the StdOut
print(f"[Pod {pods[index]}]: {logline}")
# Add logs to the results dict.
if pods[index] not in logs_dict:
logs_dict[pods[index]] = logline
else:
logs_dict[pods[index]] += logline
except queue.Empty:
break
elif pods:
Expand Down
4 changes: 2 additions & 2 deletions sdk/python/kubeflow/training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def get_pytorchjob_template(
),
)

if num_procs_per_worker > 0:
if num_procs_per_worker:
pytorchjob.spec.nproc_per_node = num_procs_per_worker
if elastic_policy:
pytorchjob.spec.elastic_policy = elastic_policy
Expand All @@ -334,7 +334,7 @@ def get_pytorchjob_template(
template=master_pod_template_spec,
)

if num_worker_replicas >= 1:
if num_worker_replicas:
pytorchjob.spec.pytorch_replica_specs[
constants.REPLICA_TYPE_WORKER
] = models.KubeflowOrgV1ReplicaSpec(
Expand Down