[SDK] Add information about TrainingClient logging (#1973)

* [SDK] Use logging to print Job logs * Update func description * Add info to configure logging in debug mode * Update sdk/python/kubeflow/training/api/training_client.py Co-authored-by: Alex <mythicalsunlight@gmail.com> * Return logs if follow is True --------- Co-authored-by: Alex <mythicalsunlight@gmail.com>
kubeflow · Jan 5, 2024 · ed3168a · ed3168a
1 parent 78be11c
commit ed3168a
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 7 deletions.
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
@@ -41,7 +41,13 @@ def __init__(
         namespace: str = utils.get_default_target_namespace(),
         job_kind: str = constants.PYTORCHJOB_KIND,
     ):
-        """TrainingClient constructor.
+        """TrainingClient constructor. Configure logging in your application
+            as follows to see detailed information from the TrainingClient APIs:
+            .. code-block:: python
+                import logging
+                logging.basicConfig()
+                log = logging.getLogger("kubeflow.training.api.training_client")
+                log.setLevel(logging.DEBUG)
 
         Args:
             config_file: Path to the kube-config file. Defaults to ~/.kube/config.
@@ -771,8 +777,8 @@ def get_job_logs(
         follow: bool = False,
         timeout: int = constants.DEFAULT_TIMEOUT,
     ) -> Dict[str, str]:
-        """Print the training logs for the Job. By default it returns logs from
-        the `master` pod.
+        """Get the logs for every Training Job pod. By default it returns logs from
+        the `master` pod. Logs are returned in this format: { "pod-name": "Log data" }.
 
         Args:
             name: Name for the Job.
@@ -796,7 +802,7 @@ def get_job_logs(
                 For PaddleJob one of `master` or `worker`.
             replica_index: Optional, index for the Job replica.
             container: Pod container to get the logs.
-            follow: Whether to follow the log stream of the pod.
+            follow: Whether to follow the log stream of the pod and print logs to StdOut.
             timeout: Optional, Kubernetes API server timeout in seconds
                 to execute the request.
 
@@ -843,7 +849,7 @@ def get_job_logs(
             while True:
                 for index, log_queue in enumerate(log_queue_pool):
                     if all(finished):
-                        return
+                        return logs_dict
                     if finished[index]:
                         continue
                     # grouping the every 50 log lines of the same pod
@@ -853,7 +859,14 @@ def get_job_logs(
                             if logline is None:
                                 finished[index] = True
                                 break
+
+                            # Print logs to the StdOut
                             print(f"[Pod {pods[index]}]: {logline}")
+                            # Add logs to the results dict.
+                            if pods[index] not in logs_dict:
+                                logs_dict[pods[index]] = logline
+                            else:
+                                logs_dict[pods[index]] += logline
                         except queue.Empty:
                             break
         elif pods:

diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py
@@ -321,7 +321,7 @@ def get_pytorchjob_template(
         ),
     )
 
-    if num_procs_per_worker > 0:
+    if num_procs_per_worker:
         pytorchjob.spec.nproc_per_node = num_procs_per_worker
     if elastic_policy:
         pytorchjob.spec.elastic_policy = elastic_policy
@@ -334,7 +334,7 @@ def get_pytorchjob_template(
             template=master_pod_template_spec,
         )
 
-    if num_worker_replicas >= 1:
+    if num_worker_replicas:
         pytorchjob.spec.pytorch_replica_specs[
             constants.REPLICA_TYPE_WORKER
         ] = models.KubeflowOrgV1ReplicaSpec(