diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index ae0a28b52ed..c7c31960482 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -11,4 +11,4 @@ Tested (run the relevant ones): - [ ] Any manual or new tests for this PR (please specify below) - [ ] All smoke tests: `pytest tests/test_smoke.py` - [ ] Relevant individual smoke tests: `pytest tests/test_smoke.py::test_fill_in_the_name` -- [ ] Backward compatibility tests: `bash tests/backward_comaptibility_tests.sh` +- [ ] Backward compatibility tests: `conda deactivate; bash -i tests/backward_compatibility_tests.sh` diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 87d8cea9f16..bf84bea4d50 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,7 +27,7 @@ jobs: - tests/test_optimizer_random_dag.py - tests/test_storage.py - tests/test_wheels.py - - tests/test_spot_serve.py + - tests/test_jobs_and_serve.py - tests/test_yaml_parser.py runs-on: ubuntu-latest steps: diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 00683c82f6b..11affaf4c43 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -26,6 +26,7 @@ document.addEventListener('DOMContentLoaded', () => { // New items: const newItems = [ { selector: '.caption-text', text: 'SkyServe: Model Serving' }, + { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, { selector: '.toctree-l1 > a', text: 'DBRX (Databricks)' }, { selector: '.toctree-l1 > a', text: 'Ollama' }, diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index 412e3284372..4892e869d3c 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -121,7 +121,7 @@ Contents :maxdepth: 1 :caption: Running Jobs - ../examples/spot-jobs + ../examples/managed-jobs ../reference/job-queue ../examples/auto-failover ../reference/kubernetes/index @@ -139,7 +139,7 @@ Contents :maxdepth: 1 :caption: Cutting Cloud Costs - ../examples/spot-jobs + Managed Spot Jobs <../examples/spot-jobs> ../reference/auto-stop ../reference/benchmark/index diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst new file mode 100644 index 00000000000..ba449c1f087 --- /dev/null +++ b/docs/source/examples/managed-jobs.rst @@ -0,0 +1,465 @@ +.. _managed-jobs: + +Managed Jobs +============ + +.. tip:: + + This feature is great for scaling out: running a single job for long durations, or running many jobs (pipelines). + +SkyPilot supports **managed jobs**, which can automatically recover from any spot preemptions or hardware failures. +It can be used in three modes: + +#. :ref:`Managed Spot Jobs `: Jobs run on auto-recovering spot instances. This can **save significant costs** (e.g., up to 70\% for GPU VMs) by making preemptible spot instances useful for long-running jobs. +#. :ref:`On-demand `: Jobs run on auto-recovering on-demand instances. This is useful for jobs that require guaranteed resources. +#. :ref:`Pipelines `: Run pipelines that contain multiple tasks (which can have different resource requirements and ``setup``/``run`` commands). This is useful for running a sequence of tasks that depend on each other, e.g., data processing, training a model, and then running inference on it. + + +.. _spot-jobs: + +Managed Spot Jobs +----------------- + +SkyPilot automatically finds available spot resources across regions and clouds to maximize availability. 
+Any spot preemptions are automatically handled by SkyPilot without user intervention. + +Here is an example of a BERT training job failing over different regions across AWS and GCP. + +.. image:: https://i.imgur.com/Vteg3fK.gif + :width: 600 + :alt: GIF for BERT training on Spot V100 + +.. image:: ../images/spot-training.png + :width: 600 + :alt: Static plot, BERT training on Spot V100 + +To use managed spot jobs, there are two requirements: + +#. :ref:`Job YAML `: Managed Spot requires a YAML to describe the job, tested with :code:`sky launch`. +#. :ref:`Checkpointing ` (optional): For job recovery due to preemptions, the user application code can checkpoint its progress periodically to a :ref:`mounted cloud bucket `. The program can reload the latest checkpoint when restarted. + + +.. _job-yaml: + +Job YAML +~~~~~~~~ + +To launch a managed job, you can simply reuse your job YAML (recommended to test it with :code:`sky launch` first). +For example, we found the BERT fine-tuning YAML works with :code:`sky launch`, and want to +launch it with SkyPilot managed spot jobs. + +We can launch it with the following: + +.. code-block:: console + + $ sky jobs launch -n bert-qa bert_qa.yaml + + +.. code-block:: yaml + + # bert_qa.yaml + name: bert-qa + + resources: + accelerators: V100:1 + # Use spot instances to save cost. + use_spot: true + + # Assume your working directory is under `~/transformers`. + # To make this example work, please run the following command: + # git clone https://github.com/huggingface/transformers.git ~/transformers -b v4.30.1 + workdir: ~/transformers + + setup: | + # Fill in your wandb key: copy from https://wandb.ai/authorize + # Alternatively, you can use `--env WANDB_API_KEY=$WANDB_API_KEY` + # to pass the key in the command line, during `sky spot launch`. + echo export WANDB_API_KEY=[YOUR-WANDB-API-KEY] >> ~/.bashrc + + pip install -e . + cd examples/pytorch/question-answering/ + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + pip install wandb + + run: | + cd ./examples/pytorch/question-answering/ + python run_qa.py \ + --model_name_or_path bert-base-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 50 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --report_to wandb + + +.. note:: + + :ref:`workdir ` and :ref:`file mounts with local files ` will be automatically uploaded to a + :ref:`cloud bucket `. The bucket will be created during the job running time, and cleaned up after the job + finishes. + +SkyPilot will launch and start monitoring the job. When a spot preemption or any machine failure happens, SkyPilot will automatically +search for resources across regions and clouds to re-launch the job. + +In this example, the job will be restarted from scratch after each preemption recovery. +To resume the job from previous states, user's application needs to implement checkpointing and recovery. + + +.. _checkpointing: + +Checkpointing and Recovery +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To allow job recovery, a cloud bucket is typically needed to store the job's states (e.g., model checkpoints). +Below is an example of mounting a bucket to :code:`/checkpoint`. + +.. 
code-block:: yaml + + file_mounts: + /checkpoint: + name: # NOTE: Fill in your bucket name + mode: MOUNT + +The :code:`MOUNT` mode in :ref:`SkyPilot bucket mounting ` ensures the checkpoints outputted to :code:`/checkpoint` are automatically synced to a persistent bucket. +Note that the application code should save program checkpoints periodically and reload those states when the job is restarted. +This is typically achieved by reloading the latest checkpoint at the beginning of your program. + +.. _spot-jobs-end-to-end: + +An End-to-End Example +~~~~~~~~~~~~~~~~~~~~~ + +Below we show an `example `_ for fine-tuning a BERT model on a question-answering task with HuggingFace. + +.. code-block:: yaml + :emphasize-lines: 13-16,42-45 + + # bert_qa.yaml + name: bert-qa + + resources: + accelerators: V100:1 + use_spot: true + + # Assume your working directory is under `~/transformers`. + # To make this example work, please run the following command: + # git clone https://github.com/huggingface/transformers.git ~/transformers -b v4.30.1 + workdir: ~/transformers + + file_mounts: + /checkpoint: + name: # NOTE: Fill in your bucket name + mode: MOUNT + + setup: | + # Fill in your wandb key: copy from https://wandb.ai/authorize + # Alternatively, you can use `--env WANDB_API_KEY=$WANDB_API_KEY` + # to pass the key in the command line, during `sky jobs launch`. + echo export WANDB_API_KEY=[YOUR-WANDB-API-KEY] >> ~/.bashrc + + pip install -e . + cd examples/pytorch/question-answering/ + pip install -r requirements.txt + pip install wandb + + run: | + cd ./examples/pytorch/question-answering/ + python run_qa.py \ + --model_name_or_path bert-base-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 50 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --report_to wandb \ + --run_name $SKYPILOT_TASK_ID \ + --output_dir /checkpoint/bert_qa/ \ + --save_total_limit 10 \ + --save_steps 1000 + + + +As HuggingFace has built-in support for periodically checkpointing, we only need to pass the highlighted arguments for setting up +the output directory and frequency of checkpointing (see more +on `Huggingface API `_). +You may also refer to another example `here `__ for periodically checkpointing with PyTorch. + +We also set :code:`--run_name` to :code:`$SKYPILOT_TASK_ID` so that the logs for all recoveries of the same job will be saved +to the same run in Weights & Biases. + +.. note:: + The environment variable :code:`$SKYPILOT_TASK_ID` (example: "sky-managed-2022-10-06-05-17-09-750781_bert-qa_8-0") can be used to identify the same job, i.e., it is kept identical across all + recoveries of the job. + It can be accessed in the task's :code:`run` commands or directly in the program itself (e.g., access + via :code:`os.environ` and pass to Weights & Biases for tracking purposes in your training script). It is made available to + the task whenever it is invoked. + +With the highlighted changes, the managed spot job can now resume training after preemption! We can enjoy the benefits of +cost savings from spot instances without worrying about preemption or losing progress. + +.. code-block:: console + + $ sky jobs launch -n bert-qa bert_qa.yaml + +.. tip:: + + Try copy-paste this example and adapt it to your own job. 
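If you would rather set the run name inside your own training script than on the
command line, a minimal sketch looks like the following (assuming the ``wandb``
package is installed; the project name is a placeholder, not part of the example
above):

.. code-block:: python

   import os

   import wandb

   # SKYPILOT_TASK_ID is set by SkyPilot and stays identical across all
   # recoveries of the same managed job, so every recovery logs to the same run.
   task_id = os.environ.get('SKYPILOT_TASK_ID', 'local-debug')

   # 'skypilot-bert-qa' is a placeholder project name; use your own.
   wandb.init(project='skypilot-bert-qa', name=task_id)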
+ + + +Real-World Examples +~~~~~~~~~~~~~~~~~~~ + +* `Vicuna `_ LLM chatbot: `instructions `_, `YAML `__ +* BERT (shown above): `YAML `__ +* PyTorch DDP, ResNet: `YAML `__ +* PyTorch Lightning DDP, CIFAR-10: `YAML `__ + + +.. _on-demand: + +Using On-Demand Instances +-------------------------------- + +The same ``sky jobs launch`` and YAML interfaces can run jobs on auto-recovering +on-demand instances. This is useful to have SkyPilot monitor any underlying +machine failures and transparently recover the job. + +To do so, simply set :code:`use_spot: false` in the :code:`resources` section, or override it with :code:`--use-spot false` in the CLI. + +.. code-block:: console + + $ sky jobs launch -n bert-qa bert_qa.yaml --use-spot false + +.. tip:: + + It is useful to think of ``sky jobs launch`` as a "serverless" managed job + interface, while ``sky launch`` is a cluster interface (that you can launch + tasks on, albeit not managed). + +Either Spot Or On-Demand +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can use ``any_of`` to specify either spot or on-demand instances as +candidate resources for a job. See documentation :ref:`here +` for more details. + +.. code-block:: yaml + + resources: + accelerators: A100:8 + any_of: + - use_spot: true + - use_spot: false + +In this example, SkyPilot will perform cost optimizations to select the resource to use, which almost certainly +will be spot instances. If spot instances are not available, SkyPilot will fall back to launch on-demand instances. + +More advanced policies for resource selection, such as the `Can't Be Late +`__ (NSDI'24) +paper, may be supported in the future. + +Useful CLIs +----------- + +Here are some commands for managed jobs. Check :code:`sky jobs --help` and :ref:`CLI reference ` for more details. + +See all managed jobs: + +.. code-block:: console + + $ sky jobs queue + +.. code-block:: console + + Fetching managed job statuses... + Managed jobs: + ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + 2 roberta 1x [A100:8][Spot] 2 hrs ago 2h 47m 18s 2h 36m 18s 0 RUNNING + 1 bert-qa 1x [V100:1][Spot] 4 hrs ago 4h 24m 26s 4h 17m 54s 0 RUNNING + +Stream the logs of a running managed job: + +.. code-block:: console + + $ sky jobs logs -n bert-qa # by name + $ sky jobs logs 2 # by job ID + +Cancel a managed job: + +.. code-block:: console + + $ sky jobs cancel -n bert-qa # by name + $ sky jobs cancel 2 # by job ID + +.. note:: + If any failure happens for a managed job, you can check :code:`sky jobs queue -a` for the brief reason + of the failure. For more details, it would be helpful to check :code:`sky jobs logs --controller `. + + +.. _pipeline: + +Job Pipelines +------------- + +A pipeline is a managed job that contains a sequence of tasks running one after another. + +This is useful for running a sequence of tasks that depend on each other, e.g., training a model and then running inference on it. +Different tasks can have different resource requirements to use appropriate per-task resources, which saves costs, while keeping the burden of managing the tasks off the user. + +.. note:: + In other words, a managed job is either a single task or a pipeline of tasks. All managed jobs are submitted by :code:`sky jobs launch`. + +To run a pipeline, specify the sequence of tasks in a YAML file. Here is an example: + +.. 
code-block:: yaml + + name: pipeline + + --- + + name: train + + resources: + accelerators: V100:8 + any_of: + - use_spot: true + - use_spot: false + + file_mounts: + /checkpoint: + name: train-eval # NOTE: Fill in your bucket name + mode: MOUNT + + setup: | + echo setup for training + + run: | + echo run for training + echo save checkpoints to /checkpoint + + --- + + name: eval + + resources: + accelerators: T4:1 + use_spot: false + + file_mounts: + /checkpoint: + name: train-eval # NOTE: Fill in your bucket name + mode: MOUNT + + setup: | + echo setup for eval + + run: | + echo load trained model from /checkpoint + echo eval model on test set + + +The YAML above defines a pipeline with two tasks. The first :code:`name: +pipeline` names the pipeline. The first task has name :code:`train` and the +second task has name :code:`eval`. The tasks are separated by a line with three +dashes :code:`---`. Each task has its own :code:`resources`, :code:`setup`, and +:code:`run` sections. Tasks are executed sequentially. + +To submit the pipeline, the same command :code:`sky jobs launch` is used. The pipeline will be automatically launched and monitored by SkyPilot. You can check the status of the pipeline with :code:`sky jobs queue` or :code:`sky jobs dashboard`. + +.. code-block:: console + + $ sky jobs launch -n pipeline pipeline.yaml + $ sky jobs queue + Fetching managed job statuses... + Managed jobs + In progress jobs: 1 RECOVERING + ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + 8 pipeline - 50 mins ago 47m 45s - 1 RECOVERING + ↳ 0 train 1x [V100:8][Spot|On-demand] 50 mins ago 47m 45s - 1 RECOVERING + ↳ 1 eval 1x [T4:1] - - - 0 PENDING + +.. note:: + + The :code:`$SKYPILOT_TASK_ID` environment variable is also available in the :code:`run` section of each task. It is unique for each task in the pipeline. + For example, the :code:`$SKYPILOT_TASK_ID` for the :code:`eval` task above is: + "sky-managed-2022-10-06-05-17-09-750781_pipeline_eval_8-1". + + + +Dashboard +--------- + +Use ``sky jobs dashboard`` to open a dashboard to see all jobs: + +.. code-block:: console + + $ sky jobs dashboard + +This automatically opens a browser tab to show the dashboard: + +.. image:: ../images/job-dashboard.png + +The UI shows the same information as the CLI ``sky jobs queue -a``. The UI is +especially useful when there are many in-progress jobs to monitor, which the +terminal-based CLI may need more than one page to display. + + +Concept: Jobs Controller +------------------------ + +The jobs controller is a small on-demand CPU VM running in the cloud that manages all jobs of a user. +It is automatically launched when the first managed job is submitted, and it is autostopped after it has been idle for 10 minutes (i.e., after all managed jobs finish and no new managed job is submitted in that duration). +Thus, **no user action is needed** to manage its lifecycle. + +You can see the controller with :code:`sky status` and refresh its status by using the :code:`-r/--refresh` flag. + +While the cost of the jobs controller is negligible (~$0.4/hour when running and less than $0.004/hour when stopped), +you can still tear it down manually with +:code:`sky down `, where the ```` can be found in the output of :code:`sky status`. + +.. note:: + Tearing down the jobs controller loses all logs and status information for the finished managed jobs. It is only allowed when there are no in-progress managed jobs to ensure no resource leakage. 
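For example, a manual teardown (when you do want to reclaim the controller) might
look like the following; the controller name below is a placeholder for whatever
``sky status`` reports:

.. code-block:: console

   $ sky jobs queue                    # confirm no managed jobs are in progress
   $ sky status                        # find <jobs-controller-name> in the output
   $ sky down <jobs-controller-name>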
+ +Customizing Job Controller Resources +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may want to customize the resources of the jobs controller for several reasons: + +#. Changing the maximum number of jobs that can be run concurrently, which is 2x the vCPUs of the controller. (Default: 16) +#. Use a lower-cost controller (if you have a low number of concurrent managed jobs). +#. Enforcing the jobs controller to run on a specific location. (Default: cheapest location) +#. Changing the disk_size of the jobs controller to store more logs. (Default: 50GB) + +To achieve the above, you can specify custom configs in :code:`~/.sky/config.yaml` with the following fields: + +.. code-block:: yaml + + jobs: + # NOTE: these settings only take effect for a new jobs controller, not if + # you have an existing one. + controller: + resources: + # All configs below are optional. + # Specify the location of the jobs controller. + cloud: gcp + region: us-central1 + # Specify the maximum number of managed jobs that can be run concurrently. + cpus: 4+ # number of vCPUs, max concurrent jobs = 2 * cpus + # Specify the disk_size in GB of the jobs controller. + disk_size: 100 + +The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. + +.. note:: + These settings will not take effect if you have an existing controller (either + stopped or live). For them to take effect, tear down the existing controller + first, which requires all in-progress jobs to finish or be canceled. + diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst index 5940e404bb3..2b3df600425 100644 --- a/docs/source/examples/spot-jobs.rst +++ b/docs/source/examples/spot-jobs.rst @@ -1,389 +1,23 @@ -.. _spot-jobs: - Managed Spot Jobs -================================================ - -.. tip:: - - This feature is great for scaling out: running a single job for long durations, or running many jobs. - -SkyPilot supports managed spot jobs that can **automatically recover from preemptions**. -This feature **saves significant cost** (e.g., up to 70\% for GPU VMs) by making preemptible spot instances practical for long-running jobs. - -SkyPilot automatically finds available spot resources across regions and clouds to maximize availability. -Here is an example of a BERT training job failing over different regions across AWS and GCP. - -.. image:: https://i.imgur.com/Vteg3fK.gif - :width: 600 - :alt: GIF for BERT training on Spot V100 - -.. image:: ../images/spot-training.png - :width: 600 - :alt: Static plot, BERT training on Spot V100 - -To use managed spot jobs, there are two requirements: - -#. **Task YAML**: Managed Spot requires a YAML to describe the job, tested with :code:`sky launch`. -#. **Checkpointing** (optional): For job recovery due to preemptions, the user application code can checkpoint its progress periodically to a :ref:`mounted cloud bucket `. The program can reload the latest checkpoint when restarted. - - -Task YAML ---------- - -To launch a spot job, you can simply reuse your task YAML (recommended to test it with :code:`sky launch` first). -For example, we found the BERT fine-tuning YAML works with :code:`sky launch`, and want to -launch it with SkyPilot managed spot jobs. - -We can launch it with the following: - -.. code-block:: console - - $ sky spot launch -n bert-qa bert_qa.yaml - - -.. code-block:: yaml - - # bert_qa.yaml - name: bert-qa - - resources: - accelerators: V100:1 - - # Assume your working directory is under `~/transformers`. 
- # To make this example work, please run the following command: - # git clone https://github.com/huggingface/transformers.git ~/transformers -b v4.30.1 - workdir: ~/transformers - - setup: | - # Fill in your wandb key: copy from https://wandb.ai/authorize - # Alternatively, you can use `--env WANDB_API_KEY=$WANDB_API_KEY` - # to pass the key in the command line, during `sky spot launch`. - echo export WANDB_API_KEY=[YOUR-WANDB-API-KEY] >> ~/.bashrc - - pip install -e . - cd examples/pytorch/question-answering/ - pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 - pip install wandb - - run: | - cd ./examples/pytorch/question-answering/ - python run_qa.py \ - --model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 50 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --report_to wandb - - -.. note:: - - :ref:`workdir ` and :ref:`file mounts with local files ` will be automatically uploaded to a - :ref:`cloud bucket `. The bucket will be created during the job running time, and cleaned up after the job - finishes. - -SkyPilot will launch and start monitoring the spot job. When a preemption happens, SkyPilot will automatically -search for resources across regions and clouds to re-launch the job. - -In this example, the job will be restarted from scratch after each preemption recovery. -To resume the job from previous states, user's application needs to implement checkpointing and recovery. - - -Checkpointing and recovery --------------------------- - -To allow spot recovery, a cloud bucket is typically needed to store the job's states (e.g., model checkpoints). -Below is an example of mounting a bucket to :code:`/checkpoint`. - -.. code-block:: yaml - - file_mounts: - /checkpoint: - name: # NOTE: Fill in your bucket name - mode: MOUNT - -The :code:`MOUNT` mode in :ref:`SkyPilot bucket mounting ` ensures the checkpoints outputted to :code:`/checkpoint` are automatically synced to a persistent bucket. -Note that the application code should save program checkpoints periodically and reload those states when the job is restarted. -This is typically achieved by reloading the latest checkpoint at the beginning of your program. - -.. _spot-jobs-end-to-end: - -An end-to-end example ---------------------- - -Below we show an `example `_ for fine-tuning a BERT model on a question-answering task with HuggingFace. - -.. code-block:: yaml - :emphasize-lines: 12-15,41-44 - - # bert_qa.yaml - name: bert-qa - - resources: - accelerators: V100:1 - - # Assume your working directory is under `~/transformers`. - # To make this example work, please run the following command: - # git clone https://github.com/huggingface/transformers.git ~/transformers -b v4.30.1 - workdir: ~/transformers - - file_mounts: - /checkpoint: - name: # NOTE: Fill in your bucket name - mode: MOUNT - - setup: | - # Fill in your wandb key: copy from https://wandb.ai/authorize - # Alternatively, you can use `--env WANDB_API_KEY=$WANDB_API_KEY` - # to pass the key in the command line, during `sky spot launch`. - echo export WANDB_API_KEY=[YOUR-WANDB-API-KEY] >> ~/.bashrc - - pip install -e . 
- cd examples/pytorch/question-answering/ - pip install -r requirements.txt - pip install wandb - - run: | - cd ./examples/pytorch/question-answering/ - python run_qa.py \ - --model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 50 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --report_to wandb \ - --run_name $SKYPILOT_TASK_ID \ - --output_dir /checkpoint/bert_qa/ \ - --save_total_limit 10 \ - --save_steps 1000 - - - -As HuggingFace has built-in support for periodically checkpointing, we only need to pass the highlighted arguments for setting up -the output directory and frequency of checkpointing (see more -on `Huggingface API `_). -You may also refer to another example `here `__ for periodically checkpointing with PyTorch. - -We also set :code:`--run_name` to :code:`$SKYPILOT_TASK_ID` so that the logs for all recoveries of the same job will be saved -to the same run in Weights & Biases. - -.. note:: - The environment variable :code:`$SKYPILOT_TASK_ID` (example: "sky-managed-2022-10-06-05-17-09-750781_pipeline_eval_8-1") can be used to identify the same job, i.e., it is kept identical across all - recoveries of the job. - It can be accessed in the task's :code:`run` commands or directly in the program itself (e.g., access - via :code:`os.environ` and pass to Weights & Biases for tracking purposes in your training script). It is made available to - the task whenever it is invoked. - -With the highlighted changes, the managed spot job can now resume training after preemption with ``sky spot launch``! We can enjoy the benefits of -cost savings from spot instances without worrying about preemption or losing progress. - -.. code-block:: console - - $ sky spot launch -n bert-qa bert_qa.yaml - -.. tip:: - - Try copy-paste this example and adapt it to your own job. - - -Useful CLIs ------------ - -Here are some commands for managed spot jobs. Check :code:`sky spot --help` for more details. - -See all spot jobs: - -.. code-block:: console - - $ sky spot queue - -.. code-block:: console - - Fetching managed spot job statuses... - Managed spot jobs: - ID NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS - 2 roberta 1x [A100:8] 2 hrs ago 2h 47m 18s 2h 36m 18s 0 RUNNING - 1 bert-qa 1x [V100:1] 4 hrs ago 4h 24m 26s 4h 17m 54s 0 RUNNING - -Stream the logs of a running spot job: - -.. code-block:: console - - $ sky spot logs -n bert-qa # by name - $ sky spot logs 2 # by job ID - -Cancel a spot job: - -.. code-block:: console - - $ sky spot cancel -n bert-qa # by name - $ sky spot cancel 2 # by job ID - -.. note:: - If any failure happens for a spot job, you can check :code:`sky spot queue -a` for the brief reason - of the failure. For more details, it would be helpful to check :code:`sky spot logs --controller `. - -Dashboard ------------ - -Use ``sky spot dashboard`` to open a dashboard to see all jobs: - -.. code-block:: console - - $ sky spot dashboard - -This automatically opens a browser tab to show the dashboard: - -.. image:: ../images/spot-dashboard.png - -The UI shows the same information as the CLI ``sky spot queue -a``. The UI is -especially useful when there are many in-progress jobs to monitor, which the -terminal-based CLI may need more than one page to display. 
- -Real-world examples -------------------------- - -* `Vicuna `_ LLM chatbot: `instructions `_, `YAML `__ -* BERT (shown above): `YAML `__ -* PyTorch DDP, ResNet: `YAML `__ -* PyTorch Lightning DDP, CIFAR-10: `YAML `__ - -Spot controller -------------------------------- - -The spot controller is a small on-demand CPU VM running in the cloud that manages all spot jobs of a user. -It is automatically launched when the first managed spot job is submitted, and it is autostopped after it has been idle for 10 minutes (i.e., after all spot jobs finish and no new spot job is submitted in that duration). -Thus, **no user action is needed** to manage its lifecycle. - -You can see the controller with :code:`sky status` and refresh its status by using the :code:`-r/--refresh` flag. - -While the cost of the spot controller is negligible (~$0.4/hour when running and less than $0.004/hour when stopped), -you can still tear it down manually with -:code:`sky down `, where the ```` can be found in the output of :code:`sky status`. - -.. note:: - Tearing down the spot controller loses all logs and status information for the finished spot jobs. It is only allowed when there are no in-progress spot jobs to ensure no resource leakage. - -Customizing spot controller resources -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You may want to customize the resources of the spot controller for several reasons: - -1. Use a lower-cost controller (if you have a low number of concurrent spot jobs). -2. Enforcing the spot controller to run on a specific location. (Default: cheapest location) -3. Changing the maximum number of spot jobs that can be run concurrently, which is 2x the vCPUs of the controller. (Default: 16) -4. Changing the disk_size of the spot controller to store more logs. (Default: 50GB) - -To achieve the above, you can specify custom configs in :code:`~/.sky/config.yaml` with the following fields: - -.. code-block:: yaml - - spot: - # NOTE: these settings only take effect for a new spot controller, not if - # you have an existing one. - controller: - resources: - # All configs below are optional. - # Specify the location of the spot controller. - cloud: gcp - region: us-central1 - # Specify the maximum number of spot jobs that can be run concurrently. - cpus: 4+ # number of vCPUs, max concurrent spot jobs = 2 * cpus - # Specify the disk_size in GB of the spot controller. - disk_size: 100 - -The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. - -.. note:: - These settings will not take effect if you have an existing controller (either - stopped or live). For them to take effect, tear down the existing controller - first, which requires all in-progress spot jobs to finish or be canceled. - - -Spot Pipeline -------------------------- - -Spot Pipeline is a feature that allows you to submit a spot job that contains a sequence of spot tasks running one after another. -This is useful for running a sequence of jobs that depend on each other, e.g., training a model and then running inference on it. -This allows the multiple tasks to have different resource requirements to fully utilize the resources and save cost, while keeping the burden of managing the tasks off the user. - -.. note:: - A spot job is either a single task or a pipeline of tasks. A spot job is submitted by :code:`sky spot launch`. - - All tasks in a pipeline will be run on spot instances. - -To use Spot Pipeline, you can specify the sequence of jobs in a YAML file. Here is an example: - -.. 
code-block:: yaml - - name: pipeline - - --- - - name: train - - resources: - accelerators: V100:8 - - file_mounts: - /checkpoint: - name: train-eval # NOTE: Fill in your bucket name - mode: MOUNT - - setup: | - echo setup for training - - run: | - echo run for training - echo save checkpoints to /checkpoint - - --- - - name: eval - - resources: - accelerators: T4:1 - - file_mounts: - /checkpoint: - name: train-eval # NOTE: Fill in your bucket name - mode: MOUNT - - setup: | - echo setup for eval - - run: | - echo load trained model from /checkpoint - echo eval model on test set - - -The above YAML file defines a pipeline with two tasks. The first :code:`name: pipeline` names the pipeline. The first task has name :code:`train` and the second task has name :code:`eval`. The tasks are separated by a line with three dashes :code:`---`. Each task has its own :code:`resources`, :code:`setup`, and :code:`run` sections. The :code:`setup` and :code:`run` sections are executed sequentially. - -To submit the pipeline, the same command :code:`sky spot launch` is used. The pipeline will be automatically launched and monitored by SkyPilot. You can check the status of the pipeline with :code:`sky spot queue` or :code:`sky spot dashboard`. - -.. note:: - - The :code:`$SKYPILOT_TASK_ID` environment variable is also available in the :code:`run` section of each task. It is unique for each task in the pipeline. - For example, the :code:`$SKYPILOT_TASK_ID` for the :code:`eval` task above is: - "sky-managed-2022-10-06-05-17-09-750781_pipeline_eval_8-1". - -.. code-block:: console - - $ sky spot launch -n pipeline pipeline.yaml - $ sky spot queue - Fetching managed spot job statuses... - Managed spot jobs - In progress tasks: 1 PENDING, 1 RECOVERING - ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS - 8 pipeline - 50 mins ago 47m 45s - 1 RECOVERING - ↳ 0 train 1x [V100:8] 50 mins ago 47m 45s - 1 RECOVERING - ↳ 1 eval 1x [T4:1] - - - 0 PENDING - +================== + +.. raw:: html + + diff --git a/docs/source/images/job-dashboard.png b/docs/source/images/job-dashboard.png new file mode 100644 index 00000000000..6c25d7d83bc Binary files /dev/null and b/docs/source/images/job-dashboard.png differ diff --git a/docs/source/images/managed-jobs-arch.png b/docs/source/images/managed-jobs-arch.png new file mode 100644 index 00000000000..c78f0680331 Binary files /dev/null and b/docs/source/images/managed-jobs-arch.png differ diff --git a/docs/source/images/spot-controller.png b/docs/source/images/spot-controller.png deleted file mode 100644 index fce9a1d8cc2..00000000000 Binary files a/docs/source/images/spot-controller.png and /dev/null differ diff --git a/docs/source/images/spot-dashboard.png b/docs/source/images/spot-dashboard.png deleted file mode 100644 index 2b322418350..00000000000 Binary files a/docs/source/images/spot-dashboard.png and /dev/null differ diff --git a/docs/source/reference/cli.rst b/docs/source/reference/cli.rst index d4c45ce7b82..985f63482b6 100644 --- a/docs/source/reference/cli.rst +++ b/docs/source/reference/cli.rst @@ -3,8 +3,8 @@ Command Line Interface ====================== -Core CLI ---------- +Cluster CLI +----------- .. _sky-launch: .. click:: sky.cli:launch @@ -41,9 +41,6 @@ Core CLI :prog: sky autostop :nested: full -Job Queue CLI --------------- - .. _sky-queue: .. click:: sky.cli:queue :prog: sky queue @@ -59,7 +56,31 @@ Job Queue CLI :prog: sky cancel :nested: full -Sky Serve CLI +Managed (Spot) Jobs CLI +--------------------------- + +.. 
_sky-job-launch: +.. click:: sky.cli:jobs_launch + :prog: sky jobs launch + :nested: full + +.. _sky-job-queue: +.. click:: sky.cli:jobs_queue + :prog: sky jobs queue + :nested: full + +.. _sky-job-cancel: +.. click:: sky.cli:jobs_cancel + :prog: sky jobs cancel + :nested: full + +.. _sky-job-logs: +.. click:: sky.cli:jobs_logs + :prog: sky jobs logs + :nested: full + + +SkyServe CLI ------------- .. click:: sky.cli:serve_up @@ -82,28 +103,6 @@ Sky Serve CLI :prog: sky serve update :nested: full -Managed Spot Jobs CLI ---------------------------- - -.. _sky-spot-launch: -.. click:: sky.cli:spot_launch - :prog: sky spot launch - :nested: full - -.. _sky-spot-queue: -.. click:: sky.cli:spot_queue - :prog: sky spot queue - :nested: full - -.. _sky-spot-cancel: -.. click:: sky.cli:spot_cancel - :prog: sky spot cancel - :nested: full - -.. _sky-spot-logs: -.. click:: sky.cli:spot_logs - :prog: sky spot logs - :nested: full Storage CLI ------------ diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 9bca0a796d7..1dfda834ee0 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -14,12 +14,12 @@ Available fields and semantics: .. code-block:: yaml - # Custom spot controller resources (optional). + # Custom managed jobs controller resources (optional). # - # These take effects only when a spot controller does not already exist. + # These take effects only when a managed jobs controller does not already exist. # - # Ref: https://skypilot.readthedocs.io/en/latest/examples/spot-jobs.html#customizing-spot-controller-resources - spot: + # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources + jobs: controller: resources: # same spec as 'resources' in a task YAML cloud: gcp @@ -114,7 +114,7 @@ Available fields and semantics: # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to # AWS instances created by SkyPilot. They are used for accessing cloud # resources (e.g., private buckets) or launching new instances (e.g., for - # spot/serve controllers). + # jobs/serve controllers). # # SERVICE_ACCOUNT: Local credential files are not uploaded to AWS # instances. SkyPilot will auto-create and reuse a service account (IAM @@ -125,8 +125,8 @@ Available fields and semantics: # - This only affects AWS instances. Local AWS credentials will still be # uploaded to non-AWS instances (since those instances may need to access # AWS resources). - # - If the SkyPilot spot/serve controller is on AWS, this setting will make - # non-AWS managed spot jobs / non-AWS service replicas fail to access any + # - If the SkyPilot jobs/serve controller is on AWS, this setting will make + # non-AWS managed jobs / non-AWS service replicas fail to access any # resources on AWS (since the controllers don't have AWS credential # files to assign to these non-AWS instances). # @@ -224,7 +224,7 @@ Available fields and semantics: # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to # GCP instances created by SkyPilot. They are used for accessing cloud # resources (e.g., private buckets) or launching new instances (e.g., for - # spot/serve controllers). + # jobs/serve controllers). # # SERVICE_ACCOUNT: Local credential files are not uploaded to GCP # instances. SkyPilot will auto-create and reuse a service account for GCP @@ -235,8 +235,8 @@ Available fields and semantics: # - This only affects GCP instances. 
Local GCP credentials will still be # uploaded to non-GCP instances (since those instances may need to access # GCP resources). - # - If the SkyPilot spot/serve controller is on GCP, this setting will make - # non-GCP managed spot jobs / non-GCP service replicas fail to access any + # - If the SkyPilot jobs/serve controller is on GCP, this setting will make + # non-GCP managed jobs / non-GCP service replicas fail to access any # resources on GCP (since the controllers don't have GCP credential # files to assign to these non-GCP instances). # diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst index 47be2012365..6397c7bbbb6 100644 --- a/docs/source/reference/job-queue.rst +++ b/docs/source/reference/job-queue.rst @@ -1,7 +1,7 @@ .. _job-queue: -Job Queue -========= +Cluster Job Queue +================= SkyPilot's **job queue** allows multiple jobs to be scheduled on a cluster. diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 14b5d428d42..1e56240989c 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -92,10 +92,21 @@ Available fields: # If unspecified, defaults to False (on-demand instances). use_spot: False - # The recovery strategy for spot jobs (optional). - # `use_spot` must be True for this to have any effect. For now, only - # `FAILOVER` strategy is supported. - spot_recovery: none + # The recovery strategy for managed jobs (optional). + # + # In effect for managed jobs. Possible values are `FAILOVER` and `EAGER_NEXT_REGION`. + # + # If `FAILOVER` is specified, the job will be restarted in the same region + # if the node fails, and go to the next region if no available resources + # are found in the same region. + # + # If `EAGER_NEXT_REGION` is specified, the job will go to the next region + # directly if the node fails. This is useful for spot instances, as in + # practice, preemptions in a region usually indicate a shortage of resources + # in that region. + # + # default: EAGER_NEXT_REGION + job_recovery: none # Disk size in GB to allocate for OS (mounted at /). Increase this if you # have a large working directory or tasks that write out large outputs. diff --git a/examples/managed_job.yaml b/examples/managed_job.yaml new file mode 100644 index 00000000000..4bfcb63f40a --- /dev/null +++ b/examples/managed_job.yaml @@ -0,0 +1,16 @@ +name: minimal + +setup: | + echo "running setup" + pip install tqdm + +run: | + conda env list + python -u - << EOF + import time + import tqdm + + for i in tqdm.trange(240): + time.sleep(1) + + EOF diff --git a/examples/managed_spot_with_storage.yaml b/examples/managed_job_with_storage.yaml similarity index 83% rename from examples/managed_spot_with_storage.yaml rename to examples/managed_job_with_storage.yaml index 1b81e459bb4..ecefccd8b3d 100644 --- a/examples/managed_spot_with_storage.yaml +++ b/examples/managed_job_with_storage.yaml @@ -3,13 +3,13 @@ # Runs a task that uses cloud buckets for uploading and accessing files. 
# # Usage: -# sky spot launch -c spot-storage examples/managed_spot_with_storage.yaml +# sky spot launch -c spot-storage examples/managed_job_with_storage.yaml # sky down spot-storage resources: cloud: aws use_spot: true - spot_recovery: failover + job_recovery: failover workdir: ./examples @@ -41,8 +41,8 @@ file_mounts: run: | set -ex - ls ~/sky_workdir/managed_spot_with_storage.yaml - ls ~/bucket_workdir/managed_spot_with_storage.yaml + ls ~/sky_workdir/managed_job_with_storage.yaml + ls ~/bucket_workdir/managed_job_with_storage.yaml ls -l /imagenet-image/datasets diff --git a/examples/managed_spot.yaml b/examples/managed_spot.yaml index 4bfcb63f40a..712819eb9ca 100644 --- a/examples/managed_spot.yaml +++ b/examples/managed_spot.yaml @@ -1,5 +1,8 @@ name: minimal +resources: + use_spot: true + setup: | echo "running setup" pip install tqdm diff --git a/sky/__init__.py b/sky/__init__.py index d25c8297ea5..a077fb8966a 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -102,16 +102,16 @@ def set_proxy_env_var(proxy_var: str, urllib_var: Optional[str]): from sky.data import StoreType from sky.execution import exec # pylint: disable=redefined-builtin from sky.execution import launch +# TODO (zhwu): These imports are for backward compatibility, and spot APIs +# should be called with `sky.spot.xxx` instead. Remove in release 0.8.0 +from sky.jobs.core import spot_cancel +from sky.jobs.core import spot_launch +from sky.jobs.core import spot_queue +from sky.jobs.core import spot_tail_logs from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget from sky.resources import Resources from sky.skylet.job_lib import JobStatus -# TODO (zhwu): These imports are for backward compatibility, and spot APIs -# should be called with `sky.spot.xxx` instead. Remove in release 0.7.0 -from sky.spot.core import spot_cancel -from sky.spot.core import spot_launch -from sky.spot.core import spot_queue -from sky.spot.core import spot_tail_logs from sky.status_lib import ClusterStatus from sky.task import Task diff --git a/sky/authentication.py b/sky/authentication.py index 7b5699d3337..581fdc12c7f 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -15,7 +15,7 @@ The local machine's public key should not be uploaded to the `~/.ssh/sky-key.pub` on the remote VM, because it will cause private/public key pair mismatch when the user tries to launch new VM from that remote VM -using SkyPilot, e.g., the node is used as a spot controller. (Lambda cloud +using SkyPilot, e.g., the node is used as a jobs controller. (Lambda cloud is an exception, due to the limitation of the cloud provider. See the comments in setup_lambda_authentication) """ diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 5aed22b05ed..fecbcaad0b8 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2239,18 +2239,18 @@ def check_cluster_available( # TODO(tian): Refactor to controller_utils. Current blocker: circular import. def is_controller_accessible( - controller_type: controller_utils.Controllers, + controller: controller_utils.Controllers, stopped_message: str, non_existent_message: Optional[str] = None, exit_if_not_accessible: bool = False, ) -> 'backends.CloudVmRayResourceHandle': - """Check if the spot/serve controller is up. + """Check if the jobs/serve controller is up. The controller is accessible when it is in UP or INIT state, and the ssh connection is successful. 
It can be used to check if the controller is accessible (since the autostop - is set for the controller) before the spot/serve commands interact with the + is set for the controller) before the jobs/serve commands interact with the controller. ClusterNotUpError will be raised whenever the controller cannot be accessed. @@ -2274,10 +2274,8 @@ def is_controller_accessible( failed to be connected. """ if non_existent_message is None: - non_existent_message = ( - controller_type.value.default_hint_if_non_existent) - cluster_name = controller_type.value.cluster_name - controller_name = controller_type.value.name.replace(' controller', '') + non_existent_message = controller.value.default_hint_if_non_existent + cluster_name = controller.value.cluster_name need_connection_check = False controller_status, handle = None, None try: @@ -2299,7 +2297,7 @@ def is_controller_accessible( # will not start the controller manually from the cloud console. # # The acquire_lock_timeout is set to 0 to avoid hanging the command when - # multiple spot.launch commands are running at the same time. Our later + # multiple jobs.launch commands are running at the same time. Our later # code will check if the controller is accessible by directly checking # the ssh connection to the controller, if it fails to get accurate # status of the controller. @@ -2311,6 +2309,7 @@ def is_controller_accessible( # We do not catch the exceptions related to the cluster owner identity # mismatch, please refer to the comment in # `backend_utils.check_cluster_available`. + controller_name = controller.value.name.replace(' controller', '') logger.warning( 'Failed to get the status of the controller. It is not ' f'fatal, but {controller_name} commands/calls may hang or return ' @@ -2336,7 +2335,7 @@ def is_controller_accessible( elif (controller_status == status_lib.ClusterStatus.INIT or need_connection_check): # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the - # status, both of which can happen when controller's status lock is held by another `sky spot launch` or + # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable, # we can allow access to the controller. ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml, @@ -2347,7 +2346,7 @@ def is_controller_accessible( **ssh_credentials, port=handle.head_ssh_port) if not runner.check_connection(): - error_msg = controller_type.value.connection_error_hint + error_msg = controller.value.connection_error_hint else: assert controller_status == status_lib.ClusterStatus.UP, handle @@ -2386,7 +2385,7 @@ def get_clusters( of the clusters. Args: - include_controller: Whether to include controllers, e.g. spot controller + include_controller: Whether to include controllers, e.g. jobs controller or sky serve controller. refresh: Whether to refresh the status of the clusters. (Refreshing will set the status to STOPPED if the cluster cannot be pinged.) @@ -2546,8 +2545,8 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: optionally accelerator demands. """ # TODO: Custom CPU and other memory resources are not supported yet. - # For sky spot/serve controller task, we set the CPU resource to a smaller - # value to support a larger number of spot jobs and services. + # For sky jobs/serve controller task, we set the CPU resource to a smaller + # value to support a larger number of managed jobs and services. 
resources_dict = { 'CPU': (constants.CONTROLLER_PROCESS_CPU_DEMAND if task.is_controller_task() else DEFAULT_TASK_CPU_DEMAND) @@ -2564,41 +2563,58 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: return resources_dict -def get_task_resources_str(task: 'task_lib.Task') -> str: +def get_task_resources_str(task: 'task_lib.Task', + is_managed_job: bool = False) -> str: """Returns the resources string of the task. The resources string is only used as a display purpose, so we only show the accelerator demands (if any). Otherwise, the CPU demand is shown. """ - task_cpu_demand = (constants.CONTROLLER_PROCESS_CPU_DEMAND if - task.is_controller_task() else DEFAULT_TASK_CPU_DEMAND) + spot_str = '' + task_cpu_demand = (str(constants.CONTROLLER_PROCESS_CPU_DEMAND) + if task.is_controller_task() else + str(DEFAULT_TASK_CPU_DEMAND)) if task.best_resources is not None: accelerator_dict = task.best_resources.accelerators + if is_managed_job: + if task.best_resources.use_spot: + spot_str = '[Spot]' + task_cpu_demand = task.best_resources.cpus if accelerator_dict is None: resources_str = f'CPU:{task_cpu_demand}' else: resources_str = ', '.join( f'{k}:{v}' for k, v in accelerator_dict.items()) - elif len(task.resources) == 1: - resources_dict = list(task.resources)[0].accelerators - if resources_dict is None: - resources_str = f'CPU:{task_cpu_demand}' - else: - resources_str = ', '.join( - f'{k}:{v}' for k, v in resources_dict.items()) else: resource_accelerators = [] + min_cpus = float('inf') + spot_type: Set[str] = set() for resource in task.resources: + task_cpu_demand = '1+' + if resource.cpus is not None: + task_cpu_demand = resource.cpus + min_cpus = min(min_cpus, float(task_cpu_demand.strip('+ '))) + if resource.use_spot: + spot_type.add('Spot') + else: + spot_type.add('On-demand') + if resource.accelerators is None: continue for k, v in resource.accelerators.items(): resource_accelerators.append(f'{k}:{v}') + if is_managed_job: + if len(task.resources) > 1: + task_cpu_demand = f'{min_cpus}+' + if 'Spot' in spot_type: + spot_str = '|'.join(sorted(spot_type)) + spot_str = f'[{spot_str}]' if resource_accelerators: resources_str = ', '.join(set(resource_accelerators)) else: resources_str = f'CPU:{task_cpu_demand}' - resources_str = f'{task.num_nodes}x [{resources_str}]' + resources_str = f'{task.num_nodes}x[{resources_str}]{spot_str}' return resources_str diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3196c45da55..f916d931b5f 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -28,12 +28,12 @@ from sky import clouds from sky import exceptions from sky import global_user_state +from sky import jobs as managed_jobs from sky import optimizer from sky import provision as provision_lib from sky import resources as resources_lib from sky import serve as serve_lib from sky import sky_logging -from sky import spot as spot_lib from sky import status_lib from sky import task as task_lib from sky.backends import backend_utils @@ -3115,7 +3115,7 @@ def _exec_code_on_head( codegen: str, job_id: int, detach_run: bool = False, - spot_dag: Optional['dag.Dag'] = None, + managed_job_dag: Optional['dag.Dag'] = None, ) -> None: """Executes generated code on the head node.""" style = colorama.Style @@ -3145,22 +3145,24 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) - if spot_dag is not None: - # Add 
the spot job to spot queue table. - spot_codegen = spot_lib.SpotCodeGen() - spot_code = spot_codegen.set_pending(job_id, spot_dag) - # Set the spot job to PENDING state to make sure that this spot - # job appears in the `sky spot queue`, when there are already 16 - # controller process jobs running on the controller VM with 8 - # CPU cores. - # The spot job should be set to PENDING state *after* the + if managed_job_dag is not None: + # Add the managed job to job queue database. + managed_job_codegen = managed_jobs.ManagedJobCodeGen() + managed_job_code = managed_job_codegen.set_pending( + job_id, managed_job_dag) + # Set the managed job to PENDING state to make sure that this + # managed job appears in the `sky jobs queue`, when there are + # already 2x vCPU controller processes running on the controller VM, + # e.g., 16 controller processes running on a controller with 8 + # vCPUs. + # The managed job should be set to PENDING state *after* the # controller process job has been queued, as our skylet on spot - # controller will set the spot job in FAILED state if the + # controller will set the managed job in FAILED state if the # controller process job does not exist. - # We cannot set the spot job to PENDING state in the codegen for + # We cannot set the managed job to PENDING state in the codegen for # the controller process job, as it will stay in the job pending # table and not be executed until there is an empty slot. - job_submit_cmd = job_submit_cmd + ' && ' + spot_code + job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code returncode, stdout, stderr = self.run_on_head(handle, job_submit_cmd, @@ -3181,8 +3183,9 @@ def _exec_code_on_head( try: if not detach_run: - if handle.cluster_name == spot_lib.SPOT_CONTROLLER_NAME: - self.tail_spot_logs(handle, job_id) + if (handle.cluster_name in controller_utils.Controllers. + JOBS_CONTROLLER.value.candidate_cluster_names): + self.tail_managed_job_logs(handle, job_id) else: # Sky logs. Not using subprocess.run since it will make the # ssh keep connected after ctrl-c. 
@@ -3190,24 +3193,24 @@ def _exec_code_on_head( finally: name = handle.cluster_name controller = controller_utils.Controllers.from_name(name) - if controller == controller_utils.Controllers.SPOT_CONTROLLER: + if controller == controller_utils.Controllers.JOBS_CONTROLLER: logger.info( - f'{fore.CYAN}Spot Job ID: ' + f'{fore.CYAN}Managed Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t\t' - f'{backend_utils.BOLD}sky spot cancel {job_id}' + f'{backend_utils.BOLD}sky jobs cancel {job_id}' f'{backend_utils.RESET_BOLD}' '\nTo stream job logs:\t\t' - f'{backend_utils.BOLD}sky spot logs {job_id}' + f'{backend_utils.BOLD}sky jobs logs {job_id}' f'{backend_utils.RESET_BOLD}' f'\nTo stream controller logs:\t' - f'{backend_utils.BOLD}sky spot logs --controller {job_id}' + f'{backend_utils.BOLD}sky jobs logs --controller {job_id}' f'{backend_utils.RESET_BOLD}' - '\nTo view all spot jobs:\t\t' - f'{backend_utils.BOLD}sky spot queue' + '\nTo view all managed jobs:\t' + f'{backend_utils.BOLD}sky jobs queue' f'{backend_utils.RESET_BOLD}' - '\nTo view the spot job dashboard:\t' - f'{backend_utils.BOLD}sky spot dashboard' + '\nTo view managed job dashboard:\t' + f'{backend_utils.BOLD}sky jobs dashboard' f'{backend_utils.RESET_BOLD}') elif controller is None: logger.info(f'{fore.CYAN}Job ID: ' @@ -3537,12 +3540,12 @@ def _rsync_down(args) -> None: def tail_logs(self, handle: CloudVmRayResourceHandle, job_id: Optional[int], - spot_job_id: Optional[int] = None, + managed_job_id: Optional[int] = None, follow: bool = True) -> int: code = job_lib.JobLibCodeGen.tail_logs(job_id, - spot_job_id=spot_job_id, + managed_job_id=managed_job_id, follow=follow) - if job_id is None and spot_job_id is None: + if job_id is None and managed_job_id is None: logger.info( 'Job ID not provided. Streaming the logs of the latest job.') @@ -3569,17 +3572,19 @@ def tail_logs(self, returncode = e.code return returncode - def tail_spot_logs(self, - handle: CloudVmRayResourceHandle, - job_id: Optional[int] = None, - job_name: Optional[str] = None, - follow: bool = True) -> None: + def tail_managed_job_logs(self, + handle: CloudVmRayResourceHandle, + job_id: Optional[int] = None, + job_name: Optional[str] = None, + follow: bool = True) -> None: # if job_name is not None, job_id should be None assert job_name is None or job_id is None, (job_name, job_id) if job_name is not None: - code = spot_lib.SpotCodeGen.stream_logs_by_name(job_name, follow) + code = managed_jobs.ManagedJobCodeGen.stream_logs_by_name( + job_name, follow) else: - code = spot_lib.SpotCodeGen.stream_logs_by_id(job_id, follow) + code = managed_jobs.ManagedJobCodeGen.stream_logs_by_id( + job_id, follow) # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly # kill the process, so we need to handle it manually here. @@ -4565,8 +4570,8 @@ def _get_task_env_vars(self, task: task_lib.Task, job_id: int, handle: CloudVmRayResourceHandle) -> Dict[str, str]: """Returns the environment variables for the task.""" env_vars = task.envs.copy() - # If it is a managed spot job, the TASK_ID_ENV_VAR will have been - # already set by the controller. + # If it is a managed job, the TASK_ID_ENV_VAR will have been already set + # by the controller. 
if constants.TASK_ID_ENV_VAR not in env_vars: env_vars[ constants.TASK_ID_ENV_VAR] = common_utils.get_global_job_id( @@ -4618,7 +4623,7 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, codegen.build(), job_id, detach_run=detach_run, - spot_dag=task.spot_dag) + managed_job_dag=task.managed_job_dag) def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, @@ -4673,4 +4678,4 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, codegen.build(), job_id, detach_run=detach_run, - spot_dag=task.spot_dag) + managed_job_dag=task.managed_job_dag) diff --git a/sky/cli.py b/sky/cli.py index 72667cffc97..485703e4caf 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,10 +51,10 @@ from sky import core from sky import exceptions from sky import global_user_state +from sky import jobs as managed_jobs from sky import provision as provision_lib from sky import serve as serve_lib from sky import sky_logging -from sky import spot as spot_lib from sky import status_lib from sky.adaptors import common as adaptors_common from sky.backends import backend_utils @@ -91,9 +91,9 @@ provision a new cluster with that name. Otherwise provision a new cluster with an autogenerated name.""" -# The maximum number of in-progress spot jobs to show in the status +# The maximum number of in-progress managed jobs to show in the status # command. -_NUM_SPOT_JOBS_TO_SHOW_IN_STATUS = 5 +_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = ( '{cluster_num} cluster{plural} {verb}. Please specify {cause} ' @@ -103,7 +103,7 @@ 'please retry after a while.') _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' - '`sky spot launch`. `{command}` supports a ' + '`sky jobs launch`. `{command}` supports a ' 'single task only.') @@ -708,8 +708,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides( ports: Optional[Tuple[str]] = None, env: Optional[List[Tuple[str, str]]] = None, field_to_ignore: Optional[List[str]] = None, - # spot launch specific - spot_recovery: Optional[str] = None, + # job launch specific + job_recovery: Optional[str] = None, ) -> Union[sky.Task, sky.Dag]: """Creates a task or a dag from an entrypoint with overrides. @@ -777,9 +777,9 @@ def _make_task_or_dag_from_entrypoint_with_overrides( if workdir is not None: task.workdir = workdir - # Spot launch specific. - if spot_recovery is not None: - override_params['spot_recovery'] = spot_recovery + # job launch specific. + if job_recovery is not None: + override_params['job_recovery'] = job_recovery task.set_resources_override(override_params) @@ -816,13 +816,30 @@ def get_help(self, ctx): return super().get_help(ctx) -def _with_deprecation_warning(f, original_name, alias_name): +def _with_deprecation_warning( + f, + original_name: str, + alias_name: str, + override_command_argument: Optional[Dict[str, Any]] = None): @functools.wraps(f) def wrapper(self, *args, **kwargs): + override_str = '' + if override_command_argument is not None: + overrides = [] + for k, v in override_command_argument.items(): + if isinstance(v, bool): + if v: + overrides.append(f'--{k}') + else: + overrides.append(f'--no-{k}') + else: + overrides.append(f'--{k.replace("_", "-")}={v}') + override_str = ' with additional arguments ' + ' '.join(overrides) click.secho( - f'WARNING: `{alias_name}` is deprecated and will be removed in a ' - f'future release. 
Please use `{original_name}` instead.\n', + f'WARNING: `{alias_name}` has been renamed to `{original_name}` ' + f'and will be removed in a future release. Please use the ' + f'latter{override_str} instead.\n', err=True, fg='yellow') return f(self, *args, **kwargs) @@ -830,17 +847,49 @@ def wrapper(self, *args, **kwargs): return wrapper -def _add_command_alias_to_group(group, command, name, hidden): +def _override_arguments(callback, override_command_argument: Dict[str, Any]): + + def wrapper(*args, **kwargs): + logger.info(f'Overriding arguments: {override_command_argument}') + kwargs.update(override_command_argument) + return callback(*args, **kwargs) + + return wrapper + + +def _add_command_alias( + group: click.Group, + command: click.Command, + hidden: bool = False, + new_group: Optional[click.Group] = None, + new_command_name: Optional[str] = None, + override_command_argument: Optional[Dict[str, Any]] = None, + with_warning: bool = True, +) -> None: """Add a alias of a command to a group.""" + if new_group is None: + new_group = group + if new_command_name is None: + new_command_name = command.name + if new_group == group and new_command_name == command.name: + raise ValueError('Cannot add an alias to the same command.') new_command = copy.deepcopy(command) new_command.hidden = hidden - new_command.name = name + new_command.name = new_command_name + + if override_command_argument: + new_command.callback = _override_arguments(new_command.callback, + override_command_argument) orig = f'sky {group.name} {command.name}' - alias = f'sky {group.name} {name}' - new_command.invoke = _with_deprecation_warning(new_command.invoke, orig, - alias) - group.add_command(new_command, name=name) + alias = f'sky {new_group.name} {new_command_name}' + if with_warning: + new_command.invoke = _with_deprecation_warning( + new_command.invoke, + orig, + alias, + override_command_argument=override_command_argument) + new_group.add_command(new_command, name=new_command_name) def _deprecate_and_hide_command(group, command_to_deprecate, @@ -1219,30 +1268,30 @@ def exec( sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run) -def _get_spot_jobs( +def _get_managed_jobs( refresh: bool, skip_finished: bool, show_all: bool, limit_num_jobs_to_show: bool = False, is_called_by_user: bool = False) -> Tuple[Optional[int], str]: - """Get the in-progress spot jobs. + """Get the in-progress managed jobs. Args: - refresh: Query the latest statuses, restarting the spot controller if + refresh: Query the latest statuses, restarting the jobs controller if stopped. skip_finished: Show only in-progress jobs. - show_all: Show all information of each spot job (e.g., region, price). + show_all: Show all information of each job (e.g., region, price). limit_num_jobs_to_show: If True, limit the number of jobs to show to - _NUM_SPOT_JOBS_TO_SHOW_IN_STATUS, which is mainly used by + _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by `sky status`. is_called_by_user: If this function is called by user directly, or an internal call. Returns: A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None, - it means there is an error when querying the spot jobs. In this case, + it means there is an error when querying the managed jobs. In this case, msg contains the error message. Otherwise, msg contains the formatted - spot job table. + managed job table. 
""" num_in_progress_jobs = None try: @@ -1250,32 +1299,51 @@ def _get_spot_jobs( usage_lib.messages.usage.set_internal() with sky_logging.silent(): # Make the call silent - spot_jobs = spot_lib.queue(refresh=refresh, - skip_finished=skip_finished) - num_in_progress_jobs = len(spot_jobs) + managed_jobs_ = managed_jobs.queue(refresh=refresh, + skip_finished=skip_finished) + num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_)) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status msg = str(e) if controller_status is None: - msg += (f' (See: {colorama.Style.BRIGHT}sky spot -h' + msg += (f' (See: {colorama.Style.BRIGHT}sky jobs -h' f'{colorama.Style.RESET_ALL})') elif (controller_status == status_lib.ClusterStatus.STOPPED and is_called_by_user): - msg += (f' (See finished jobs: {colorama.Style.BRIGHT}' - f'sky spot queue --refresh{colorama.Style.RESET_ALL})') + msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}' + f'sky jobs queue --refresh{colorama.Style.RESET_ALL})') except RuntimeError as e: - msg = ('Failed to query spot jobs due to connection ' - 'issues. Try again later. ' - f'Details: {common_utils.format_exception(e, use_bracket=True)}') + msg = '' + try: + # Check the controller status again, as the RuntimeError is likely + # due to the controller being autostopped when querying the jobs. + controller_type = controller_utils.Controllers.JOBS_CONTROLLER + record = backend_utils.refresh_cluster_record( + controller_type.value.cluster_name, + cluster_status_lock_timeout=0) + if (record is None or + record['status'] == status_lib.ClusterStatus.STOPPED): + msg = controller_type.value.default_hint_if_non_existent + except Exception: # pylint: disable=broad-except + # This is to an best effort to find the latest controller status to + # print more helpful message, so we can ignore any exception to + # print the original error. + pass + if not msg: + msg = ( + 'Failed to query managed jobs due to connection ' + 'issues. Try again later. ' + f'Details: {common_utils.format_exception(e, use_bracket=True)}' + ) except Exception as e: # pylint: disable=broad-except - msg = ('Failed to query spot jobs: ' + msg = ('Failed to query managed jobs: ' f'{common_utils.format_exception(e, use_bracket=True)}') else: - max_jobs_to_show = (_NUM_SPOT_JOBS_TO_SHOW_IN_STATUS + max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS if limit_num_jobs_to_show else None) - msg = spot_lib.format_job_table(spot_jobs, - show_all=show_all, - max_jobs=max_jobs_to_show) + msg = managed_jobs.format_job_table(managed_jobs_, + show_all=show_all, + max_jobs=max_jobs_to_show) return num_in_progress_jobs, msg @@ -1314,9 +1382,27 @@ def _get_services(service_names: Optional[List[str]], msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h' f'{colorama.Style.RESET_ALL})') except RuntimeError as e: - msg = ('Failed to fetch service statuses due to connection issues. ' - 'Please try again later. Details: ' - f'{common_utils.format_exception(e, use_bracket=True)}') + msg = '' + try: + # Check the controller status again, as the RuntimeError is likely + # due to the controller being autostopped when querying the + # services. 
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER + record = backend_utils.refresh_cluster_record( + controller_type.value.cluster_name, + cluster_status_lock_timeout=0) + if (record is None or + record['status'] == status_lib.ClusterStatus.STOPPED): + msg = controller_type.value.default_hint_if_non_existent + except Exception: # pylint: disable=broad-except + # This is to an best effort to find the latest controller status to + # print more helpful message, so we can ignore any exception to + # print the original error. + pass + if not msg: + msg = ('Failed to fetch service statuses due to connection issues. ' + 'Please try again later. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') except Exception as e: # pylint: disable=broad-except msg = ('Failed to fetch service statuses: ' f'{common_utils.format_exception(e, use_bracket=True)}') @@ -1380,11 +1466,11 @@ def _get_services(service_names: Optional[List[str]], type=int, help=('Get the endpoint URL for the specified port number on the ' 'cluster. This option will override all other options.')) -@click.option('--show-spot-jobs/--no-show-spot-jobs', +@click.option('--show-managed-jobs/--no-show-managed-jobs', default=True, is_flag=True, required=False, - help='Also show recent in-progress spot jobs, if any.') + help='Also show recent in-progress managed jobs, if any.') @click.option('--show-services/--no-show-services', default=True, is_flag=True, @@ -1398,8 +1484,8 @@ def _get_services(service_names: Optional[List[str]], @usage_lib.entrypoint # pylint: disable=redefined-builtin def status(all: bool, refresh: bool, ip: bool, endpoints: bool, - endpoint: Optional[int], show_spot_jobs: bool, show_services: bool, - clusters: List[str]): + endpoint: Optional[int], show_managed_jobs: bool, + show_services: bool, clusters: List[str]): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Show clusters. @@ -1458,19 +1544,20 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, or for autostop-enabled clusters, use ``--refresh`` to query the latest cluster statuses from the cloud providers. """ - # Using a pool with 2 worker to run the spot job query and sky serve service - # query in parallel to speed up. The pool provides a AsyncResult object that - # can be used as a future. + # Using a pool with 2 worker to run the managed job query and sky serve + # service query in parallel to speed up. The pool provides a AsyncResult + # object that can be used as a future. with multiprocessing.Pool(2) as pool: - # Do not show spot queue if user specifies clusters, and if user + # Do not show job queue if user specifies clusters, and if user # specifies --ip or --endpoint(s). - show_spot_jobs = show_spot_jobs and not any([clusters, ip, endpoints]) + show_managed_jobs = show_managed_jobs and not any( + [clusters, ip, endpoints]) show_endpoints = endpoints or endpoint is not None show_single_endpoint = endpoint is not None - if show_spot_jobs: - # Run the spot job query in parallel to speed up the status query. - spot_jobs_future = pool.apply_async( - _get_spot_jobs, + if show_managed_jobs: + # Run managed job query in parallel to speed up the status query. 
+ managed_jobs_future = pool.apply_async( + _get_managed_jobs, kwds=dict(refresh=False, skip_finished=True, show_all=False, @@ -1655,16 +1742,16 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: interrupted = True return interrupted, result - spot_jobs_query_interrupted = False - if show_spot_jobs: + managed_jobs_query_interrupted = False + if show_managed_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Managed spot jobs{colorama.Style.RESET_ALL}') - with rich_utils.safe_status('[cyan]Checking spot jobs[/]'): - spot_jobs_query_interrupted, result = _try_get_future_result( - spot_jobs_future) - if spot_jobs_query_interrupted: + f'Managed jobs{colorama.Style.RESET_ALL}') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): + managed_jobs_query_interrupted, result = _try_get_future_result( + managed_jobs_future) + if managed_jobs_query_interrupted: # Set to -1, so that the controller is not considered - # down, and the hint for showing sky spot queue + # down, and the hint for showing sky jobs queue # will still be shown. num_in_progress_jobs = -1 msg = 'KeyboardInterrupt' @@ -1673,29 +1760,30 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: click.echo(msg) if num_in_progress_jobs is not None: - # spot controller is UP. + # jobs controller is UP. job_info = '' if num_in_progress_jobs > 0: plural_and_verb = ' is' if num_in_progress_jobs > 1: plural_and_verb = 's are' job_info = ( - f'{num_in_progress_jobs} spot job{plural_and_verb} ' + f'{num_in_progress_jobs} managed job{plural_and_verb} ' 'in progress') - if num_in_progress_jobs > _NUM_SPOT_JOBS_TO_SHOW_IN_STATUS: + if (num_in_progress_jobs > + _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS): job_info += ( - f' ({_NUM_SPOT_JOBS_TO_SHOW_IN_STATUS} latest ones ' - 'shown)') + f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest ' + 'ones shown)') job_info += '. ' hints.append( - controller_utils.Controllers.SPOT_CONTROLLER.value. + controller_utils.Controllers.JOBS_CONTROLLER.value. in_progress_hint.format(job_info=job_info)) if show_services: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Services{colorama.Style.RESET_ALL}') num_services = None - if spot_jobs_query_interrupted: + if managed_jobs_query_interrupted: # The pool is terminated, so we cannot run the service query. msg = 'KeyboardInterrupt' else: @@ -1712,7 +1800,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. value.in_progress_hint) - if show_spot_jobs or show_services: + if show_managed_jobs or show_services: try: pool.close() pool.join() @@ -1983,7 +2071,7 @@ def logs( help='Skip confirmation prompt.') @click.argument('jobs', required=False, type=int, nargs=-1) @usage_lib.entrypoint -def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disable=redefined-builtin +def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disable=redefined-builtin, redefined-outer-name # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Cancel job(s). @@ -2382,7 +2470,7 @@ def start( if not to_start: return - # Checks for controller clusters (spot controller / sky serve controller). + # Checks for controller clusters (jobs controller / sky serve controller). 
controllers, normal_clusters = [], [] for name in to_start: if controller_utils.Controllers.from_name(name) is not None: @@ -2501,14 +2589,15 @@ def down( purge=purge) -def _hint_or_raise_for_down_spot_controller(controller_name: str): +def _hint_or_raise_for_down_jobs_controller(controller_name: str): controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name with rich_utils.safe_status( - '[bold cyan]Checking for in-progress spot jobs[/]'): + '[bold cyan]Checking for in-progress managed jobs[/]'): try: - spot_jobs = spot_lib.queue(refresh=False, skip_finished=True) + managed_jobs_ = managed_jobs.queue(refresh=False, + skip_finished=True) except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): @@ -2517,21 +2606,21 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): decline_down_when_failed_to_fetch_status_hint) if e.cluster_status is None: click.echo( - 'Managed spot controller has already been torn down.') + 'Managed jobs controller has already been torn down.') sys.exit(0) - # At this point, the spot jobs are failed to be fetched due to the - # controller being STOPPED or being firstly launched, i.e., there is - # no in-prgress spot jobs. - spot_jobs = [] + # At this point, the managed jobs are failed to be fetched due to + # the controller being STOPPED or being firstly launched, i.e., + # there is no in-prgress managed jobs. + managed_jobs_ = [] msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - 'spot controller. Please be aware of the following:' + 'jobs controller. Please be aware of the following:' f'{colorama.Style.RESET_ALL}' - '\n * All logs and status information of the spot ' - 'jobs (output of `sky spot queue`) will be lost.') + '\n * All logs and status information of the managed ' + 'jobs (output of `sky jobs queue`) will be lost.') click.echo(msg) - if spot_jobs: - job_table = spot_lib.format_job_table(spot_jobs, show_all=False) + if managed_jobs_: + job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False) msg = controller.value.decline_down_for_dirty_controller_hint # Add prefix to each line to align with the bullet point. msg += '\n'.join( @@ -2539,7 +2628,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError(msg) else: - click.echo(' * No in-progress spot jobs found. It should be safe to ' + click.echo(' * No in-progress managed jobs found. It should be safe to ' 'terminate (see caveats above).') @@ -2575,8 +2664,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): _CONTROLLER_TO_HINT_OR_RAISE = { - controller_utils.Controllers.SPOT_CONTROLLER: - (_hint_or_raise_for_down_spot_controller), + controller_utils.Controllers.JOBS_CONTROLLER: + (_hint_or_raise_for_down_jobs_controller), controller_utils.Controllers.SKY_SERVE_CONTROLLER: (_hint_or_raise_for_down_sky_serve_controller), } @@ -2591,9 +2680,9 @@ def _down_or_stop_clusters( idle_minutes_to_autostop: Optional[int] = None) -> None: """Tears down or (auto-)stops a cluster (or all clusters). - Controllers (spot controller and sky serve controller) can only be + Controllers (jobs controller and sky serve controller) can only be terminated if the cluster name is explicitly and uniquely specified (not - via glob) and purge is set to True. + via glob). 
""" if down: command = 'down' @@ -2662,10 +2751,10 @@ def _down_or_stop_clusters( # TODO(zhwu): This hint or raise is not transactional, which # means even if it passed the check with no in-progress spot # or service and prompt the confirmation for termination, - # a user could still do a `sky spot launch` or a + # a user could still do a `sky jobs launch` or a # `sky serve up` before typing the delete, causing a leaked - # spot job or service. We should make this check atomic with - # the termination. + # managed job or service. We should make this check atomic + # with the termination. hint_or_raise(controller_name) except exceptions.ClusterOwnerIdentityMismatchError as e: if purge: @@ -3147,12 +3236,12 @@ def bench(): @cli.group(cls=_NaturalOrderGroup) -def spot(): - """Managed Spot CLI (spot instances with auto-recovery).""" +def jobs(): + """Managed Jobs CLI (jobs with auto-recovery).""" pass -@spot.command('launch', cls=_DocumentedCodeCommand) +@jobs.command('launch', cls=_DocumentedCodeCommand) @click.argument('entrypoint', required=True, type=str, @@ -3160,10 +3249,10 @@ def spot(): **_get_shell_complete_args(_complete_file_name)) # TODO(zhwu): Add --dryrun option to test the launch command. @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) -@click.option('--spot-recovery', +@click.option('--job-recovery', default=None, type=str, - help='Spot recovery strategy to use for the managed spot task.') + help='Recovery strategy to use for managed jobs.') @click.option( '--detach-run', '-d', @@ -3181,8 +3270,8 @@ def spot(): '(Default: True; this flag is deprecated and will be removed in a ' 'future release.) Whether to retry provisioning infinitely until the ' 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes - 'applies to launching the spot clusters (both the initial and any ' - 'recovery attempts), not the spot controller.')) + 'applies to launching all managed jobs (both the initial and ' + 'any recovery attempts), not the jobs controller.')) @click.option('--yes', '-y', is_flag=True, @@ -3191,7 +3280,7 @@ def spot(): help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint -def spot_launch( +def jobs_launch( entrypoint: List[str], name: Optional[str], workdir: Optional[str], @@ -3205,7 +3294,7 @@ def spot_launch( num_nodes: Optional[int], use_spot: Optional[bool], image_id: Optional[str], - spot_recovery: Optional[str], + job_recovery: Optional[str], env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]], disk_size: Optional[int], @@ -3215,7 +3304,7 @@ def spot_launch( retry_until_up: bool, yes: bool, ): - """Launch a managed spot job from a YAML or a command. + """Launch a managed job from a YAML or a command. If ENTRYPOINT points to a valid YAML file, it is read in as the task specification. Otherwise, it is interpreted as a bash command. @@ -3225,9 +3314,9 @@ def spot_launch( .. code-block:: bash # You can use normal task YAMLs. - sky spot launch task.yaml + sky jobs launch task.yaml - sky spot launch 'echo hello!' + sky jobs launch 'echo hello!' """ env = _merge_env_vars(env_file, env) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -3248,16 +3337,17 @@ def spot_launch( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - spot_recovery=spot_recovery, + job_recovery=job_recovery, ) - # Deprecation. + # Deprecation. We set the default behavior to be retry until up, and the + # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0. 
if retry_until_up is not None: flag_str = '--retry-until-up' if not retry_until_up: flag_str = '--no-retry-until-up' click.secho( f'Flag {flag_str} is deprecated and will be removed in a ' - 'future release (managed spot jobs will always be retried). ' + 'future release (managed jobs will always be retried). ' 'Please file an issue if this does not work for you.', fg='yellow') else: @@ -3275,27 +3365,26 @@ def spot_launch( dag.name = name dag_utils.maybe_infer_and_fill_dag_and_task_names(dag) - dag_utils.fill_default_spot_config_in_dag_for_spot_launch(dag) + dag_utils.fill_default_config_in_dag_for_job_launch(dag) - click.secho( - f'Managed spot job {dag.name!r} will be launched on (estimated):', - fg='yellow') + click.secho(f'Managed job {dag.name!r} will be launched on (estimated):', + fg='yellow') dag = sky.optimize(dag) if not yes: - prompt = f'Launching the spot job {dag.name!r}. Proceed?' + prompt = f'Launching a managed job {dag.name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) common_utils.check_cluster_name_is_valid(name) - spot_lib.launch(dag, - name, - detach_run=detach_run, - retry_until_up=retry_until_up) + managed_jobs.launch(dag, + name, + detach_run=detach_run, + retry_until_up=retry_until_up) -@spot.command('queue', cls=_DocumentedCodeCommand) +@jobs.command('queue', cls=_DocumentedCodeCommand) @click.option('--all', '-a', default=False, @@ -3308,7 +3397,7 @@ def spot_launch( default=False, is_flag=True, required=False, - help='Query the latest statuses, restarting the spot controller if stopped.' + help='Query the latest statuses, restarting the jobs controller if stopped.' ) @click.option('--skip-finished', '-s', @@ -3318,21 +3407,21 @@ def spot_launch( help='Show only pending/running jobs\' information.') @usage_lib.entrypoint # pylint: disable=redefined-builtin -def spot_queue(all: bool, refresh: bool, skip_finished: bool): - """Show statuses of managed spot jobs. +def jobs_queue(all: bool, refresh: bool, skip_finished: bool): + """Show statuses of managed jobs. - Each spot job can have one of the following statuses: + Each managed jobs can have one of the following statuses: - - ``PENDING``: Job is waiting for a free slot on the spot controller to be + - ``PENDING``: Job is waiting for a free slot on the jobs controller to be accepted. - - ``SUBMITTED``: Job is submitted to and accepted by the spot controller. + - ``SUBMITTED``: Job is submitted to and accepted by the jobs controller. - - ``STARTING``: Job is starting (provisioning a spot cluster). + - ``STARTING``: Job is starting (provisioning a cluster for the job). - ``RUNNING``: Job is running. - - ``RECOVERING``: The spot cluster is recovering from a preemption. + - ``RECOVERING``: The cluster of the job is recovering from a preemption. - ``SUCCEEDED``: Job succeeded. @@ -3355,12 +3444,12 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool): - ``FAILED_CONTROLLER``: Job failed due to an unexpected error in the spot controller. - If the job failed, either due to user code or spot unavailability, the - error log can be found with ``sky spot logs --controller``, e.g.: + If the job failed, either due to user code or resource unavailability, the + error log can be found with ``sky jobs logs --controller``, e.g.: .. code-block:: bash - sky spot logs --controller job_id + sky jobs logs --controller job_id This also shows the logs for provisioning and any preemption and recovery attempts. 
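The `sky jobs queue` command documented above, and the related `sky jobs logs`, are wrappers over the `sky.jobs` Python module this PR introduces. Below is a minimal sketch of the same queries made programmatically; the flag combinations and the job ID `3` are illustrative assumptions, not taken from this diff.

```python
# Sketch only: query and stream managed jobs via the renamed Python API.
from sky import jobs as managed_jobs

# Roughly what `sky jobs queue --refresh --skip-finished` does: restart the
# jobs controller if it is stopped and list only in-progress managed jobs.
jobs_ = managed_jobs.queue(refresh=True, skip_finished=True)
print(managed_jobs.format_job_table(jobs_, show_all=False))

# Roughly what `sky jobs logs 3` does: stream the logs of managed job 3.
managed_jobs.tail_logs(name=None, job_id=3, follow=True)
```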
@@ -3369,37 +3458,37 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool): .. code-block:: bash - watch -n60 sky spot queue + watch -n60 sky jobs queue """ - click.secho('Fetching managed spot job statuses...', fg='yellow') - with rich_utils.safe_status('[cyan]Checking spot jobs[/]'): - _, msg = _get_spot_jobs(refresh=refresh, - skip_finished=skip_finished, - show_all=all, - is_called_by_user=True) + click.secho('Fetching managed job statuses...', fg='yellow') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): + _, msg = _get_managed_jobs(refresh=refresh, + skip_finished=skip_finished, + show_all=all, + is_called_by_user=True) if not skip_finished: in_progress_only_hint = '' else: in_progress_only_hint = ' (showing in-progress jobs only)' click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Managed spot jobs{colorama.Style.RESET_ALL}' + f'Managed jobs{colorama.Style.RESET_ALL}' f'{in_progress_only_hint}\n{msg}') -@spot.command('cancel', cls=_DocumentedCodeCommand) +@jobs.command('cancel', cls=_DocumentedCodeCommand) @click.option('--name', '-n', required=False, type=str, - help='Managed spot job name to cancel.') + help='Managed job name to cancel.') @click.argument('job_ids', default=None, type=int, required=False, nargs=-1) @click.option('--all', '-a', is_flag=True, default=False, required=False, - help='Cancel all managed spot jobs.') + help='Cancel all managed jobs.') @click.option('--yes', '-y', is_flag=True, @@ -3408,8 +3497,8 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool): help='Skip confirmation prompt.') @usage_lib.entrypoint # pylint: disable=redefined-builtin -def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): - """Cancel managed spot jobs. +def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): + """Cancel managed jobs. You can provide either a job name or a list of job IDs to be cancelled. They are exclusive options. @@ -3418,15 +3507,15 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): .. code-block:: bash - # Cancel managed spot job with name 'my-job' - $ sky spot cancel -n my-job + # Cancel managed job with name 'my-job' + $ sky jobs cancel -n my-job \b - # Cancel managed spot jobs with IDs 1, 2, 3 - $ sky spot cancel 1 2 3 + # Cancel managed jobs with IDs 1, 2, 3 + $ sky jobs cancel 1 2 3 """ backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, - stopped_message='All managed spot jobs should have finished.', + controller=controller_utils.Controllers.JOBS_CONTROLLER, + stopped_message='All managed jobs should have finished.', exit_if_not_accessible=True) job_id_str = ','.join(map(str, job_ids)) @@ -3439,24 +3528,24 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): f'Provided {argument_str!r}.') if not yes: - job_identity_str = (f'managed spot jobs with IDs {job_id_str}' + job_identity_str = (f'managed jobs with IDs {job_id_str}' if job_ids else repr(name)) if all: - job_identity_str = 'all managed spot jobs' + job_identity_str = 'all managed jobs' click.confirm(f'Cancelling {job_identity_str}. 
Proceed?', default=True, abort=True, show_default=True) - spot_lib.cancel(job_ids=job_ids, name=name, all=all) + managed_jobs.cancel(job_ids=job_ids, name=name, all=all) -@spot.command('logs', cls=_DocumentedCodeCommand) +@jobs.command('logs', cls=_DocumentedCodeCommand) @click.option('--name', '-n', required=False, type=str, - help='Managed spot job name.') + help='Managed job name.') @click.option( '--follow/--no-follow', is_flag=True, @@ -3471,22 +3560,23 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): 'launching/recoveries, etc.')) @click.argument('job_id', required=False, type=int) @usage_lib.entrypoint -def spot_logs(name: Optional[str], job_id: Optional[int], follow: bool, +def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool, controller: bool): - """Tail the log of a managed spot job.""" + """Tail the log of a managed job.""" try: if controller: - core.tail_logs(spot_lib.SPOT_CONTROLLER_NAME, - job_id=job_id, - follow=follow) + core.tail_logs( + controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name, + job_id=job_id, + follow=follow) else: - spot_lib.tail_logs(name=name, job_id=job_id, follow=follow) + managed_jobs.tail_logs(name=name, job_id=job_id, follow=follow) except exceptions.ClusterNotUpError as e: click.echo(e) sys.exit(1) -@spot.command('dashboard', cls=_DocumentedCodeCommand) +@jobs.command('dashboard', cls=_DocumentedCodeCommand) @click.option( '--port', '-p', @@ -3496,19 +3586,18 @@ def spot_logs(name: Optional[str], job_id: Optional[int], follow: bool, help=('Local port to use for the dashboard. If None, a free port is ' 'automatically chosen.')) @usage_lib.entrypoint -def spot_dashboard(port: Optional[int]): - """Opens a dashboard for spot jobs (needs controller to be UP).""" +def jobs_dashboard(port: Optional[int]): + """Opens a dashboard for managed jobs (needs controller to be UP).""" # TODO(zongheng): ideally, the controller/dashboard server should expose the # API perhaps via REST. Then here we would (1) not have to use SSH to try to # see if the controller is UP first, which is slow; (2) not have to run SSH # port forwarding first (we'd just launch a local dashboard which would make # REST API calls to the controller dashboard server). - click.secho('Checking if spot controller is up...', fg='yellow') - hint = ( - 'Dashboard is not available if spot controller is not up. Run a spot ' - 'job first.') + click.secho('Checking if jobs controller is up...', fg='yellow') + hint = ('Dashboard is not available if jobs controller is not up. Run a ' + 'managed job first.') backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + controller=controller_utils.Controllers.JOBS_CONTROLLER, stopped_message=hint, non_existent_message=hint, exit_if_not_accessible=True) @@ -3519,8 +3608,9 @@ def spot_dashboard(port: Optional[int]): free_port = common_utils.find_free_port(remote_port) else: free_port = port - ssh_command = (f'ssh -qNL {free_port}:localhost:{remote_port} ' - f'{spot_lib.SPOT_CONTROLLER_NAME}') + ssh_command = ( + f'ssh -qNL {free_port}:localhost:{remote_port} ' + f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}') click.echo('Forwarding port: ', nl=False) click.secho(f'{ssh_command}', dim=True) @@ -3539,12 +3629,31 @@ def spot_dashboard(port: Optional[int]): try: os.killpg(os.getpgid(ssh_process.pid), signal.SIGTERM) except ProcessLookupError: - # This happens if spot controller is auto-stopped. 
+ # This happens if jobs controller is auto-stopped. pass finally: click.echo('Exiting.') +# TODO(zhwu): Backward compatibility for the old `sky spot launch` command. +# It is now renamed to `sky jobs launch` and the old command is deprecated. +# Remove in v0.8.0. +@cli.group(cls=_NaturalOrderGroup) +def spot(): + """Alias for Managed Jobs CLI (default to managed spot jobs).""" + pass + + +_add_command_alias(jobs, + jobs_launch, + new_group=spot, + override_command_argument={'use_spot': True}) +_add_command_alias(jobs, jobs_queue, new_group=spot) +_add_command_alias(jobs, jobs_logs, new_group=spot) +_add_command_alias(jobs, jobs_cancel, new_group=spot) +_add_command_alias(jobs, jobs_dashboard, new_group=spot) + + @cli.group(cls=_NaturalOrderGroup) def serve(): """SkyServe CLI (multi-region, multi-cloud serving).""" @@ -4040,7 +4149,7 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): f'Provided {argument_str!r}.') backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have been terminated.', exit_if_not_accessible=True) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 542d1595d86..1fef481d8d0 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -37,7 +37,7 @@ # It has the following purposes: # - make all nodes (any cloud) able to access private S3 buckets # - make some remote nodes able to launch new nodes on AWS (i.e., makes -# AWS head node able to launch AWS workers, or any-cloud spot controller +# AWS head node able to launch AWS workers, or any-cloud jobs controller # able to launch spot clusters on AWS). # # If we detect the current user identity is AWS SSO, we will not upload this @@ -541,9 +541,9 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: elif identity_type == AWSIdentityType.IAM_ROLE: # When using an IAM role, the credentials may not exist in the # ~/.aws/credentials file. So we don't check for the existence of the - # file. This will happen when the user is on a VM (or spot-controller) - # created by an SSO account, i.e. the VM will be assigned the IAM - # role: skypilot-v1. + # file. This will happen when the user is on a VM (or + # jobs-controller) created by an SSO account, i.e. the VM will be + # assigned the IAM role: skypilot-v1. hints = f'AWS IAM role is set.{single_cloud_hint}' else: # This file is required because it is required by the VMs launched on @@ -745,7 +745,7 @@ def get_credential_file_mounts(self) -> Dict[str, str]: # credentials. We need to define a mechanism to find out the cloud # provider of the cluster to be launched in this function and make sure # the cluster will not be used for launching clusters in other clouds, - # e.g. spot controller. + # e.g. jobs controller. 
if self._current_identity_type( ) != AWSIdentityType.SHARED_CREDENTIALS_FILE: return {} diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index d8e77f6f194..889e6716074 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -42,7 +42,7 @@ class CloudImplementationFeatures(enum.Enum): CUSTOM_DISK_TIER = 'custom_disk_tier' OPEN_PORTS = 'open_ports' STORAGE_MOUNTING = 'storage_mounting' - HOST_CONTROLLERS = 'host_controllers' # Can run spot/serve controllers + HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers class Region(collections.namedtuple('Region', ['name'])): @@ -496,15 +496,16 @@ def validate_region_zone( zone, clouds=self._REPR.lower()) - def need_cleanup_after_preemption( + def need_cleanup_after_preemption_or_failure( self, resources: 'resources_lib.Resources') -> bool: - """Returns whether a spot resource needs cleanup after preeemption. + """Whether a resource needs cleanup after preeemption or failure. In most cases, spot resources do not need cleanup after preemption, as long as the cluster can be relaunched with the same name and tag, no matter the preemption behavior is to terminate or stop the cluster. - The only exception by far is GCP's Spot TPU VM. We override this method - in gcp.py. + Similar for on-demand resources that go into maintenance mode. The + only exception by far is GCP's TPU VM. We override this method in + gcp.py. """ del resources return False diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 4557fd18678..7babf34ac52 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -841,13 +841,14 @@ def get_current_user_identity_str(cls) -> Optional[str]: def instance_type_exists(self, instance_type): return service_catalog.instance_type_exists(instance_type, 'gcp') - def need_cleanup_after_preemption(self, - resources: 'resources.Resources') -> bool: - """Returns whether a spot resource needs cleanup after preeemption.""" + def need_cleanup_after_preemption_or_failure( + self, resources: 'resources.Resources') -> bool: + """Whether a resource needs cleanup after preeemption or failure.""" # Spot TPU VMs require manual cleanup after preemption. # "If your Cloud TPU is preempted, # you must delete it and create a new one ..." # See: https://cloud.google.com/tpu/docs/preemptible#tpu-vm + # On-demand TPU VMs are likely to require manual cleanup as well. return gcp_utils.is_tpu_vm(resources) diff --git a/sky/core.py b/sky/core.py index c93a50f0b7d..c71a3fa9734 100644 --- a/sky/core.py +++ b/sky/core.py @@ -196,8 +196,6 @@ def _start( idle_minutes_to_autostop = ( constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) - # NOTE: if spot_queue() calls _start() and hits here, that entrypoint - # would have a cluster name (the controller) filled in. usage_lib.record_cluster_name_for_current_operation(cluster_name) with dag.Dag(): @@ -264,7 +262,7 @@ def start( ValueError: argument values are invalid: (1) the specified cluster does not exist; (2) if ``down`` is set to True but ``idle_minutes_to_autostop`` is None; (3) if the specified cluster is - the managed spot controller, and either ``idle_minutes_to_autostop`` + the managed jobs controller, and either ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit them to use the default autostop settings). sky.exceptions.NotSupportedError: if the cluster to restart was @@ -317,7 +315,7 @@ def stop(cluster_name: str, purge: bool = False) -> None: ValueError: the specified cluster does not exist. RuntimeError: failed to stop the cluster. 
sky.exceptions.NotSupportedError: if the specified cluster is a spot - cluster, or a TPU VM Pod cluster, or the managed spot controller. + cluster, or a TPU VM Pod cluster, or the managed jobs controller. """ if controller_utils.Controllers.from_name(cluster_name) is not None: raise exceptions.NotSupportedError( @@ -372,7 +370,7 @@ def down(cluster_name: str, purge: bool = False) -> None: ValueError: the specified cluster does not exist. RuntimeError: failed to tear down the cluster. sky.exceptions.NotSupportedError: the specified cluster is the managed - spot controller. + jobs controller. """ handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: @@ -559,7 +557,7 @@ def cancel( Additional arguments: _try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job even if the cluster is not UP, but the head node is still alive. - This is used by the spot controller to cancel the job when the + This is used by the jobs controller to cancel the job when the worker node is preempted in the spot cluster. Raises: diff --git a/sky/exceptions.py b/sky/exceptions.py index 131b4675399..e3b33ea3e5e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -52,11 +52,12 @@ class InvalidCloudConfigs(Exception): class ProvisionPrechecksError(Exception): - """Raised when a spot job fails prechecks before provision. + """Raised when a managed job fails prechecks before provision. + Developer note: For now this should only be used by managed - spot code path (technically, this can/should be raised by the + jobs code path (technically, this can/should be raised by the lower-level sky.launch()). Please refer to the docstring of - `spot.recovery_strategy._launch` for more details about when + `jobs.recovery_strategy._launch` for more details about when the error will be raised. Args: @@ -68,11 +69,11 @@ def __init__(self, reasons: List[Exception]) -> None: self.reasons = list(reasons) -class SpotJobReachedMaxRetriesError(Exception): - """Raised when a spot job fails to be launched after maximum retries. +class ManagedJobReachedMaxRetriesError(Exception): + """Raised when a managed job fails to be launched after maximum retries. - Developer note: For now this should only be used by managed spot code - path. Please refer to the docstring of `spot.recovery_strategy._launch` + Developer note: For now this should only be used by managed jobs code + path. Please refer to the docstring of `jobs.recovery_strategy._launch` for more details about when the error will be raised. """ pass @@ -211,8 +212,8 @@ class ClusterStatusFetchingError(Exception): pass -class SpotUserCancelledError(Exception): - """Raised when a spot user cancels the job.""" +class ManagedJobUserCancelledError(Exception): + """Raised when a user cancels a managed job.""" pass diff --git a/sky/execution.py b/sky/execution.py index 25f0d8cc7a8..2cffc5a7d09 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -108,7 +108,7 @@ def _execute( clone_disk_from: Optional[str] = None, # Internal only: # pylint: disable=invalid-name - _is_launched_by_spot_controller: bool = False, + _is_launched_by_jobs_controller: bool = False, _is_launched_by_sky_serve_controller: bool = False, ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: """Execute an entrypoint. @@ -160,11 +160,11 @@ def _execute( assert len(dag) == 1, f'We support 1 task for now. 
{dag}' task = dag.tasks[0] - if task.need_spot_recovery: + if any(r.job_recovery is not None for r in task.resources): with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Spot recovery is specified in the task. To launch the ' - 'managed spot job, please use: sky spot launch') + 'Job recovery is specified in the task. To launch a ' + 'managed job, please use: sky jobs launch') cluster_exists = False if cluster_name is not None: @@ -225,10 +225,10 @@ def _execute( task) if not cluster_exists: - # If spot is launched by skyserve controller or managed spot controller, - # We don't need to print out the logger info. + # If spot is launched on serve or jobs controller, we don't need to + # print out the hint. if (Stage.PROVISION in stages and task.use_spot and - not _is_launched_by_spot_controller and + not _is_launched_by_jobs_controller and not _is_launched_by_sky_serve_controller): yellow = colorama.Fore.YELLOW bold = colorama.Style.BRIGHT @@ -236,9 +236,9 @@ def _execute( logger.info( f'{yellow}Launching an unmanaged spot task, which does not ' f'automatically recover from preemptions.{reset}\n{yellow}To ' - 'get automatic recovery, use managed spot instead: ' - f'{reset}{bold}sky spot launch{reset} {yellow}or{reset} ' - f'{bold}sky.spot.launch(){reset}.') + 'get automatic recovery, use managed job instead: ' + f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} ' + f'{bold}sky.jobs.launch(){reset}.') if Stage.OPTIMIZE in stages: if task.best_resources is None: @@ -318,10 +318,10 @@ def _execute( if controller is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # - # Don't print if this job is launched by the spot controller, - # because spot jobs are serverless, there can be many of them, and - # users tend to continuously monitor spot jobs using `sky spot - # status`. Also don't print if this job is a skyserve controller + # Don't print if this job is launched by the jobs controller, + # because managed jobs are serverless, there can be many of them, + # and users tend to continuously monitor managed jobs using `sky + # job queue`. Also don't print if this job is a skyserve controller # job or launched by a skyserve controller job, because the # redirect for this subprocess.run won't success and it will # pollute the controller logs. @@ -330,7 +330,7 @@ def _execute( env = dict(os.environ, **{env_options.Options.DISABLE_LOGGING.value: '1'}) subprocess_utils.run( - 'sky status --no-show-spot-jobs --no-show-services', env=env) + 'sky status --no-show-managed-jobs --no-show-services', env=env) print() print('\x1b[?25h', end='') # Show cursor. 
return job_id, handle @@ -354,7 +354,7 @@ def launch( clone_disk_from: Optional[str] = None, # Internal only: # pylint: disable=invalid-name - _is_launched_by_spot_controller: bool = False, + _is_launched_by_jobs_controller: bool = False, _is_launched_by_sky_serve_controller: bool = False, _disable_controller_check: bool = False, ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: @@ -464,7 +464,7 @@ def launch( idle_minutes_to_autostop=idle_minutes_to_autostop, no_setup=no_setup, clone_disk_from=clone_disk_from, - _is_launched_by_spot_controller=_is_launched_by_spot_controller, + _is_launched_by_jobs_controller=_is_launched_by_jobs_controller, _is_launched_by_sky_serve_controller= _is_launched_by_sky_serve_controller, ) diff --git a/sky/spot/README.md b/sky/jobs/README.md similarity index 52% rename from sky/spot/README.md rename to sky/jobs/README.md index ed20a77b46e..579f675a5f9 100644 --- a/sky/spot/README.md +++ b/sky/jobs/README.md @@ -1,11 +1,11 @@ -# SkyPilot Managed Spot +# SkyPilot Managed Jobs -This module is used for running user jobs on spot clusters, which automatically recovers the job from preemptions. +This module is used for running and managing user jobs, which automatically recovers failed jobs from spot preemptions and/or machine failures. ## Concepts -- Task: A task (sky.Task) is a unit of work. SkyPilot will launch a spot cluster to run the task, automatically recover the task from preemptions, and terminate the cluster when the task is done. -- Job: A job in the context of SkyPilot managed spot, is equivalent to a SkyPilot DAG (sky.Dag). A job is a collection of tasks that are executed in a specific order based on the dependencies between the tasks. Each controller process will be in charge of the whole lifecycle of a job. +- Task: A task (sky.Task) is a unit of work. SkyPilot will launch a cluster to run the task, automatically recover the task from preemptions, and terminate the cluster when the task is done. +- Job: A job in the context of SkyPilot managed jobs, is equivalent to a SkyPilot DAG (sky.Dag). A job is a collection of tasks that are executed in a specific order based on the dependencies between the tasks. Each controller process will be in charge of the whole lifecycle of a job. Note that for singleton (1-task) jobs, we will use the term "task" and "job" interchangeably. 
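To make the Task/Job distinction above concrete, here is a hypothetical Python sketch (not part of this diff) of a singleton job: one `sky.Task` wrapped in a `sky.Dag`, then handed to `managed_jobs.launch()` the same way the new `sky jobs launch` entrypoint does. The `dag.add()` helper and the resource values are assumptions made for illustration.

```python
import sky
from sky import jobs as managed_jobs  # module introduced by this PR

# A task is one unit of work.
task = sky.Task(name='train', run='python train.py')
task.set_resources(sky.Resources(accelerators='V100:1', use_spot=True))

# A job is a sky.Dag; a singleton job wraps exactly one task.
with sky.Dag() as dag:
    dag.add(task)  # assumed to register the task with the dag
dag.name = 'train'

# One controller process then owns this job's entire lifecycle (launch,
# preemption recovery, termination), mirroring the CLI call in this diff.
managed_jobs.launch(dag, dag.name, detach_run=True, retry_until_up=True)
```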
@@ -14,6 +14,6 @@ A job of n tasks (experimental; we support a pipeline of such tasks only): the j ## Architecture -![Architecture](../../docs/source/images/spot-controller.png) - +![Architecture](../../docs/source/images/managed-jobs-arch.png) + diff --git a/sky/jobs/__init__.py b/sky/jobs/__init__.py new file mode 100644 index 00000000000..922bb613ff7 --- /dev/null +++ b/sky/jobs/__init__.py @@ -0,0 +1,43 @@ +"""Managed jobs.""" +import pathlib + +from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH +from sky.jobs.constants import JOBS_CONTROLLER_TEMPLATE +from sky.jobs.constants import JOBS_CONTROLLER_YAML_PREFIX +from sky.jobs.constants import JOBS_TASK_YAML_PREFIX +from sky.jobs.core import cancel +from sky.jobs.core import launch +from sky.jobs.core import queue +from sky.jobs.core import tail_logs +from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY +from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES +from sky.jobs.state import ManagedJobStatus +from sky.jobs.utils import dump_managed_job_queue +from sky.jobs.utils import format_job_table +from sky.jobs.utils import JOB_CONTROLLER_NAME +from sky.jobs.utils import load_managed_job_queue +from sky.jobs.utils import ManagedJobCodeGen + +pathlib.Path(JOBS_TASK_YAML_PREFIX).expanduser().parent.mkdir(parents=True, + exist_ok=True) +__all__ = [ + 'RECOVERY_STRATEGIES', + 'DEFAULT_RECOVERY_STRATEGY', + 'JOB_CONTROLLER_NAME', + # Constants + 'JOBS_CONTROLLER_TEMPLATE', + 'JOBS_CONTROLLER_YAML_PREFIX', + 'JOBS_TASK_YAML_PREFIX', + # Enums + 'ManagedJobStatus', + # Core + 'cancel', + 'launch', + 'queue', + 'tail_logs', + # utils + 'ManagedJobCodeGen', + 'format_job_table', + 'dump_managed_job_queue', + 'load_managed_job_queue', +] diff --git a/sky/jobs/constants.py b/sky/jobs/constants.py new file mode 100644 index 00000000000..d5f32908317 --- /dev/null +++ b/sky/jobs/constants.py @@ -0,0 +1,27 @@ +"""Constants used for Managed Jobs.""" + +JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2' +JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller' + +JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs' + +# Resources as a dict for the jobs controller. +# Use default CPU instance type for jobs controller with >= 24GB, i.e. +# m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB) +# for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP, etc. +# Based on profiling, memory should be at least 3x (in GB) as num vCPUs to avoid +# OOM (each vCPU can have 4 jobs controller processes as we set the CPU +# requirement to 0.25, and 3 GB is barely enough for 4 job processes). +# We use 50 GB disk size to reduce the cost. +CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50} + +# Max length of the cluster name for GCP is 35, the user hash to be attached is +# 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max +# length of the cluster name prefix is 25 to avoid the cluster name being too +# long and truncated twice during the cluster creation. +JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25 + +# The version of the lib files that jobs/utils use. Whenever there is an API +# change for the jobs/utils, we need to bump this version and update +# job.utils.ManagedJobCodeGen to handle the version update. 
+MANAGED_JOBS_VERSION = 1 diff --git a/sky/spot/controller.py b/sky/jobs/controller.py similarity index 66% rename from sky/spot/controller.py rename to sky/jobs/controller.py index 9308d1dd532..39c89d2784b 100644 --- a/sky/spot/controller.py +++ b/sky/jobs/controller.py @@ -1,4 +1,4 @@ -"""Controller: handles the life cycle of a managed spot cluster (job).""" +"""Controller: handles the life cycle of a managed job.""" import argparse import multiprocessing import os @@ -15,11 +15,11 @@ from sky import status_lib from sky.backends import backend_utils from sky.backends import cloud_vm_ray_backend +from sky.jobs import recovery_strategy +from sky.jobs import state as managed_job_state +from sky.jobs import utils as managed_job_utils from sky.skylet import constants from sky.skylet import job_lib -from sky.spot import recovery_strategy -from sky.spot import spot_state -from sky.spot import spot_utils from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils @@ -31,9 +31,9 @@ import sky # Use the explicit logger name so that the logger is under the -# `sky.spot.controller` namespace when executed directly, so as +# `sky.jobs.controller` namespace when executed directly, so as # to inherit the setup from the `sky` logger. -logger = sky_logging.init_logger('sky.spot.controller') +logger = sky_logging.init_logger('sky.jobs.controller') def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]: @@ -43,8 +43,8 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]: return dag, dag_name -class SpotController: - """Each spot controller manages the life cycle of one spot job.""" +class JobsController: + """Each jobs controller manages the life cycle of one managed job.""" def __init__(self, job_id: int, dag_yaml: str, retry_until_up: bool) -> None: @@ -88,23 +88,23 @@ def __init__(self, job_id: int, dag_yaml: str, def _download_log_and_stream( self, handle: cloud_vm_ray_backend.CloudVmRayResourceHandle) -> None: - """Downloads and streams the logs of the latest job of a spot cluster. + """Downloads and streams the logs of the latest job. - We do not stream the logs from the spot cluster directly, as the + We do not stream the logs from the cluster directly, as the donwload and stream should be faster, and more robust against preemptions or ssh disconnection during the streaming. """ - spot_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, - 'spot_jobs') + managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, + 'managed_jobs') controller_utils.download_and_stream_latest_job_log( - self._backend, handle, spot_job_logs_dir) + self._backend, handle, managed_job_logs_dir) logger.info(f'\n== End of logs (ID: {self._job_id}) ==') def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: - """Busy loop monitoring spot cluster status and handling recovery. + """Busy loop monitoring cluster status and handling recovery. When the task is successfully completed, this function returns True, - and will terminate the spot cluster before returning. + and will terminate the cluster before returning. If the user program fails, i.e. the task is set to FAILED or FAILED_SETUP, this function will return False. @@ -130,28 +130,28 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: due to: 1. Any of the underlying failover exceptions is due to resources unavailability. - 2. The cluster is preempted before the job is submitted. + 2. The cluster is preempted or failed before the job is + submitted. 3. 
Any unexpected error happens during the `sky.launch`. Other exceptions may be raised depending on the backend. """ - callback_func = spot_utils.event_callback_func(job_id=self._job_id, - task_id=task_id, - task=task) + callback_func = managed_job_utils.event_callback_func( + job_id=self._job_id, task_id=task_id, task=task) if task.run is None: logger.info(f'Skip running task {task_id} ({task.name}) due to its ' 'run commands being empty.') # Call set_started first to initialize columns in the state table, # including start_at and last_recovery_at to avoid issues for # uninitialized columns. - spot_state.set_started(job_id=self._job_id, - task_id=task_id, - start_time=time.time(), - callback_func=callback_func) - spot_state.set_succeeded(job_id=self._job_id, - task_id=task_id, - end_time=time.time(), - callback_func=callback_func) + managed_job_state.set_started(job_id=self._job_id, + task_id=task_id, + start_time=time.time(), + callback_func=callback_func) + managed_job_state.set_succeeded(job_id=self._job_id, + task_id=task_id, + end_time=time.time(), + callback_func=callback_func) return True usage_lib.messages.usage.update_task_id(task_id) task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR] @@ -159,64 +159,65 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: if task_id == 0: submitted_at = backend_utils.get_timestamp_from_run_timestamp( self._backend.run_timestamp) - spot_state.set_submitted( + managed_job_state.set_submitted( self._job_id, task_id, self._backend.run_timestamp, submitted_at, - resources_str=backend_utils.get_task_resources_str(task), + resources_str=backend_utils.get_task_resources_str( + task, is_managed_job=True), callback_func=callback_func) logger.info( - f'Submitted spot job {self._job_id} (task: {task_id}, name: ' + f'Submitted managed job {self._job_id} (task: {task_id}, name: ' f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}') assert task.name is not None, task - cluster_name = spot_utils.generate_spot_cluster_name( + cluster_name = managed_job_utils.generate_managed_job_cluster_name( task.name, self._job_id) self._strategy_executor = recovery_strategy.StrategyExecutor.make( cluster_name, self._backend, task, self._retry_until_up) logger.info('Started monitoring.') - spot_state.set_starting(job_id=self._job_id, - task_id=task_id, - callback_func=callback_func) + managed_job_state.set_starting(job_id=self._job_id, + task_id=task_id, + callback_func=callback_func) remote_job_submitted_at = self._strategy_executor.launch() assert remote_job_submitted_at is not None, remote_job_submitted_at - spot_state.set_started(job_id=self._job_id, - task_id=task_id, - start_time=remote_job_submitted_at, - callback_func=callback_func) + managed_job_state.set_started(job_id=self._job_id, + task_id=task_id, + start_time=remote_job_submitted_at, + callback_func=callback_func) while True: - time.sleep(spot_utils.JOB_STATUS_CHECK_GAP_SECONDS) + time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS) # Check the network connection to avoid false alarm for job failure. # Network glitch was observed even in the VM. try: backend_utils.check_network_connection() except exceptions.NetworkError: - logger.info( - 'Network is not available. Retrying again in ' - f'{spot_utils.JOB_STATUS_CHECK_GAP_SECONDS} seconds.') + logger.info('Network is not available. Retrying again in ' + f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} ' + 'seconds.') continue # NOTE: we do not check cluster status first because race condition # can occur, i.e. 
cluster can be down during the job status check. - job_status = spot_utils.get_job_status(self._backend, cluster_name) + job_status = managed_job_utils.get_job_status( + self._backend, cluster_name) if job_status == job_lib.JobStatus.SUCCEEDED: - end_time = spot_utils.get_job_timestamp(self._backend, - cluster_name, - get_end_time=True) + end_time = managed_job_utils.get_job_timestamp( + self._backend, cluster_name, get_end_time=True) # The job is done. - spot_state.set_succeeded(self._job_id, - task_id, - end_time=end_time, - callback_func=callback_func) + managed_job_state.set_succeeded(self._job_id, + task_id, + end_time=end_time, + callback_func=callback_func) logger.info( f'Spot job {self._job_id} (task: {task_id}) SUCCEEDED. ' - f'Cleaning up the spot cluster {cluster_name}.') - # Only clean up the spot cluster, not the storages, because - # tasks may share storages. + f'Cleaning up the cluster {cluster_name}.') + # Only clean up the cluster, not the storages, because tasks may + # share storages. recovery_strategy.terminate_cluster(cluster_name=cluster_name) return True @@ -224,7 +225,8 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: # healthy cluster. We can safely continue monitoring. # For multi-node jobs, since the job may not be set to FAILED # immediately (depending on user program) when only some of the - # nodes are preempted, need to check the actual cluster status. + # nodes are preempted or failed, need to check the actual cluster + # status. if (job_status is not None and not job_status.is_terminal() and task.num_nodes == 1): continue @@ -235,21 +237,28 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: # Add a grace period before the check of preemption to avoid # false alarm for job failure. time.sleep(5) + # Pull the actual cluster status from the cloud provider to - # determine whether the cluster is preempted. + # determine whether the cluster is preempted or failed. + # TODO(zhwu): For hardware failure, such as GPU failure, it may not + # be reflected in the cluster status, depending on the cloud, which + # can also cause failure of the job, and we need to recover it + # rather than fail immediately. (cluster_status, handle) = backend_utils.refresh_cluster_status_handle( cluster_name, force_refresh_statuses=set(status_lib.ClusterStatus)) if cluster_status != status_lib.ClusterStatus.UP: - # The cluster is (partially) preempted. It can be down, INIT - # or STOPPED, based on the interruption behavior of the cloud. - # Spot recovery is needed (will be done later in the code). + # The cluster is (partially) preempted or failed. It can be + # down, INIT or STOPPED, based on the interruption behavior of + # the cloud. Spot recovery is needed (will be done later in the + # code). cluster_status_str = ('' if cluster_status is None else f' (status: {cluster_status.value})') logger.info( - f'Cluster is preempted{cluster_status_str}. Recovering...') + f'Cluster is preempted or failed{cluster_status_str}. ' + 'Recovering...') else: if job_status is not None and not job_status.is_terminal(): # The multi-node job is still running, continue monitoring. @@ -258,27 +267,29 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: # The user code has probably crashed, fail immediately. 
- end_time = spot_utils.get_job_timestamp(self._backend, - cluster_name, - get_end_time=True) + end_time = managed_job_utils.get_job_timestamp( + self._backend, cluster_name, get_end_time=True) logger.info( 'The user job failed. Please check the logs below.\n' f'== Logs of the user job (ID: {self._job_id}) ==\n') self._download_log_and_stream(handle) - spot_status_to_set = spot_state.SpotStatus.FAILED + managed_job_status = ( + managed_job_state.ManagedJobStatus.FAILED) if job_status == job_lib.JobStatus.FAILED_SETUP: - spot_status_to_set = spot_state.SpotStatus.FAILED_SETUP + managed_job_status = ( + managed_job_state.ManagedJobStatus.FAILED_SETUP) failure_reason = ( 'To see the details, run: ' - f'sky spot logs --controller {self._job_id}') - - spot_state.set_failed(self._job_id, - task_id, - failure_type=spot_status_to_set, - failure_reason=failure_reason, - end_time=end_time, - callback_func=callback_func) + f'sky jobs logs --controller {self._job_id}') + + managed_job_state.set_failed( + self._job_id, + task_id, + failure_type=managed_job_status, + failure_reason=failure_reason, + end_time=end_time, + callback_func=callback_func) return False # Although the cluster is healthy, we fail to access the # job status. Try to recover the job (will not restart the @@ -292,22 +303,24 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: if handle is not None: resources = handle.launched_resources assert resources is not None, handle - if resources.need_cleanup_after_preemption(): + if resources.need_cleanup_after_preemption_or_failure(): # Some spot resource (e.g., Spot TPU VM) may need to be - # cleaned up after preemption. - logger.info('Cleaning up the preempted spot cluster...') + # cleaned up after preemption, as running launch again on + # those clusters again may fail. + logger.info('Cleaning up the preempted or failed cluster' + '...') recovery_strategy.terminate_cluster(cluster_name) - # Try to recover the spot jobs, when the cluster is preempted - # or the job status is failed to be fetched. - spot_state.set_recovering(job_id=self._job_id, - task_id=task_id, - callback_func=callback_func) + # Try to recover the managed jobs, when the cluster is preempted or + # failed or the job status is failed to be fetched. + managed_job_state.set_recovering(job_id=self._job_id, + task_id=task_id, + callback_func=callback_func) recovered_time = self._strategy_executor.recover() - spot_state.set_recovered(self._job_id, - task_id, - recovered_time=recovered_time, - callback_func=callback_func) + managed_job_state.set_recovered(self._job_id, + task_id, + recovered_time=recovered_time, + callback_func=callback_func) def run(self): """Run controller logic and handle exceptions.""" @@ -326,27 +339,29 @@ def run(self): common_utils.format_exception(reason, use_bracket=True) for reason in e.reasons)) logger.error(failure_reason) - spot_state.set_failed( + managed_job_state.set_failed( self._job_id, task_id=task_id, - failure_type=spot_state.SpotStatus.FAILED_PRECHECKS, + failure_type=managed_job_state.ManagedJobStatus. + FAILED_PRECHECKS, failure_reason=failure_reason, - callback_func=spot_utils.event_callback_func( + callback_func=managed_job_utils.event_callback_func( job_id=self._job_id, task_id=task_id, task=self._dag.tasks[task_id])) - except exceptions.SpotJobReachedMaxRetriesError as e: + except exceptions.ManagedJobReachedMaxRetriesError as e: # Please refer to the docstring of self._run for the cases when # this exception can occur. 
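
Every transition above (`set_starting`, `set_started`, `set_recovering`, `set_recovered`, `set_failed`) updates a persistent job-state table and fires an event callback so external hooks can observe the job lifecycle. A rough sketch of that pattern, using an in-memory SQLite table and invented column names (the real `managed_job_state` module has a richer schema and more statuses):

.. code-block:: python

   import enum
   import sqlite3
   import time
   from typing import Callable, Optional


   class Status(enum.Enum):
       STARTING = 'STARTING'
       RUNNING = 'RUNNING'
       RECOVERING = 'RECOVERING'
       SUCCEEDED = 'SUCCEEDED'
       FAILED = 'FAILED'


   _conn = sqlite3.connect(':memory:')
   _conn.execute('CREATE TABLE jobs (job_id INT, task_id INT, status TEXT, '
                 'start_at REAL, end_at REAL, last_recovered_at REAL)')

   Callback = Optional[Callable[[Status], None]]


   def _set(job_id: int, task_id: int, status: Status,
            callback: Callback = None, **cols) -> None:
       """Upsert one (job, task) row, then fire the event callback."""
       _conn.execute('DELETE FROM jobs WHERE job_id=? AND task_id=?',
                     (job_id, task_id))
       _conn.execute('INSERT INTO jobs VALUES (?, ?, ?, ?, ?, ?)',
                     (job_id, task_id, status.value, cols.get('start_at'),
                      cols.get('end_at'), cols.get('last_recovered_at')))
       _conn.commit()
       if callback is not None:
           callback(status)


   def set_started(job_id, task_id, start_time, callback: Callback = None):
       _set(job_id, task_id, Status.RUNNING, callback, start_at=start_time)


   def set_recovering(job_id, task_id, callback: Callback = None):
       _set(job_id, task_id, Status.RECOVERING, callback)


   def set_recovered(job_id, task_id, recovered_time, callback: Callback = None):
       _set(job_id, task_id, Status.RUNNING, callback,
            last_recovered_at=recovered_time)


   if __name__ == '__main__':
       def on_event(status: Status) -> None:
           print(f'event: job entered {status.value}')

       set_started(1, 0, start_time=time.time(), callback=on_event)
       set_recovering(1, 0, callback=on_event)
       set_recovered(1, 0, recovered_time=time.time(), callback=on_event)
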
logger.error(common_utils.format_exception(e)) - # The spot job should be marked as FAILED_NO_RESOURCE, as the - # spot job may be able to launch next time. - spot_state.set_failed( + # The managed job should be marked as FAILED_NO_RESOURCE, as the + # managed job may be able to launch next time. + managed_job_state.set_failed( self._job_id, task_id=task_id, - failure_type=spot_state.SpotStatus.FAILED_NO_RESOURCE, + failure_type=managed_job_state.ManagedJobStatus. + FAILED_NO_RESOURCE, failure_reason=common_utils.format_exception(e), - callback_func=spot_utils.event_callback_func( + callback_func=managed_job_utils.event_callback_func( job_id=self._job_id, task_id=task_id, task=self._dag.tasks[task_id])) @@ -356,12 +371,13 @@ def run(self): msg = ('Unexpected error occurred: ' f'{common_utils.format_exception(e, use_bracket=True)}') logger.error(msg) - spot_state.set_failed( + managed_job_state.set_failed( self._job_id, task_id=task_id, - failure_type=spot_state.SpotStatus.FAILED_CONTROLLER, + failure_type=managed_job_state.ManagedJobStatus. + FAILED_CONTROLLER, failure_reason=msg, - callback_func=spot_utils.event_callback_func( + callback_func=managed_job_utils.event_callback_func( job_id=self._job_id, task_id=task_id, task=self._dag.tasks[task_id])) @@ -370,27 +386,28 @@ def run(self): # affect the jobs in terminal states. # We need to call set_cancelling before set_cancelled to make sure # the table entries are correctly set. - callback_func = spot_utils.event_callback_func( + callback_func = managed_job_utils.event_callback_func( job_id=self._job_id, task_id=task_id, task=self._dag.tasks[task_id]) - spot_state.set_cancelling(job_id=self._job_id, - callback_func=callback_func) - spot_state.set_cancelled(job_id=self._job_id, - callback_func=callback_func) + managed_job_state.set_cancelling(job_id=self._job_id, + callback_func=callback_func) + managed_job_state.set_cancelled(job_id=self._job_id, + callback_func=callback_func) def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool): """Runs the controller in a remote process for interruption.""" # The controller needs to be instantiated in the remote process, since # the controller is not serializable. - spot_controller = SpotController(job_id, dag_yaml, retry_until_up) - spot_controller.run() + jobs_controller = JobsController(job_id, dag_yaml, retry_until_up) + jobs_controller.run() def _handle_signal(job_id): """Handle the signal if the user sent it.""" - signal_file = pathlib.Path(spot_utils.SIGNAL_FILE_PREFIX.format(job_id)) + signal_file = pathlib.Path( + managed_job_utils.SIGNAL_FILE_PREFIX.format(job_id)) user_signal = None if signal_file.exists(): # Filelock is needed to prevent race condition with concurrent @@ -399,7 +416,7 @@ def _handle_signal(job_id): with signal_file.open(mode='r', encoding='utf-8') as f: user_signal = f.read().strip() try: - user_signal = spot_utils.UserSignal(user_signal) + user_signal = managed_job_utils.UserSignal(user_signal) except ValueError: logger.warning( f'Unknown signal received: {user_signal}. Ignoring.') @@ -409,28 +426,29 @@ def _handle_signal(job_id): if user_signal is None: # None or empty string. 
         return
-    assert user_signal == spot_utils.UserSignal.CANCEL, (
+    assert user_signal == managed_job_utils.UserSignal.CANCEL, (
         f'Only cancel signal is supported, but {user_signal} got.')
-    raise exceptions.SpotUserCancelledError(
+    raise exceptions.ManagedJobUserCancelledError(
         f'User sent {user_signal.value} signal.')
 
 
 def _cleanup(job_id: int, dag_yaml: str):
-    """Clean up the spot cluster(s) and storages.
+    """Clean up the cluster(s) and storages.
 
     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
         to be cleaned up after the whole job is finished, as the tasks may
         share the same storage.
-    (2) Clean up the spot cluster(s) that are not cleaned up yet, which
-        can happen when the spot task failed or cancelled. At most one
-        spot cluster should be left when reaching here, as we currently
-        only support chain DAGs, and only spot task is executed at a time.
+    (2) Clean up the cluster(s) that are not cleaned up yet, which can happen
+        when the task failed or was cancelled. At most one cluster should be
+        left when reaching here, as we currently only support chain DAGs, and
+        only one task is executed at a time.
     """
     # NOTE: The code to get cluster name is same as what we did in the spot
-    # controller, we should keep it in sync with SpotController.__init__()
+    # controller, we should keep it in sync with JobsController.__init__()
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
-        cluster_name = spot_utils.generate_spot_cluster_name(task.name, job_id)
+        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+            task.name, job_id)
         recovery_strategy.terminate_cluster(cluster_name)
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
@@ -457,16 +475,15 @@ def start(job_id, dag_yaml, retry_until_up):
         while controller_process.is_alive():
             _handle_signal(job_id)
             time.sleep(1)
-    except exceptions.SpotUserCancelledError:
+    except exceptions.ManagedJobUserCancelledError:
         dag, _ = _get_dag_and_name(dag_yaml)
-        task_id, _ = (spot_state.get_latest_task_id_status(job_id))
+        task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
         logger.info(
-            f'Cancelling spot job, job_id: {job_id}, task_id: {task_id}')
-        spot_state.set_cancelling(job_id=job_id,
-                                  callback_func=spot_utils.event_callback_func(
-                                      job_id=job_id,
-                                      task_id=task_id,
-                                      task=dag.tasks[task_id]))
+            f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
+        managed_job_state.set_cancelling(
+            job_id=job_id,
+            callback_func=managed_job_utils.event_callback_func(
+                job_id=job_id, task_id=task_id, task=dag.tasks[task_id]))
         cancelling = True
     finally:
         if controller_process is not None:
@@ -480,37 +497,38 @@ def start(job_id, dag_yaml, retry_until_up):
             controller_process.join()
             logger.info(f'Controller process {controller_process.pid} killed.')
 
-        logger.info(f'Cleaning up any spot cluster for job {job_id}.')
+        logger.info(f'Cleaning up any cluster for job {job_id}.')
         # NOTE: Originally, we send an interruption signal to the controller
         # process and the controller process handles cleanup. However, we
        # figure out the behavior differs from cloud to cloud
         # (e.g., GCP ignores 'SIGINT'). A possible explanation is
         # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
         # But anyway, a clean solution is killing the controller process
-        # directly, and then cleanup the cluster state.
+        # directly, and then clean up the cluster state.
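
`start()` above supervises the controller from a parent process: it spawns the controller with `multiprocessing`, polls a per-job signal file so a user-sent `CANCEL` can interrupt it, and performs cleanup in a `finally` block after killing the child directly (signal-based interruption proved unreliable across clouds). A stripped-down, runnable sketch of that supervision pattern; the file locations and the `_controller` body are placeholders, not SkyPilot's actual paths:

.. code-block:: python

   import multiprocessing
   import pathlib
   import tempfile
   import time

   # Placeholder location; the real controller uses a per-job signal file
   # under the user's home directory, written by the CLI on cancellation.
   SIGNAL_DIR = pathlib.Path(tempfile.gettempdir())


   def _signal_file(job_id: int) -> pathlib.Path:
       return SIGNAL_DIR / f'managed_job_{job_id}.signal'


   def _controller(job_id: int) -> None:
       """Stand-in for the real controller loop."""
       for _ in range(30):
           time.sleep(1)


   def start(job_id: int) -> None:
       proc = multiprocessing.Process(target=_controller, args=(job_id,))
       cancelled = False
       try:
           proc.start()
           while proc.is_alive():
               # Poll the signal file, the same idea as _handle_signal().
               sig = _signal_file(job_id)
               if sig.exists() and sig.read_text().strip() == 'CANCEL':
                   cancelled = True
                   break
               time.sleep(1)
       finally:
           if proc.is_alive():
               # Kill the controller directly: relying on signal handlers was
               # found to behave differently from cloud to cloud.
               proc.terminate()
               proc.join()
           # Clean up clusters/storages here, then record the terminal state.
           print('cancelled' if cancelled else 'finished')


   if __name__ == '__main__':
       multiprocessing.set_start_method('spawn')
       _signal_file(42).write_text('CANCEL')
       start(42)
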
_cleanup(job_id, dag_yaml=dag_yaml) - logger.info(f'Spot cluster of job {job_id} has been cleaned up.') + logger.info(f'Cluster of managed job {job_id} has been cleaned up.') if cancelling: - spot_state.set_cancelled( + managed_job_state.set_cancelled( job_id=job_id, - callback_func=spot_utils.event_callback_func( + callback_func=managed_job_utils.event_callback_func( job_id=job_id, task_id=task_id, task=dag.tasks[task_id])) # We should check job status after 'set_cancelled', otherwise # the job status is not terminal. - job_status = spot_state.get_status(job_id) + job_status = managed_job_state.get_status(job_id) assert job_status is not None # The job can be non-terminal if the controller exited abnormally, # e.g. failed to launch cluster after reaching the MAX_RETRY. if not job_status.is_terminal(): - logger.info(f'Previous spot job status: {job_status.value}') - spot_state.set_failed( + logger.info(f'Previous job status: {job_status.value}') + managed_job_state.set_failed( job_id, task_id=None, - failure_type=spot_state.SpotStatus.FAILED_CONTROLLER, + failure_type=managed_job_state.ManagedJobStatus. + FAILED_CONTROLLER, failure_reason=('Unexpected error occurred. For details, ' - f'run: sky spot logs --controller {job_id}')) + f'run: sky jobs logs --controller {job_id}')) if __name__ == '__main__': @@ -521,10 +539,10 @@ def start(job_id, dag_yaml, retry_until_up): help='Job id for the controller job.') parser.add_argument('--retry-until-up', action='store_true', - help='Retry until the spot cluster is up.') + help='Retry until the cluster is up.') parser.add_argument('dag_yaml', type=str, - help='The path to the user spot task yaml file.') + help='The path to the user job yaml file.') args = parser.parse_args() # We start process with 'spawn', because 'fork' could result in weird # behaviors; 'spawn' is also cross-platform. diff --git a/sky/spot/core.py b/sky/jobs/core.py similarity index 68% rename from sky/spot/core.py rename to sky/jobs/core.py index 61673e4728f..ff9953489d5 100644 --- a/sky/spot/core.py +++ b/sky/jobs/core.py @@ -1,4 +1,4 @@ -"""SDK functions for managed spot job.""" +"""SDK functions for managed jobs.""" import os import tempfile from typing import Any, Dict, List, Optional, Union @@ -14,9 +14,9 @@ from sky import task as task_lib from sky.backends import backend_utils from sky.clouds.service_catalog import common as service_catalog_common +from sky.jobs import constants as managed_job_constants +from sky.jobs import utils as managed_job_utils from sky.skylet import constants as skylet_constants -from sky.spot import constants -from sky.spot import spot_utils from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils @@ -35,18 +35,19 @@ def launch( retry_until_up: bool = False, ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. - """Launch a managed spot job. + """Launch a managed job. - Please refer to the sky.cli.spot_launch for the document. + Please refer to sky.cli.job_launch for documentation. Args: task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a - managed spot job. - name: Name of the spot job. + managed job. + name: Name of the managed job. detach_run: Whether to detach the run. Raises: - ValueError: cluster does not exist. + ValueError: cluster does not exist. Or, the entrypoint is not a valid + chain dag. sky.exceptions.NotSupportedError: the feature is not supported. 
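
`launch()` accepts either a single task or a chain DAG and requires task names, when set, to be unique. The shape of that validation is shown below with toy `Task`/`Dag` stand-ins; they are not `sky.Task`/`sky.Dag`, although the real DAG class does expose `is_chain()` as used above.

.. code-block:: python

   from dataclasses import dataclass, field
   from typing import List, Optional


   @dataclass
   class Task:
       name: Optional[str] = None


   @dataclass
   class Dag:
       tasks: List[Task] = field(default_factory=list)
       # edges[i] lists the indices of the tasks that task i points to.
       edges: List[List[int]] = field(default_factory=list)

       def is_chain(self) -> bool:
           # No task may have more than one outgoing or incoming edge
           # (i.e., no branching and no merging).
           indegree = [0] * len(self.tasks)
           for outs in self.edges:
               if len(outs) > 1:
                   return False
               for j in outs:
                   indegree[j] += 1
           return all(d <= 1 for d in indegree)


   def validate_for_launch(dag: Dag) -> None:
       if not dag.is_chain():
           raise ValueError('Only single-task or chain DAG is allowed for '
                            f'launch. Dag: {dag}')
       seen = set()
       for task in dag.tasks:
           if task.name is None:
               continue  # Unset names are auto-generated later.
           if task.name in seen:
               raise ValueError(f'Task name {task.name!r} is duplicated; '
                                'names must be unique (or left unset).')
           seen.add(task.name)


   if __name__ == '__main__':
       dag = Dag(tasks=[Task('train'), Task('eval')], edges=[[1], []])
       validate_for_launch(dag)   # A two-task chain: passes.
       print('valid chain DAG')
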
""" entrypoint = task @@ -55,8 +56,8 @@ def launch( dag = dag_utils.convert_entrypoint_to_dag(entrypoint) if not dag.is_chain(): with ux_utils.print_exception_no_traceback(): - raise ValueError('Only single-task or chain DAG is allowed for ' - f'sky.spot.launch. Dag:\n{dag}') + raise ValueError('Only single-task or chain DAG is ' + f'allowed for job_launch. Dag: {dag}') dag_utils.maybe_infer_and_fill_dag_and_task_names(dag) @@ -71,28 +72,29 @@ def launch( 'will be auto-generated) .') task_names.add(task_.name) - dag_utils.fill_default_spot_config_in_dag_for_spot_launch(dag) + dag_utils.fill_default_config_in_dag_for_job_launch(dag) for task_ in dag.tasks: controller_utils.maybe_translate_local_file_mounts_and_sync_up( - task_, path='spot') + task_, path='jobs') - with tempfile.NamedTemporaryFile(prefix=f'spot-dag-{dag.name}-', + with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-', mode='w') as f: dag_utils.dump_chain_dag_to_yaml(dag, f.name) - controller_name = spot_utils.SPOT_CONTROLLER_NAME - prefix = constants.SPOT_TASK_YAML_PREFIX + controller = controller_utils.Controllers.JOBS_CONTROLLER + controller_name = controller.value.cluster_name + prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml' remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml' controller_resources = controller_utils.get_controller_resources( - controller_type='spot', + controller=controller_utils.Controllers.JOBS_CONTROLLER, task_resources=sum([list(t.resources) for t in dag.tasks], [])) vars_to_fill = { 'remote_user_yaml_path': remote_user_yaml_path, 'user_yaml_path': f.name, - 'spot_controller': controller_name, - # Note: actual spot cluster name will be - + 'jobs_controller': controller_name, + # Note: actual cluster name will be - 'dag_name': dag.name, 'retry_until_up': retry_until_up, 'remote_user_config_path': remote_user_config_path, @@ -100,27 +102,29 @@ def launch( 'modified_catalogs': service_catalog_common.get_modified_catalog_file_mounts(), **controller_utils.shared_controller_vars_to_fill( - 'spot', + controller_utils.Controllers.JOBS_CONTROLLER, remote_user_config_path=remote_user_config_path, ), } - yaml_path = os.path.join(constants.SPOT_CONTROLLER_YAML_PREFIX, - f'{name}-{dag_uuid}.yaml') - common_utils.fill_template(constants.SPOT_CONTROLLER_TEMPLATE, - vars_to_fill, - output_path=yaml_path) + yaml_path = os.path.join( + managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX, + f'{name}-{dag_uuid}.yaml') + common_utils.fill_template( + managed_job_constants.JOBS_CONTROLLER_TEMPLATE, + vars_to_fill, + output_path=yaml_path) controller_task = task_lib.Task.from_yaml(yaml_path) controller_task.set_resources(controller_resources) - controller_task.spot_dag = dag - assert len(controller_task.resources) == 1 + controller_task.managed_job_dag = dag + assert len(controller_task.resources) == 1, controller_task sky_logging.print( f'{colorama.Fore.YELLOW}' - f'Launching managed spot job {dag.name!r} from spot controller...' + f'Launching managed job {dag.name!r} from jobs controller...' f'{colorama.Style.RESET_ALL}') - sky_logging.print('Launching spot controller...') + sky_logging.print('Launching jobs controller...') sky.launch(task=controller_task, stream_logs=stream_logs, cluster_name=controller_name, @@ -134,9 +138,9 @@ def launch( @usage_lib.entrypoint def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
- """Get statuses of managed spot jobs. + """Get statuses of managed jobs. - Please refer to the sky.cli.spot_queue for the documentation. + Please refer to sky.cli.job_queue for documentation. Returns: [ @@ -148,23 +152,23 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]: 'end_at': (float) timestamp of end, 'duration': (float) duration in seconds, 'recovery_count': (int) Number of retries, - 'status': (sky.spot.SpotStatus) of the job, + 'status': (sky.jobs.ManagedJobStatus) of the job, 'cluster_resources': (str) resources of the cluster, 'region': (str) region of the cluster, } ] Raises: - sky.exceptions.ClusterNotUpError: the spot controller is not up or + sky.exceptions.ClusterNotUpError: the jobs controller is not up or does not exist. - RuntimeError: if failed to get the spot jobs with ssh. + RuntimeError: if failed to get the managed jobs with ssh. """ + jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER stopped_message = '' if not refresh: - stopped_message = 'No in-progress spot jobs.' + stopped_message = 'No in-progress managed jobs.' try: handle = backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, - stopped_message=stopped_message) + controller=jobs_controller_type, stopped_message=stopped_message) except exceptions.ClusterNotUpError as e: if not refresh: raise @@ -176,18 +180,19 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]: 'Restarting controller for latest status...' f'{colorama.Style.RESET_ALL}') - rich_utils.force_update_status('[cyan] Checking spot jobs - restarting ' - 'controller[/]') - handle = sky.start(spot_utils.SPOT_CONTROLLER_NAME) + rich_utils.force_update_status( + '[cyan] Checking managed jobs - restarting ' + 'controller[/]') + handle = sky.start(jobs_controller_type.value.cluster_name) controller_status = status_lib.ClusterStatus.UP - rich_utils.force_update_status('[cyan] Checking spot jobs[/]') + rich_utils.force_update_status('[cyan] Checking managed jobs[/]') assert handle is not None, (controller_status, refresh) backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = spot_utils.SpotCodeGen.get_job_table() + code = managed_job_utils.ManagedJobCodeGen.get_job_table() returncode, job_table_payload, stderr = backend.run_on_head( handle, code, @@ -198,13 +203,13 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]: try: subprocess_utils.handle_returncode(returncode, code, - 'Failed to fetch managed spot jobs', + 'Failed to fetch managed jobs', job_table_payload + stderr, stream_logs=False) except exceptions.CommandError as e: raise RuntimeError(str(e)) from e - jobs = spot_utils.load_spot_job_queue(job_table_payload) + jobs = managed_job_utils.load_managed_job_queue(job_table_payload) if skip_finished: # Filter out the finished jobs. If a multi-task job is partially # finished, we will include all its tasks. @@ -222,18 +227,18 @@ def cancel(name: Optional[str] = None, job_ids: Optional[List[int]] = None, all: bool = False) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. - """Cancel managed spot jobs. + """Cancel managed jobs. - Please refer to the sky.cli.spot_cancel for the document. + Please refer to sky.cli.job_cancel for documentation. Raises: - sky.exceptions.ClusterNotUpError: the spot controller is not up. + sky.exceptions.ClusterNotUpError: the jobs controller is not up. 
RuntimeError: failed to cancel the job. """ job_ids = [] if job_ids is None else job_ids handle = backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, - stopped_message='All managed spot jobs should have finished.') + controller=controller_utils.Controllers.JOBS_CONTROLLER, + stopped_message='All managed jobs should have finished.') job_id_str = ','.join(map(str, job_ids)) if sum([len(job_ids) > 0, name is not None, all]) != 1: @@ -247,12 +252,12 @@ def cancel(name: Optional[str] = None, backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) if all: - code = spot_utils.SpotCodeGen.cancel_jobs_by_id(None) + code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None) elif job_ids: - code = spot_utils.SpotCodeGen.cancel_jobs_by_id(job_ids) + code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids) else: assert name is not None, (job_ids, name, all) - code = spot_utils.SpotCodeGen.cancel_job_by_name(name) + code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name) # The stderr is redirected to stdout returncode, stdout, _ = backend.run_on_head(handle, code, @@ -260,7 +265,7 @@ def cancel(name: Optional[str] = None, stream_logs=False) try: subprocess_utils.handle_returncode(returncode, code, - 'Failed to cancel managed spot job', + 'Failed to cancel managed job', stdout) except exceptions.CommandError as e: with ux_utils.print_exception_no_traceback(): @@ -276,42 +281,49 @@ def cancel(name: Optional[str] = None, @usage_lib.entrypoint def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. - """Tail logs of managed spot jobs. + """Tail logs of managed jobs. - Please refer to the sky.cli.spot_logs for the document. + Please refer to sky.cli.job_logs for documentation. Raises: ValueError: invalid arguments. - sky.exceptions.ClusterNotUpError: the spot controller is not up. + sky.exceptions.ClusterNotUpError: the jobs controller is not up. 
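
`cancel()` accepts exactly one selector (a list of job IDs, a job name, or `all`) and uses the `sum([...]) != 1` idiom above to enforce that before choosing which code-gen payload to run on the controller. The same guard and dispatch, in isolation; the returned strings are placeholders for the generated payloads:

.. code-block:: python

   from typing import List, Optional


   def build_cancel_request(name: Optional[str], job_ids: List[int],
                            all_jobs: bool) -> str:
       """Require exactly one selector, then pick the payload to generate."""
       provided = sum([len(job_ids) > 0, name is not None, all_jobs])
       if provided != 1:
           raise ValueError('Specify exactly one of `job_ids`, `name`, or '
                            f'`all`; got {provided} of them.')
       if all_jobs:
           return 'cancel_jobs_by_id(None)'          # placeholder payload
       if job_ids:
           return f'cancel_jobs_by_id({job_ids!r})'  # placeholder payload
       return f'cancel_job_by_name({name!r})'        # placeholder payload


   if __name__ == '__main__':
       print(build_cancel_request(name=None, job_ids=[3, 4], all_jobs=False))
       try:
           build_cancel_request(name='bert-qa', job_ids=[3], all_jobs=False)
       except ValueError as e:
           print(e)
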
""" - # TODO(zhwu): Automatically restart the spot controller + # TODO(zhwu): Automatically restart the jobs controller + jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER handle = backend_utils.is_controller_accessible( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, - stopped_message=('Please restart the spot controller with ' - f'`sky start {spot_utils.SPOT_CONTROLLER_NAME}`.')) + controller=jobs_controller_type, + stopped_message=( + 'Please restart the jobs controller with ' + f'`sky start {jobs_controller_type.value.cluster_name}`.')) if name is not None and job_id is not None: raise ValueError('Cannot specify both name and job_id.') backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend # Stream the realtime logs - backend.tail_spot_logs(handle, job_id=job_id, job_name=name, follow=follow) - - -spot_launch = common_utils.deprecated_function(launch, - name='sky.spot.launch', - deprecated_name='spot_launch', - removing_version='0.7.0') + backend.tail_managed_job_logs(handle, + job_id=job_id, + job_name=name, + follow=follow) + + +spot_launch = common_utils.deprecated_function( + launch, + name='sky.jobs.launch', + deprecated_name='spot_launch', + removing_version='0.8.0', + override_argument={'use_spot': True}) spot_queue = common_utils.deprecated_function(queue, - name='sky.spot.queue', + name='sky.jobs.queue', deprecated_name='spot_queue', - removing_version='0.7.0') + removing_version='0.8.0') spot_cancel = common_utils.deprecated_function(cancel, - name='sky.spot.cancel', + name='sky.jobs.cancel', deprecated_name='spot_cancel', - removing_version='0.7.0') + removing_version='0.8.0') spot_tail_logs = common_utils.deprecated_function( tail_logs, - name='sky.spot.tail_logs', + name='sky.jobs.tail_logs', deprecated_name='spot_tail_logs', - removing_version='0.7.0') + removing_version='0.8.0') diff --git a/sky/jobs/dashboard/dashboard.py b/sky/jobs/dashboard/dashboard.py new file mode 100644 index 00000000000..89c97274646 --- /dev/null +++ b/sky/jobs/dashboard/dashboard.py @@ -0,0 +1,87 @@ +"""Dashboard for managed jobs based on Flask. + +TODO(zongheng): This is a basic version. In the future we can beef up the web +frameworks used (e.g., +https://github.com/ray-project/ray/tree/master/dashboard/client/src) and/or get +rid of the SSH port-forwarding business (see cli.py's job_dashboard() +comment). +""" +import datetime +import pathlib + +import flask +import yaml + +from sky import jobs as managed_jobs +from sky.utils import common_utils +from sky.utils import controller_utils + +app = flask.Flask(__name__) + + +def _is_running_on_jobs_controller() -> bool: + """Am I running on jobs controller? + + Loads ~/.sky/sky_ray.yml and check cluster_name. + """ + if pathlib.Path('~/.sky/sky_ray.yml').expanduser().exists(): + config = yaml.safe_load( + pathlib.Path('~/.sky/sky_ray.yml').expanduser().read_text()) + cluster_name = config.get('cluster_name', '') + candidate_controller_names = ( + controller_utils.Controllers.JOBS_CONTROLLER.value. + candidate_cluster_names) + # We use startswith instead of exact match because the cluster name in + # the yaml file is cluster_name_on_cloud which may have additional + # suffices. + return any( + cluster_name.startswith(name) + for name in candidate_controller_names) + return False + + +@app.route('/') +def home(): + if not _is_running_on_jobs_controller(): + # Experimental: run on laptop (refresh is very slow). 
+ all_managed_jobs = managed_jobs.queue(refresh=True, skip_finished=False) + else: + job_table = managed_jobs.dump_managed_job_queue() + all_managed_jobs = managed_jobs.load_managed_job_queue(job_table) + + timestamp = datetime.datetime.now(datetime.timezone.utc) + rows = managed_jobs.format_job_table(all_managed_jobs, + show_all=True, + return_rows=True) + # Add an empty column for the dropdown button. This will be added in the + # jobs/templates/index.html file. + rows = [[''] + row for row in rows] + + # FIXME(zongheng): make the job table/queue funcs return structured info so + # that we don't have to do things like row[-5] below. + columns = [ + '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration', + 'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region', + 'Failure' + ] + if rows and len(rows[0]) != len(columns): + raise RuntimeError( + 'Dashboard code and managed job queue code are out of sync.') + + # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'. + for row in rows: + row[-5] = common_utils.remove_color(row[-5]) + # Remove filler rows ([''], ..., ['-']). + rows = [row for row in rows if ''.join(map(str, row)) != ''] + + rendered_html = flask.render_template( + 'index.html', + columns=columns, + rows=rows, + last_updated_timestamp=timestamp, + ) + return rendered_html + + +if __name__ == '__main__': + app.run() diff --git a/sky/spot/dashboard/static/favicon.ico b/sky/jobs/dashboard/static/favicon.ico similarity index 100% rename from sky/spot/dashboard/static/favicon.ico rename to sky/jobs/dashboard/static/favicon.ico diff --git a/sky/spot/dashboard/templates/index.html b/sky/jobs/dashboard/templates/index.html similarity index 50% rename from sky/spot/dashboard/templates/index.html rename to sky/jobs/dashboard/templates/index.html index f8267cd3e5f..af4f5708bce 100644 --- a/sky/spot/dashboard/templates/index.html +++ b/sky/jobs/dashboard/templates/index.html @@ -6,7 +6,8 @@ SkyPilot Dashboard - +