Skip to content

Commit

Permalink
Update checkpoint format (#619)
Browse files Browse the repository at this point in the history
* update upload_training_files format

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update test api

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* update checkpoint

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* update checkpoint response

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* add defi

* add def

* update output_dir

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* update checkoutPoint definition

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* map CHECKPOINT_ID and Model_PATH

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* delete test code

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* Update api_protocol.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* delete runtime_env

* update FineTuningJobCheckpoint format

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update step_number to optional

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* Update the checkpoint return format

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Yue, Wenjiao <wenjiao.yue@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
WenjiaoYue and pre-commit-ci[bot] authored Sep 5, 2024
1 parent 7d2cd6b commit 8369fbf
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 5 deletions.
42 changes: 42 additions & 0 deletions comps/cores/proto/api_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,3 +797,45 @@ class FileObject(BaseModel):
Supported values are assistants, assistants_output, batch, batch_output, fine-tune, fine-tune-results and vision.
"""


class Metrics(BaseModel):
    """Optional loss / token-accuracy figures reported with a fine-tuning checkpoint.

    Every field is declared ``Optional[float]`` and defaults to ``None``,
    so the model can be built with any subset of the values.
    """

    # Fields carrying the ``full_valid_`` prefix.
    full_valid_loss: Optional[float] = None
    full_valid_mean_token_accuracy: Optional[float] = None

    # Step value; note it is typed as a float here, not an int.
    step: Optional[float] = None

    # Fields carrying the ``train_`` prefix.
    train_loss: Optional[float] = None
    train_mean_token_accuracy: Optional[float] = None

    # Fields carrying the ``valid_`` prefix.
    valid_loss: Optional[float] = None
    valid_mean_token_accuracy: Optional[float] = None


class FineTuningJobCheckpoint(BaseModel):
    """A single checkpoint record produced by a fine-tuning job.

    ``id``, ``created_at``, ``fine_tuned_model_checkpoint``,
    ``fine_tuning_job_id`` and ``object`` are required; ``metrics`` and
    ``step_number`` are optional and default to ``None``.
    """

    id: str
    """The checkpoint identifier, which can be referenced in the API endpoints."""

    created_at: int
    """The Unix timestamp (in seconds) for when the checkpoint was created."""

    fine_tuned_model_checkpoint: str
    """The name of the fine-tuned checkpoint model that is created."""

    # Declared exactly once: the previous revision repeated this field
    # (and its docstring) twice in a row.
    fine_tuning_job_id: str
    """The name of the fine-tuning job that this checkpoint was created from."""

    metrics: Optional[Metrics] = None
    """Metrics at the step number during the fine-tuning job."""

    object: Literal["fine_tuning.job.checkpoint"]
    """The object type, which is always "fine_tuning.job.checkpoint"."""

    step_number: Optional[int] = None
    """The step number that the checkpoint was created at."""
2 changes: 1 addition & 1 deletion comps/finetuning/finetuning_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ async def upload_training_files(request: UploadFileRequest = Depends(upload_file
)
def list_checkpoints(request: FineTuningJobIDRequest):
checkpoints = handle_list_finetuning_checkpoints(request)
return {"status": 200, "checkpoints": str(checkpoints)}
return checkpoints


if __name__ == "__main__":
Expand Down
22 changes: 18 additions & 4 deletions comps/finetuning/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from comps.cores.proto.api_protocol import (
FileObject,
FineTuningJob,
FineTuningJobCheckpoint,
FineTuningJobIDRequest,
FineTuningJobList,
FineTuningJobsRequest,
Expand All @@ -38,13 +39,17 @@
os.mkdir(OUTPUT_DIR)

FineTuningJobID = str
CheckpointID = str
CheckpointPath = str

CHECK_JOB_STATUS_INTERVAL = 5 # Check every 5 secs

global ray_client
ray_client: JobSubmissionClient = None

running_finetuning_jobs: Dict[FineTuningJobID, FineTuningJob] = {}
finetuning_job_to_ray_job: Dict[FineTuningJobID, str] = {}
checkpoint_id_to_checkpoint_path: Dict[CheckpointID, CheckpointPath] = {}


# Add a background task to periodically update job status
Expand Down Expand Up @@ -117,8 +122,6 @@ def handle_create_finetuning_jobs(request: FineTuningParams, background_tasks: B
ray_job_id = ray_client.submit_job(
# Entrypoint shell command to execute
entrypoint=f"python finetune_runner.py --config_file {finetune_config_file}",
# Path to the local directory that contains the script.py file
runtime_env={"working_dir": "./", "excludes": [f"{OUTPUT_DIR}"]},
)

logger.info(f"Submitted Ray job: {ray_job_id} ...")
Expand Down Expand Up @@ -183,10 +186,21 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
job = running_finetuning_jobs.get(fine_tuning_job_id)
if job is None:
raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!")
output_dir = os.path.join(JOBS_PATH, job.id)
output_dir = os.path.join(OUTPUT_DIR, job.id)
checkpoints = []
if os.path.exists(output_dir):
checkpoints = os.listdir(output_dir)
# Iterate over the contents of the directory and add an entry for each
for _ in os.listdir(output_dir): # Loop over directory contents
checkpointsResponse = FineTuningJobCheckpoint(
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
created_at=int(time.time()), # Use the current timestamp
fine_tuned_model_checkpoint=output_dir, # Directory path itself
fine_tuning_job_id=fine_tuning_job_id,
object="fine_tuning.job.checkpoint",
)
checkpoints.append(checkpointsResponse)
checkpoint_id_to_checkpoint_path[checkpointsResponse.id] = checkpointsResponse.fine_tuned_model_checkpoint

return checkpoints


Expand Down

0 comments on commit 8369fbf

Please sign in to comment.