Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Add api endpoint and webportal page of job retry history (#3831)
Browse files Browse the repository at this point in the history
  • Loading branch information
debuggy authored Nov 20, 2019
1 parent bbb9816 commit 15ed057
Show file tree
Hide file tree
Showing 24 changed files with 1,859 additions and 66 deletions.
166 changes: 166 additions & 0 deletions docs/rest-server/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -2889,6 +2889,172 @@ Status: 500
}
```

### `GET /api/v2/jobs/:frameworkName/jobAttempts/healthz`

Check whether the job attempts API is healthy.

*Request*

```json
GET /api/v2/jobs/:frameworkName/jobAttempts/healthz
```

*Response if succeeded*

```json
Status: 200
ok
```

*Response if the job attempts API is not working*

```json
Status: 501
Not healthy
```

### `GET /api/v2/jobs/:frameworkName/jobAttempts`

Get all attempts of a certain job.

*Request*

```json
GET /api/v2/jobs/:frameworkName/jobAttempts
```

*Response if succeeded*

```json
Status: 200

[
{
"jobName": string,
"frameworkName": string,
"userName": string,
"state": "FAILED",
"originState": "Completed",
"maxAttemptCount": 4,
"attemptIndex": 3,
"jobStartedTime": 1572592684000,
"attemptStartedTime": 1572592813000,
"attemptCompletedTime": 1572592840000,
"exitCode": 255,
"exitPhrase": "PAIRuntimeUnknownFailed",
"exitType": "Failed",
"diagnosticsSummary": string,
"totalGpuNumber": 1,
"totalTaskNumber": 1,
"totalTaskRoleNumber": 1,
"taskRoles": {
"taskrole": {
"taskRoleStatus": {
"name": "taskrole"
},
"taskStatuses": [
{
"taskIndex": 0,
"taskState": "FAILED",
"containerId": uuid string,
"containerIp": ip string,
"containerGpus": null,
"containerLog": url string,
"containerExitCode": 255
}
]
}
},
"isLatest": true
}
]
```

*Response if attempts not found*

```json
Status: 404

Not Found
```

*Response if a server error occurred*

```json
Status: 501

Internal Error
```

### `GET /api/v2/jobs/:frameworkName/jobAttempts/:attemptIndex`

Get a specific attempt by attempt index.

*Request*

```json
GET /api/v2/jobs/:frameworkName/jobAttempts/:attemptIndex
```

*Response if succeeded*

```json
Status: 200

{
"jobName": string,
"frameworkName": string,
"userName": string,
"state": "FAILED",
"originState": "Completed",
"maxAttemptCount": 4,
"attemptIndex": 3,
"jobStartedTime": 1572592684000,
"attemptStartedTime": 1572592813000,
"attemptCompletedTime": 1572592840000,
"exitCode": 255,
"exitPhrase": "PAIRuntimeUnknownFailed",
"exitType": "Failed",
"diagnosticsSummary": string,
"totalGpuNumber": 1,
"totalTaskNumber": 1,
"totalTaskRoleNumber": 1,
"taskRoles": {
"taskrole": {
"taskRoleStatus": {
"name": "taskrole"
},
"taskStatuses": [
{
"taskIndex": 0,
"taskState": "FAILED",
"containerId": uuid string,
"containerIp": ip string,
"containerGpus": null,
"containerLog": url string,
"containerExitCode": 255
}
]
}
},
"isLatest": true
}
```

*Response if the attempt is not found*

```json
Status: 404

Not Found
```

*Response if a server error occurred*

```json
Status: 501

Internal Error
```

## About legacy jobs

Because [Framework ACL](../../subprojects/frameworklauncher/yarn/doc/USERMANUAL.md#Framework_ACL) is enabled as of this version,
Expand Down
1 change: 1 addition & 0 deletions src/rest-server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"node": "^8.9.0"
},
"dependencies": {
"@elastic/elasticsearch": "^7.4.0",
"ajv": "^6.10.0",
"ajv-merge-patch": "~4.1.0",
"async": "~2.5.0",
Expand Down
45 changes: 45 additions & 0 deletions src/rest-server/src/controllers/v2/job-attempt.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

// module dependencies
const asyncHandler = require('@pai/middlewares/v2/asyncHandler');
const jobAttempt = require('@pai/models/v2/job-attempt.js');

/**
 * Express handler: reports whether the job attempt model considers itself healthy.
 *
 * Awaits `jobAttempt.healthCheck()`; a truthy result is answered with
 * 200 and the body 'ok', anything else with 501 and the body 'Not healthy'.
 */
const healthCheck = asyncHandler(async (req, res) => {
  if (await jobAttempt.healthCheck()) {
    res.status(200).send('ok');
    return;
  }
  res.status(501).send('Not healthy');
});

/**
 * Express handler: lists the attempts of the framework named in the route.
 *
 * Passes the `frameworkName` route parameter to `jobAttempt.list()` and relays
 * the returned `status` as the HTTP status and `data` as the JSON body.
 */
const list = asyncHandler(async (req, res) => {
  const {frameworkName} = req.params;
  const {status, data} = await jobAttempt.list(frameworkName);
  res.status(status).json(data);
});

/**
 * Express handler: fetches a single attempt of a framework.
 *
 * The `jobAttemptIndex` route parameter is converted with `Number()` before it
 * is handed to `jobAttempt.get()` together with `frameworkName`; the returned
 * `status` becomes the HTTP status and `data` the JSON body.
 */
const get = asyncHandler(async (req, res) => {
  const {frameworkName} = req.params;
  // Route params arrive as strings; the model is called with a number.
  const attemptIndex = Number(req.params.jobAttemptIndex);
  const {status, data} = await jobAttempt.get(frameworkName, attemptIndex);
  res.status(status).json(data);
});

// Handlers exposed by this controller module.
module.exports = {healthCheck, list, get};
Loading

0 comments on commit 15ed057

Please sign in to comment.