Skip to content

Commit

Permalink
Backport: Configurable status when dependency times out tony-framewor…
Browse files Browse the repository at this point in the history
  • Loading branch information
zuston committed Feb 9, 2022
1 parent 9869f59 commit ccee835
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ public static String getContainerDockerMountKey() {
public static final String GROUP_REGEX = TONY_APPLICATION_PREFIX + "group\\.([A-Za-z]+)$";
public static final String GROUP_DEPEND_TIMEOUT_REGEX =
TONY_APPLICATION_PREFIX + "dependency\\.([A-Za-z]+)\\.timeout\\.after\\.([A-Za-z]+)$";
public static final String GROUP_DEPEND_TIMEOUT_IGNORE_REGEX =
TONY_APPLICATION_PREFIX + "dependency\\.([A-Za-z]+)\\.timeout\\.after\\.([A-Za-z]+)$.ignored";

public static String getGroupKey(String groupName) {
return String.format(TONY_APPLICATION_PREFIX + "group.%s", groupName);
Expand All @@ -353,4 +355,8 @@ public static String getGroupKey(String groupName) {
public static String getGroupDependentKey(String grp, String dependentGrp) {
return String.format(TONY_APPLICATION_PREFIX + "dependency.%s.timeout.after.%s", grp, dependentGrp);
}

public static String getGroupDependentIgnoredKey(String roleType, String dependentGrp) {
return String.format(TONY_APPLICATION_PREFIX + "dependency.%s.timeout.after.%s.ignored", roleType, dependentGrp);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import com.linkedin.tony.util.Utils;

import static com.linkedin.tony.Constants.SIDECAR_TB_ROLE_NAME;
import static com.linkedin.tony.TonyConfigurationKeys.getGroupDependentIgnoredKey;

public abstract class MLGenericRuntime implements FrameworkRuntime {
private static final long REGISTRATION_STATUS_INTERVAL_MS = 15 * 1000;
Expand Down Expand Up @@ -137,9 +138,16 @@ public boolean isHealthy(Configuration tonyConf) {
* chief/workers are finished, the mechanism of dependency group timeout will make job failed.
*
* Dependency group timeout configuration as follows:
*
* ```
* tony.application.group.A = worker,chief
* tony.application.dependency.evaluator.timeout.after.A = 3600
* ```
*
* And in some of the cases, we don't want to fail the whole job even though a dependency times out.
* For example, if chief succeeded and there is a worker hanging for 1 hour,
* users can configure the job to still pass. So it introduces the new config of
* `tony.application.dependency.[X].timeout.after.[GROUP].ignored = true`, and more details could be
* found in https://github.com/tony-framework/TonY/issues/641.
*
*/
String errorMsg = null;
Expand Down Expand Up @@ -231,10 +239,19 @@ protected String groupDependencyTimeout(Configuration tonyConf) {
log.info("Running job type: " + runningTaskType + ", all dependent task finished: " + allDependentTaskFinished);

if (System.currentTimeMillis() - latestEndTimeInAllDependentTasks > timeout) {
return String.format("Jobtype: %s runs exceeded timeout(%s sec) because it's "
+ "dependent group: %s (task set: [%s]) has been finished",
runningTaskType, dependentGroupPair.getValue(), dependentGroupName,
StringUtils.join(grpWithMembersIndex.get(dependentGroupName), ","));
String ignoredTaskTypeKey = getGroupDependentIgnoredKey(runningTaskType, dependentGroupName);
boolean ignoreTimeout = tonyConf.getBoolean(ignoredTaskTypeKey, false);
if (!ignoreTimeout) {
return String.format("Task type: %s runs exceeded timeout because it's "
+ "dependent group: %s (task set: [%s]) has been finished.",
runningTaskType, dependentGroupName,
StringUtils.join(grpWithMembersIndex.get(dependentGroupName), ","));
}

log.info(
String.format("Task type: %s is marked as untracked.", runningJobTypes)
);
session.makeTaskTypeUntracked(runningTaskType);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
Expand All @@ -39,6 +40,8 @@

import static com.linkedin.tony.Constants.CHIEF_JOB_NAME;
import static com.linkedin.tony.Constants.WORKER_JOB_NAME;
import static com.linkedin.tony.TonyConfigurationKeys.UNTRACKED_JOBTYPES;
import static com.linkedin.tony.util.Utils.getUntrackedJobTypes;


/**
Expand Down Expand Up @@ -639,4 +642,11 @@ public int getNumRegisteredTasks() {
public Set<String> getRegisteredTasks() {
return registeredTasks;
}

public void makeTaskTypeUntracked(String taskType) {
String[] defaultUntrackedTypes = getUntrackedJobTypes(tonyConf);
List<String> untrackedList = Arrays.stream(defaultUntrackedTypes).collect(Collectors.toList());
untrackedList.add(taskType);
tonyConf.set(UNTRACKED_JOBTYPES, StringUtils.join(untrackedList, ","));
}
}

0 comments on commit ccee835

Please sign in to comment.