-
Notifications
You must be signed in to change notification settings - Fork 15k
KAFKA-15022: [3/N] use graph to compute rack aware assignment for active stateful tasks #14030
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d9f6fc7
7b6d181
8d412ee
04e9712
77e7e6f
4976247
e901d35
296c70b
d0e2f20
acc5ab6
a553afe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,12 +16,17 @@ | |
| */ | ||
| package org.apache.kafka.streams.processor.internals.assignment; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Map.Entry; | ||
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.SortedMap; | ||
| import java.util.SortedSet; | ||
| import java.util.UUID; | ||
| import org.apache.kafka.common.Cluster; | ||
| import org.apache.kafka.common.Node; | ||
|
|
@@ -39,28 +44,30 @@ | |
| public class RackAwareTaskAssignor { | ||
| private static final Logger log = LoggerFactory.getLogger(RackAwareTaskAssignor.class); | ||
|
|
||
| private static final int SOURCE_ID = -1; | ||
|
|
||
| private final Cluster fullMetadata; | ||
| private final Map<TaskId, Set<TopicPartition>> partitionsForTask; | ||
| private final Map<UUID, Map<String, Optional<String>>> processRacks; | ||
| private final Map<UUID, Map<String, Optional<String>>> racksForProcess; | ||
| private final AssignmentConfigs assignmentConfigs; | ||
| private final Map<TopicPartition, Set<String>> racksForPartition; | ||
| private final InternalTopicManager internalTopicManager; | ||
|
|
||
| public RackAwareTaskAssignor(final Cluster fullMetadata, | ||
| final Map<TaskId, Set<TopicPartition>> partitionsForTask, | ||
| final Map<Subtopology, Set<TaskId>> tasksForTopicGroup, | ||
| final Map<UUID, Map<String, Optional<String>>> processRacks, | ||
| final Map<UUID, Map<String, Optional<String>>> racksForProcess, | ||
| final InternalTopicManager internalTopicManager, | ||
| final AssignmentConfigs assignmentConfigs) { | ||
| this.fullMetadata = fullMetadata; | ||
| this.partitionsForTask = partitionsForTask; | ||
| this.processRacks = processRacks; | ||
| this.racksForProcess = racksForProcess; | ||
| this.internalTopicManager = internalTopicManager; | ||
| this.assignmentConfigs = assignmentConfigs; | ||
| this.racksForPartition = new HashMap<>(); | ||
| } | ||
|
|
||
| public synchronized boolean canEnableRackAwareAssignorForActiveTasks() { | ||
| public synchronized boolean canEnableRackAwareAssignor() { | ||
| /* | ||
| TODO: enable this after we add the config | ||
| if (StreamsConfig.RACK_AWARE_ASSIGNMENT_STRATEGY_NONE.equals(assignmentConfigs.rackAwareAssignmentStrategy)) { | ||
|
|
@@ -74,11 +81,7 @@ public synchronized boolean canEnableRackAwareAssignorForActiveTasks() { | |
| } | ||
|
|
||
| return validateTopicPartitionRack(); | ||
| } | ||
|
|
||
| public boolean canEnableRackAwareAssignorForStandbyTasks() { | ||
| // TODO | ||
| return false; | ||
| // TODO: add changelog topic, standby task validation | ||
| } | ||
|
|
||
| // Visible for testing. This method also checks if all TopicPartitions exist in cluster | ||
|
|
@@ -159,7 +162,7 @@ public boolean validateClientRack() { | |
| * 1. RackId exist for all clients | ||
| * 2. Different consumerId for same process should have same rackId | ||
| */ | ||
| for (final Map.Entry<UUID, Map<String, Optional<String>>> entry : processRacks.entrySet()) { | ||
| for (final Map.Entry<UUID, Map<String, Optional<String>>> entry : racksForProcess.entrySet()) { | ||
| final UUID processId = entry.getKey(); | ||
| KeyValue<String, String> previousRackInfo = null; | ||
| for (final Map.Entry<String, Optional<String>> rackEntry : entry.getValue().entrySet()) { | ||
|
|
@@ -185,4 +188,213 @@ public boolean validateClientRack() { | |
| } | ||
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * Computes the cost of placing {@code taskId} on the client {@code processId}. | ||
| * The cost is {@code trafficCost} for every TopicPartition of the task whose racks do not contain the | ||
| * client's rack, plus {@code nonOverlapCost} when the task is not part of the client's current assignment. | ||
| * | ||
| * @param taskId Task whose placement is being priced | ||
| * @param processId Client the task would be placed on | ||
| * @param inCurrentAssignment Whether the task is currently assigned to this client | ||
| * @param trafficCost Cost added per TopicPartition that has no replica rack matching the client rack | ||
| * @param nonOverlapCost Cost added when the task is not currently assigned to this client | ||
| * @return Total cost of the placement | ||
| * @throws IllegalStateException if the client is unknown, has no rack, the task has no TopicPartitions, | ||
| * or a TopicPartition has no rack information | ||
| */ | ||
| private int getCost(final TaskId taskId, final UUID processId, final boolean inCurrentAssignment, final int trafficCost, final int nonOverlapCost) { | ||
| final Map<String, Optional<String>> clientRacks = racksForProcess.get(processId); | ||
| if (clientRacks == null) { | ||
| throw new IllegalStateException("Client " + processId + " doesn't exist in racksForProcess"); | ||
| } | ||
| // Use the first rack that is present among the client's entries | ||
| final Optional<Optional<String>> clientRackOpt = clientRacks.values().stream().filter(Optional::isPresent).findFirst(); | ||
| if (!clientRackOpt.isPresent() || !clientRackOpt.get().isPresent()) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. This is to mute some warning in Intellij, Checkstyle or spotBugs |
||
| throw new IllegalStateException("Client " + processId + " doesn't have rack configured. Maybe forgot to call canEnableRackAwareAssignor first"); | ||
| } | ||
|
|
||
| final String clientRack = clientRackOpt.get().get(); | ||
| final Set<TopicPartition> topicPartitions = partitionsForTask.get(taskId); | ||
| if (topicPartitions == null || topicPartitions.isEmpty()) { | ||
| throw new IllegalStateException("Task " + taskId + " has no TopicPartitions"); | ||
| } | ||
|
|
||
| int cost = 0; | ||
| for (final TopicPartition tp : topicPartitions) { | ||
| final Set<String> tpRacks = racksForPartition.get(tp); | ||
| if (tpRacks == null || tpRacks.isEmpty()) { | ||
| throw new IllegalStateException("TopicPartition " + tp + " has no rack information. Maybe forgot to call canEnableRackAwareAssignor first"); | ||
| } | ||
| // A partition with no replica in the client's rack is charged the traffic cost | ||
| if (!tpRacks.contains(clientRack)) { | ||
| cost += trafficCost; | ||
| } | ||
| } | ||
|
|
||
| if (!inCurrentAssignment) { | ||
| cost += nonOverlapCost; | ||
|
||
| } | ||
|
|
||
| return cost; | ||
| } | ||
|
|
||
| private static int getSinkID(final List<UUID> clientList, final List<TaskId> taskIdList) { | ||
| return clientList.size() + taskIdList.size(); | ||
| } | ||
|
|
||
| /** | ||
| * For testing. canEnableRackAwareAssignor must be called first. | ||
| * Builds the task/client graph from the assignment currently recorded in {@code clientStates} and returns | ||
| * {@code graph.totalCost()} without running the min-cost flow solver. | ||
| * NOTE(review): presumably this is the cost of the existing assignment - confirm against Graph#totalCost. | ||
| * | ||
| * @param clientStates Client states holding the current assignment | ||
| * @param activeTasks Active tasks to include in the graph | ||
| * @param trafficCost Cost of cross rack traffic for each TopicPartition | ||
| * @param nonOverlapCost Cost applied when a task is not in a client's current assignment | ||
| * @return Value of {@code totalCost()} for the constructed graph | ||
| */ | ||
| long activeTasksCost(final SortedMap<UUID, ClientState> clientStates, final SortedSet<TaskId> activeTasks, final int trafficCost, final int nonOverlapCost) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add JavaDocs? It's a little unclear what this method does. Also maybe move |
||
| final List<UUID> clientList = new ArrayList<>(clientStates.keySet()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems Looking into
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. List is trying to make it deterministic. |
||
| final List<TaskId> taskIdList = new ArrayList<>(activeTasks); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question as for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is also to make it deterministic. |
||
| // The two fresh maps receive the graph builder's bookkeeping output, which is not needed here | ||
| final Graph<Integer> graph = constructActiveTaskGraph(activeTasks, clientList, taskIdList, | ||
| clientStates, new HashMap<>(), new HashMap<>(), trafficCost, nonOverlapCost); | ||
| return graph.totalCost(); | ||
| } | ||
|
|
||
| /** | ||
| * Re-assigns active tasks across clients for rack awareness by solving a min-cost flow problem; | ||
| * {@code clientStates} is updated in place. canEnableRackAwareAssignor must be called first. | ||
| * <p> | ||
| * {@code trafficCost} and {@code nonOverlapCost} balance cross rack traffic optimization against task movement. | ||
| * A larger {@code trafficCost} makes an assignment with less cross rack traffic more likely, but tasks may be | ||
| * shuffled a lot across clients. A larger {@code nonOverlapCost} makes an assignment similar to the input | ||
| * assignment more likely, but cross rack traffic can be higher. In the extreme cases: with {@code nonOverlapCost} | ||
| * set to 0 and {@code trafficCost} set to a positive value, the computed assignment will be minimum for cross | ||
| * rack traffic; with {@code trafficCost} set to 0 and {@code nonOverlapCost} set to a positive value, the | ||
| * computed assignment should be the same as the input. | ||
| * | ||
| * @param clientStates Client states | ||
| * @param activeTasks Tasks to reassign if needed. They must be assigned already in clientStates | ||
| * @param trafficCost Cost of cross rack traffic for each TopicPartition | ||
| * @param nonOverlapCost Cost of assigning a task to a different client | ||
| * @return Total cost after optimization, or 0 if {@code activeTasks} is empty | ||
| */ | ||
| public long optimizeActiveTasks(final SortedMap<UUID, ClientState> clientStates, | ||
| final SortedSet<TaskId> activeTasks, | ||
| final int trafficCost, | ||
| final int nonOverlapCost) { | ||
| // Nothing to optimize | ||
| if (activeTasks.isEmpty()) { | ||
| return 0; | ||
| } | ||
|
|
||
| // Lists fix the position, and therefore the graph node id, of every client and task | ||
| final List<UUID> orderedClients = new ArrayList<>(clientStates.keySet()); | ||
| final List<TaskId> orderedTasks = new ArrayList<>(activeTasks); | ||
|
|
||
| // Filled in while the graph is built: original owner per task and original task count per client | ||
| final Map<TaskId, UUID> originalOwnerByTask = new HashMap<>(); | ||
| final Map<UUID, Integer> originalTaskCountByClient = new HashMap<>(); | ||
| final Graph<Integer> flowGraph = constructActiveTaskGraph(activeTasks, orderedClients, orderedTasks, | ||
| clientStates, originalOwnerByTask, originalTaskCountByClient, trafficCost, nonOverlapCost); | ||
|
|
||
| flowGraph.solveMinCostFlow(); | ||
| final long optimizedCost = flowGraph.totalCost(); | ||
|
|
||
| assignActiveTaskFromMinCostFlow(flowGraph, activeTasks, orderedClients, orderedTasks, | ||
| clientStates, originalTaskCountByClient, originalOwnerByTask); | ||
|
|
||
| return optimizedCost; | ||
| } | ||
|
|
||
| /** | ||
| * Builds the flow graph describing the current active task assignment. | ||
| * Task nodes take ids {@code 0 .. taskIdList.size() - 1}, client nodes follow at | ||
| * {@code taskIdList.size() + clientIndex}, the source is {@code SOURCE_ID} and the sink id comes from | ||
| * {@code getSinkID}. Every task is connected to every client with capacity 1, the cost from {@code getCost}, | ||
| * and flow 1 only for the client that currently holds the task. | ||
| * As a side effect, {@code taskClientMap} receives the current owner of every task and | ||
| * {@code originalAssignedTaskNumber} receives the number of the given tasks held by each client. | ||
| * | ||
| * @throws IllegalArgumentException if a task is assigned to more than one client or to none | ||
| */ | ||
| private Graph<Integer> constructActiveTaskGraph(final SortedSet<TaskId> activeTasks, | ||
| final List<UUID> clientList, | ||
| final List<TaskId> taskIdList, | ||
| final Map<UUID, ClientState> clientStates, | ||
| final Map<TaskId, UUID> taskClientMap, | ||
| final Map<UUID, Integer> originalAssignedTaskNumber, | ||
| final int trafficCost, | ||
| final int nonOverlapCost) { | ||
| final Graph<Integer> graph = new Graph<>(); | ||
| final int taskCount = taskIdList.size(); | ||
|
|
||
| // Record how many of the given tasks each client holds before optimization | ||
| for (final TaskId taskId : activeTasks) { | ||
| clientStates.forEach((processId, state) -> { | ||
| if (state.hasAssignedTask(taskId)) { | ||
| originalAssignedTaskNumber.merge(processId, 1, Integer::sum); | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| // Node ids follow list positions so that task and client node ids in the graph are deterministic | ||
| int taskNodeId = 0; | ||
| for (final TaskId taskId : taskIdList) { | ||
| int clientNodeId = taskCount; | ||
| for (final UUID processId : clientList) { | ||
| final boolean currentlyAssigned = clientStates.get(processId).hasAssignedTask(taskId); | ||
| final int cost = getCost(taskId, processId, currentlyAssigned, trafficCost, nonOverlapCost); | ||
| if (currentlyAssigned) { | ||
| if (taskClientMap.containsKey(taskId)) { | ||
| throw new IllegalArgumentException("Task " + taskId + " assigned to multiple clients " | ||
| + processId + ", " + taskClientMap.get(taskId)); | ||
| } | ||
| taskClientMap.put(taskId, processId); | ||
| } | ||
|
|
||
| graph.addEdge(taskNodeId, clientNodeId, 1, cost, currentlyAssigned ? 1 : 0); | ||
| clientNodeId++; | ||
| } | ||
| if (!taskClientMap.containsKey(taskId)) { | ||
| throw new IllegalArgumentException("Task " + taskId + " not assigned to any client"); | ||
| } | ||
| taskNodeId++; | ||
| } | ||
|
|
||
| // Source feeds every task node with capacity 1, cost 0 and flow 1 | ||
| final int sinkId = getSinkID(clientList, taskIdList); | ||
| for (int sourceTargetId = 0; sourceTargetId < taskCount; sourceTargetId++) { | ||
| graph.addEdge(SOURCE_ID, sourceTargetId, 1, 0, 1); | ||
| } | ||
|
|
||
| // A client that originally had no task gets capacity 0 and therefore stays at 0 tasks, even if | ||
| // assigning to it would lower the traffic cost. This maintains the original assigned task count. | ||
| int sinkFeederId = taskCount; | ||
| for (final UUID processId : clientList) { | ||
| final int capacity = originalAssignedTaskNumber.getOrDefault(processId, 0); | ||
| // Flow equals capacity for edges to the sink | ||
| graph.addEdge(sinkFeederId, sinkId, capacity, 0, capacity); | ||
| sinkFeederId++; | ||
| } | ||
|
|
||
| graph.setSourceNode(SOURCE_ID); | ||
| graph.setSinkNode(sinkId); | ||
|
|
||
| return graph; | ||
| } | ||
|
|
||
| /** | ||
| * Applies the flow found in {@code graph} to {@code clientStates}. | ||
| * For every task node, an outgoing edge with positive flow identifies the client that should own the task; | ||
| * when that client differs from the original owner recorded in {@code taskClientMap}, the task is unassigned | ||
| * from the original owner and assigned to the new client. Afterwards the result is validated: the number of | ||
| * assigned tasks must equal {@code activeTasks.size()}, and every client must hold the same number of the | ||
| * given tasks as recorded in {@code originalAssignedTaskNumber}. | ||
| * | ||
| * @throws IllegalStateException if any of the validations fails | ||
| */ | ||
| private void assignActiveTaskFromMinCostFlow(final Graph<Integer> graph, | ||
| final SortedSet<TaskId> activeTasks, | ||
| final List<UUID> clientList, | ||
| final List<TaskId> taskIdList, | ||
| final Map<UUID, ClientState> clientStates, | ||
| final Map<UUID, Integer> originalAssignedTaskNumber, | ||
| final Map<TaskId, UUID> taskClientMap) { | ||
| int tasksAssigned = 0; | ||
| for (int taskNodeId = 0; taskNodeId < taskIdList.size(); taskNodeId++) { | ||
| final TaskId taskId = taskIdList.get(taskNodeId); | ||
| final Map<Integer, Graph<Integer>.Edge> edges = graph.edges(taskNodeId); | ||
| for (final Graph<Integer>.Edge edge : edges.values()) { | ||
| // An edge carrying flow points at the client node chosen for this task | ||
| if (edge.flow > 0) { | ||
| tasksAssigned++; | ||
| // Client node ids start right after the task node ids | ||
| final int clientIndex = edge.destination - taskIdList.size(); | ||
| final UUID processId = clientList.get(clientIndex); | ||
| final UUID originalProcessId = taskClientMap.get(taskId); | ||
|
|
||
| // Don't need to assign this task to other client | ||
| if (processId.equals(originalProcessId)) { | ||
| break; | ||
| } | ||
|
|
||
| clientStates.get(originalProcessId).unassignActive(taskId); | ||
| clientStates.get(processId).assignActive(taskId); | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we Or even replace the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah. There should be only one edge. I didn't break here for the validations below to catch anything wrong |
||
| } | ||
| } | ||
|
|
||
| // Validate task assigned | ||
| if (tasksAssigned != activeTasks.size()) { | ||
| throw new IllegalStateException("Computed active task assignment number " | ||
| + tasksAssigned + " is different size " + activeTasks.size()); | ||
| } | ||
|
|
||
| // Validate original assigned task number matches | ||
| final Map<UUID, Integer> assignedTaskNumber = new HashMap<>(); | ||
| for (final TaskId taskId : activeTasks) { | ||
| for (final Entry<UUID, ClientState> clientState : clientStates.entrySet()) { | ||
| if (clientState.getValue().hasAssignedTask(taskId)) { | ||
| assignedTaskNumber.merge(clientState.getKey(), 1, Integer::sum); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // The set of clients holding any of the given tasks must not grow or shrink | ||
| if (originalAssignedTaskNumber.size() != assignedTaskNumber.size()) { | ||
| throw new IllegalStateException("There are " + originalAssignedTaskNumber.size() + " clients have " | ||
| + " active tasks before assignment, but " + assignedTaskNumber.size() + " clients have" | ||
| + " active tasks after assignment"); | ||
| } | ||
|
|
||
| // Each client must end up with exactly as many of the given tasks as it started with | ||
| for (final Entry<UUID, Integer> originalCapacity : originalAssignedTaskNumber.entrySet()) { | ||
| final int capacity = assignedTaskNumber.getOrDefault(originalCapacity.getKey(), 0); | ||
| if (!Objects.equals(originalCapacity.getValue(), capacity)) { | ||
| throw new IllegalStateException("There are " + originalCapacity.getValue() + " tasks assigned to" | ||
| + " client " + originalCapacity.getKey() + " before assignment, but " + capacity + " tasks " | ||
| + " are assigned to it after assignment"); | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo one line below:
D[e]scribe