@@ -338,6 +338,12 @@ public class ConfigOptions {
+ " (“50100,50101”), ranges (“50100-50200”) or a combination of both."
+ "This option is deprecated. Please use bind.listeners instead, which provides a more flexible configuration for multiple ports");

public static final ConfigOption<Integer> COORDINATOR_ID =
key("coordinator.id")
.intType()
.noDefaultValue()
.withDescription("The id for the coordinator server.");

/**
* @deprecated This option is deprecated. Please use {@link ConfigOptions#SERVER_IO_POOL_SIZE}
* instead.
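As a quick illustration of the new option, a hedged sketch of how a deployment might set coordinator.id and how server code could read it back; the Configuration accessors below are assumed to follow the usual ConfigOption pattern and are not part of this diff:

// Hypothetical usage only; the Configuration API calls are assumptions, not from this PR.
// server.yaml (assumed):
//   coordinator.id: 1
Configuration conf = new Configuration();
conf.set(ConfigOptions.COORDINATOR_ID, 1);
int coordinatorId = conf.get(ConfigOptions.COORDINATOR_ID);   // -> 1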
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.fluss.exception;

/** Exception thrown when the Coordinator leader epoch is invalid. */
public class CoordinatorEpochFencedException extends RuntimeException {
public CoordinatorEpochFencedException(String message) {
super(message);
}
}
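A hedged sketch of the kind of guard this exception is meant for, rejecting work that carries a stale coordinator epoch; the helper below is illustrative and not part of this change:

// Illustrative guard; only CoordinatorEpochFencedException comes from this PR.
static void validateCoordinatorEpoch(int requestEpoch, int currentEpoch) {
    if (requestEpoch < currentEpoch) {
        throw new CoordinatorEpochFencedException(
                "Coordinator epoch " + requestEpoch
                        + " is older than the current epoch " + currentEpoch + ".");
    }
}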
@@ -36,6 +36,7 @@ public class MetricNames {
// metrics for coordinator server
// --------------------------------------------------------------------------------------------
public static final String ACTIVE_COORDINATOR_COUNT = "activeCoordinatorCount";
public static final String ALIVE_COORDINATOR_COUNT = "aliveCoordinatorCount";
public static final String ACTIVE_TABLET_SERVER_COUNT = "activeTabletServerCount";
public static final String OFFLINE_BUCKET_COUNT = "offlineBucketCount";
public static final String TABLE_COUNT = "tableCount";
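A hedged sketch of how the new aliveCoordinatorCount name could back a gauge reporting the size of the live-coordinator set; the Gauge interface and registration call below are stand-ins, not the actual Fluss metrics API:

// Stand-in gauge type for illustration; the real Fluss metric group API may differ.
interface Gauge<T> { T value(); }

Gauge<Integer> aliveCoordinatorCount =
        () -> coordinatorContext.getLiveCoordinatorServers().size();
// e.g. metricGroup.gauge(MetricNames.ALIVE_COORDINATOR_COUNT, aliveCoordinatorCount);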
@@ -55,6 +55,7 @@ public class CoordinatorContext {
private static final Logger LOG = LoggerFactory.getLogger(CoordinatorContext.class);

public static final int INITIAL_COORDINATOR_EPOCH = 0;
public static final int INITIAL_COORDINATOR_EPOCH_ZK_VERSION = 0;

// for simplicity, we just use retry times; may consider making it a configurable value
// and combining retry times with a retry delay
@@ -67,6 +68,7 @@ public class CoordinatorContext {
// a success deletion.
private final Map<TableBucketReplica, Integer> failDeleteNumbers = new HashMap<>();

private final Set<Integer> liveCoordinatorServers = new HashSet<>();
private final Map<Integer, ServerInfo> liveTabletServers = new HashMap<>();
private final Set<Integer> shuttingDownTabletServers = new HashSet<>();

@@ -108,13 +110,40 @@ public class CoordinatorContext {

private ServerInfo coordinatorServerInfo = null;
private int coordinatorEpoch = INITIAL_COORDINATOR_EPOCH;
private int coordinatorEpochZkVersion = INITIAL_COORDINATOR_EPOCH_ZK_VERSION;

public CoordinatorContext() {}

public int getCoordinatorEpoch() {
return coordinatorEpoch;
}

public int getCoordinatorEpochZkVersion() {
return coordinatorEpochZkVersion;
}

public void setCoordinatorEpochAndZkVersion(int newEpoch, int newZkVersion) {
this.coordinatorEpoch = newEpoch;
this.coordinatorEpochZkVersion = newZkVersion;
}

public Set<Integer> getLiveCoordinatorServers() {
return liveCoordinatorServers;
}

public void setLiveCoordinatorServers(Set<Integer> servers) {
liveCoordinatorServers.clear();
liveCoordinatorServers.addAll(servers);
}

public void addLiveCoordinatorServer(int serverId) {
this.liveCoordinatorServers.add(serverId);
}

public void removeLiveCoordinatorServer(int serverId) {
this.liveCoordinatorServers.remove(serverId);
}

public Map<Integer, ServerInfo> getLiveTabletServers() {
return liveTabletServers;
}
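The new bookkeeping can be exercised directly with the methods added above; a minimal sketch:

CoordinatorContext context = new CoordinatorContext();

// Bulk-load the coordinator servers discovered in ZooKeeper at startup.
context.setLiveCoordinatorServers(new HashSet<>(Arrays.asList(1, 2, 3)));

// Incremental updates driven by watcher events.
context.addLiveCoordinatorServer(4);
context.removeLiveCoordinatorServer(2);
// context.getLiveCoordinatorServers() is now {1, 3, 4}

// Record the epoch and the ZK version of the epoch node after winning the election.
context.setCoordinatorEpochAndZkVersion(5, 7);
int epoch = context.getCoordinatorEpoch();              // 5
int zkVersion = context.getCoordinatorEpochZkVersion(); // 7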
@@ -63,18 +63,21 @@
import org.apache.fluss.server.coordinator.event.CoordinatorEventManager;
import org.apache.fluss.server.coordinator.event.CreatePartitionEvent;
import org.apache.fluss.server.coordinator.event.CreateTableEvent;
import org.apache.fluss.server.coordinator.event.DeadCoordinatorServerEvent;
import org.apache.fluss.server.coordinator.event.DeadTabletServerEvent;
import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
import org.apache.fluss.server.coordinator.event.DropPartitionEvent;
import org.apache.fluss.server.coordinator.event.DropTableEvent;
import org.apache.fluss.server.coordinator.event.EventProcessor;
import org.apache.fluss.server.coordinator.event.FencedCoordinatorEvent;
import org.apache.fluss.server.coordinator.event.NewCoordinatorServerEvent;
import org.apache.fluss.server.coordinator.event.NewTabletServerEvent;
import org.apache.fluss.server.coordinator.event.NotifyKvSnapshotOffsetEvent;
import org.apache.fluss.server.coordinator.event.NotifyLakeTableOffsetEvent;
import org.apache.fluss.server.coordinator.event.NotifyLeaderAndIsrResponseReceivedEvent;
import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent;
import org.apache.fluss.server.coordinator.event.SchemaChangeEvent;
import org.apache.fluss.server.coordinator.event.watcher.CoordinatorServerChangeWatcher;
import org.apache.fluss.server.coordinator.event.watcher.TableChangeWatcher;
import org.apache.fluss.server.coordinator.event.watcher.TabletServerChangeWatcher;
import org.apache.fluss.server.coordinator.statemachine.ReplicaStateMachine;
@@ -112,6 +115,7 @@
import javax.annotation.concurrent.NotThreadSafe;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -151,6 +155,7 @@ public class CoordinatorEventProcessor implements EventProcessor {
private final LakeTableTieringManager lakeTableTieringManager;
private final TableChangeWatcher tableChangeWatcher;
private final CoordinatorChannelManager coordinatorChannelManager;
private final CoordinatorServerChangeWatcher coordinatorServerChangeWatcher;
private final TabletServerChangeWatcher tabletServerChangeWatcher;
private final CoordinatorMetadataCache serverMetadataCache;
private final CoordinatorRequestBatch coordinatorRequestBatch;
@@ -202,6 +207,8 @@ public CoordinatorEventProcessor(
tableBucketStateMachine,
new RemoteStorageCleaner(conf, ioExecutor),
ioExecutor);
this.coordinatorServerChangeWatcher =
new CoordinatorServerChangeWatcher(zooKeeperClient, coordinatorEventManager);
this.tableChangeWatcher = new TableChangeWatcher(zooKeeperClient, coordinatorEventManager);
this.tabletServerChangeWatcher =
new TabletServerChangeWatcher(zooKeeperClient, coordinatorEventManager);
@@ -230,6 +237,7 @@ public CoordinatorEventManager getCoordinatorEventManager() {
public void startup() {
coordinatorContext.setCoordinatorServerInfo(getCoordinatorServerInfo());
// start watchers first so that we won't miss nodes in zk;
coordinatorServerChangeWatcher.start();
tabletServerChangeWatcher.start();
tableChangeWatcher.start();
LOG.info("Initializing coordinator context.");
@@ -267,7 +275,7 @@ public void shutdown() {
private ServerInfo getCoordinatorServerInfo() {
try {
return zooKeeperClient
.getCoordinatorAddress()
.getCoordinatorLeaderAddress()
.map(
coordinatorAddress ->
// TODO we set id to 0 as that CoordinatorServer don't support
@@ -295,6 +303,12 @@ public int getCoordinatorEpoch() {

private void initCoordinatorContext() throws Exception {
long start = System.currentTimeMillis();
// get all coordinator servers
int[] currentCoordinatorServers = zooKeeperClient.getCoordinatorServerList();
coordinatorContext.setLiveCoordinatorServers(
Arrays.stream(currentCoordinatorServers).boxed().collect(Collectors.toSet()));
LOG.info("Load coordinator servers success when initializing coordinator context.");

// get all tablet servers
int[] currentServers = zooKeeperClient.getSortedTabletServerList();
List<ServerInfo> tabletServerInfos = new ArrayList<>();
@@ -509,6 +523,7 @@ private void onShutdown() {
tableManager.shutdown();

// then stop watchers
coordinatorServerChangeWatcher.stop();
tableChangeWatcher.stop();
tabletServerChangeWatcher.stop();
}
@@ -531,6 +546,10 @@ public void process(CoordinatorEvent event) {
(NotifyLeaderAndIsrResponseReceivedEvent) event);
} else if (event instanceof DeleteReplicaResponseReceivedEvent) {
processDeleteReplicaResponseReceived((DeleteReplicaResponseReceivedEvent) event);
} else if (event instanceof NewCoordinatorServerEvent) {
processNewCoordinatorServer((NewCoordinatorServerEvent) event);
} else if (event instanceof DeadCoordinatorServerEvent) {
processDeadCoordinatorServer((DeadCoordinatorServerEvent) event);
} else if (event instanceof NewTabletServerEvent) {
processNewTabletServer((NewTabletServerEvent) event);
} else if (event instanceof DeadTabletServerEvent) {
@@ -868,6 +887,29 @@ private void onReplicaBecomeOffline(Set<TableBucketReplica> offlineReplicas) {
replicaStateMachine.handleStateChanges(offlineReplicas, OfflineReplica);
}

private void processNewCoordinatorServer(NewCoordinatorServerEvent newCoordinatorServerEvent) {
int coordinatorServerId = newCoordinatorServerEvent.getServerId();
if (coordinatorContext.getLiveCoordinatorServers().contains(coordinatorServerId)) {
return;
}

// process new coordinator server
LOG.info("New coordinator server callback for coordinator server {}", coordinatorServerId);

coordinatorContext.addLiveCoordinatorServer(coordinatorServerId);
}

private void processDeadCoordinatorServer(
DeadCoordinatorServerEvent deadCoordinatorServerEvent) {
int coordinatorServerId = deadCoordinatorServerEvent.getServerId();
if (!coordinatorContext.getLiveCoordinatorServers().contains(coordinatorServerId)) {
return;
}
// process dead coordinator server
LOG.info("Coordinator server failure callback for {}.", coordinatorServerId);
coordinatorContext.removeLiveCoordinatorServer(coordinatorServerId);
}

private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
// NOTE: we won't need to detect bounced tablet servers like Kafka as we won't
// miss the event of tablet server un-register and register again since we can
@@ -890,7 +932,7 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
// it may happen during coordinator server initiation, the watcher watch a new tablet
// server register event and put it to event manager, but after that, the coordinator
// server read
// all tablet server nodes registered which contain the tablet server a; in this case,
// all tablet server nodes registered which contain the tablet server; in this case,
// we can ignore it.
return;
}
@@ -1139,7 +1181,8 @@ private List<AdjustIsrResultForBucket> tryProcessAdjustIsr(
}

try {
zooKeeperClient.batchUpdateLeaderAndIsr(newLeaderAndIsrList);
zooKeeperClient.batchUpdateLeaderAndIsr(
newLeaderAndIsrList, coordinatorContext.getCoordinatorEpochZkVersion());
newLeaderAndIsrList.forEach(
(tableBucket, newLeaderAndIsr) ->
result.add(new AdjustIsrResultForBucket(tableBucket, newLeaderAndIsr)));
@@ -1150,7 +1193,10 @@
TableBucket tableBucket = entry.getKey();
LeaderAndIsr newLeaderAndIsr = entry.getValue();
try {
zooKeeperClient.updateLeaderAndIsr(tableBucket, newLeaderAndIsr);
zooKeeperClient.updateLeaderAndIsr(
tableBucket,
newLeaderAndIsr,
coordinatorContext.getCoordinatorEpochZkVersion());
} catch (Exception e) {
LOG.error("Error when register leader and isr.", e);
result.add(
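The coordinatorEpochZkVersion now threaded into the leader-and-ISR updates is what provides the fencing: a coordinator that lost leadership still holds an old version, so its conditional writes fail. A hedged sketch of the underlying pattern with raw (unshaded) Curator; the znode paths, serialized bytes, and error handling are placeholders, and the real logic lives inside ZooKeeperClient:

// Illustration of version-checked updates; not the actual ZooKeeperClient implementation.
// Assumes org.apache.curator.framework.api.transaction.CuratorOp and org.apache.zookeeper.KeeperException.
CuratorOp checkEpoch =
        curatorClient.transactionOp()
                .check()
                .withVersion(coordinatorEpochZkVersion)   // fails if a newer leader bumped the epoch
                .forPath("/coordinators/epoch");          // placeholder path
CuratorOp writeLeaderAndIsr =
        curatorClient.transactionOp()
                .setData()
                .forPath("/leader_and_isr/bucket-0", leaderAndIsrBytes);  // placeholder path and bytes
try {
    curatorClient.transaction().forOperations(checkEpoch, writeLeaderAndIsr);
} catch (KeeperException.BadVersionException e) {
    // Another coordinator has taken over; this one has been fenced.
    throw new CoordinatorEpochFencedException("Coordinator epoch zk version is stale.");
}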
@@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.fluss.server.coordinator;

import org.apache.fluss.exception.CoordinatorEpochFencedException;
import org.apache.fluss.server.zk.ZooKeeperClient;
import org.apache.fluss.server.zk.data.ZkData;
import org.apache.fluss.shaded.curator5.org.apache.curator.framework.recipes.leader.LeaderLatch;
import org.apache.fluss.shaded.curator5.org.apache.curator.framework.recipes.leader.LeaderLatchListener;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;

/** Used by coordinator servers. Coordinator servers listen on the ZK election node and elect a leader. */
public class CoordinatorLeaderElection implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(CoordinatorLeaderElection.class);

private final int serverId;
private final ZooKeeperClient zkClient;
private final CoordinatorContext coordinatorContext;
private final LeaderLatch leaderLatch;
private final AtomicBoolean isLeader = new AtomicBoolean(false);

public CoordinatorLeaderElection(
ZooKeeperClient zkClient, int serverId, CoordinatorContext coordinatorContext) {
this.serverId = serverId;
this.zkClient = zkClient;
this.coordinatorContext = coordinatorContext;
this.leaderLatch =
new LeaderLatch(
zkClient.getCuratorClient(),
ZkData.CoordinatorElectionZNode.path(),
String.valueOf(serverId));
}

public void startElectLeader(Runnable initLeaderServices) {
leaderLatch.addListener(
new LeaderLatchListener() {
@Override
public void isLeader() {
LOG.info("Coordinator server {} has become the leader.", serverId);
isLeader.set(true);
try {
// to avoid split-brain
Optional<Integer> optionalEpoch =
zkClient.fenceBecomeCoordinatorLeader(serverId);
if (optionalEpoch.isPresent()) {
coordinatorContext.setCoordinatorEpochAndZkVersion(
optionalEpoch.get(),
coordinatorContext.getCoordinatorEpochZkVersion() + 1);
initLeaderServices.run();
} else {
throw new CoordinatorEpochFencedException(
"Fenced to become coordinator leader.");
}
} catch (Exception e) {
relinquishLeadership();
throw new CoordinatorEpochFencedException(
"Fenced to become coordinator leader.");
}
}

@Override
public void notLeader() {
relinquishLeadership();
LOG.warn("Coordinator server {} has lost the leadership.", serverId);
isLeader.set(false);
}
});

try {
leaderLatch.start();
LOG.info("Coordinator server {} started leader election.", serverId);

// todo: Currently, we wait on the leader latch and do nothing until this server becomes leader.
// Later we can make it a hot-backup server that continuously synchronizes metadata from
// ZooKeeper, which saves time when initializing the context.
// leaderLatch.await();

} catch (Exception e) {
LOG.error("Failed to start LeaderLatch for server {}", serverId, e);
throw new RuntimeException("Leader election start failed", e);
}
}

@Override
public void close() {
LOG.info("Closing LeaderLatch for server {}.", serverId);
if (leaderLatch != null) {
try {
leaderLatch.close();
} catch (Exception e) {
LOG.error("Failed to close LeaderLatch for server {}.", serverId, e);
}
}
}

public boolean isLeader() {
return this.isLeader.get();
}

private void relinquishLeadership() {
isLeader.set(false);
LOG.info("Coordinator server {} has been fenced.", serverId);

this.close();
}
}
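A hedged sketch of how a coordinator server process might wire this class in at startup; the surrounding objects (zooKeeperClient, serverId, eventProcessor) are assumed to exist, and only the CoordinatorLeaderElection API itself comes from this diff:

// Hypothetical startup wiring; only CoordinatorLeaderElection's API is from this PR.
CoordinatorContext coordinatorContext = new CoordinatorContext();
CoordinatorLeaderElection leaderElection =
        new CoordinatorLeaderElection(zooKeeperClient, serverId, coordinatorContext);

// The Runnable only runs after this server wins the latch and successfully bumps the
// coordinator epoch in ZooKeeper (i.e. after it has fenced older leaders).
leaderElection.startElectLeader(() -> eventProcessor.startup());

// Cheap leadership check, e.g. before serving coordinator-only requests.
if (!leaderElection.isLeader()) {
    // stay in standby / redirect to the current leader (behaviour here is an assumption)
}

// On process shutdown:
leaderElection.close();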
@@ -674,6 +674,7 @@ private UpdateMetadataRequest buildUpdateMetadataRequest() {
// tablet servers.
return makeUpdateMetadataRequest(
coordinatorContext.getCoordinatorServerInfo(),
coordinatorContext.getCoordinatorEpoch(),
new HashSet<>(coordinatorContext.getLiveTabletServers().values()),
tableMetadataList,
partitionMetadataList);
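Carrying the coordinator epoch in UpdateMetadata lets tablet servers drop metadata that originates from a fenced, older coordinator; a hedged sketch of the receiving-side check (field and method names are illustrative, not the actual Fluss RPC handler):

// Hypothetical receiver-side guard; names are illustrative only.
private int lastSeenCoordinatorEpoch = 0;

void maybeApplyUpdateMetadata(int requestCoordinatorEpoch, Runnable applyMetadata) {
    if (requestCoordinatorEpoch < lastSeenCoordinatorEpoch) {
        LOG.info(
                "Ignoring UpdateMetadata with stale coordinator epoch {} (current {}).",
                requestCoordinatorEpoch,
                lastSeenCoordinatorEpoch);
        return;
    }
    lastSeenCoordinatorEpoch = requestCoordinatorEpoch;
    applyMetadata.run();
}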