Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public RemoteStorageInfo(String path, String confString) {
if (!ArrayUtils.isEmpty(items)) {
for (String item : items) {
String[] kv = item.split(Constants.EQUAL_SPLIT_CHAR);
if (kv != null && kv.length == 2) {
if (kv.length == 2) {
this.confItems.put(kv[0], kv[1]);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.uniffle.coordinator;

import java.io.IOException;
import java.util.Map;

import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.uniffle.common.exception.RssException;

import static org.apache.uniffle.coordinator.LowestIOSampleCostSelectStorageStrategy.RankValue;

/**
 * Base class for remote-storage selection strategies. Provides a read/write probe
 * that checks whether a remote storage path is reachable and returns the exact
 * data that was written to it.
 */
public abstract class AbstractSelectStorageStrategy implements SelectStorageStrategy {
  /**
   * store remote path -> application count for assignment strategy
   */
  protected final Map<String, RankValue> remoteStoragePathRankValue;
  // Size in bytes of the probe file written and read back by readAndWriteHdfsStorage.
  protected final int fileSize;

  public AbstractSelectStorageStrategy(
      Map<String, RankValue> remoteStoragePathRankValue,
      CoordinatorConf conf) {
    this.remoteStoragePathRankValue = remoteStoragePathRankValue;
    fileSize = conf.getInteger(CoordinatorConf.COORDINATOR_REMOTE_STORAGE_SCHEDULE_FILE_SIZE);
  }

  /**
   * Writes {@code fileSize} random bytes to {@code testPath}, then reads them back
   * and verifies the content is identical. On a mismatch the path's rank entry is
   * replaced with one whose cost time is {@code Long.MAX_VALUE} (keeping its current
   * app count) and an {@link RssException} is thrown. Deleting the probe file is the
   * caller's responsibility.
   *
   * @param fs        filesystem hosting the probe file
   * @param testPath  path of the temporary probe file
   * @param uri       remote storage uri, used as the key in {@link #remoteStoragePathRankValue}
   * @param rankValue rank entry whose app count is preserved when the probe fails
   * @throws IOException  if the filesystem write or read fails
   * @throws RssException if the data read back differs from what was written
   */
  public void readAndWriteHdfsStorage(FileSystem fs, Path testPath,
      String uri, RankValue rankValue) throws IOException {
    byte[] data = RandomUtils.nextBytes(fileSize);
    try (FSDataOutputStream fos = fs.create(testPath)) {
      fos.write(data);
      fos.flush();
    }
    byte[] readData = new byte[fileSize];
    int readBytes;
    try (FSDataInputStream fis = fs.open(testPath)) {
      int hasReadBytes = 0;
      do {
        readBytes = fis.read(readData);
        // Compare only bytes inside the originally written range: bounding the index by
        // fileSize prevents an ArrayIndexOutOfBoundsException if the file on storage is
        // unexpectedly longer than the data we wrote.
        for (int i = 0; i < readBytes && hasReadBytes + i < fileSize; i++) {
          if (data[hasReadBytes + i] != readData[i]) {
            remoteStoragePathRankValue.put(uri, new RankValue(Long.MAX_VALUE, rankValue.getAppNum().get()));
            throw new RssException("The content of reading and writing is inconsistent.");
          }
        }
        // InputStream.read returns -1 at EOF; guard so the -1 sentinel does not
        // corrupt the running byte counter on the final loop iteration.
        if (readBytes > 0) {
          hasReadBytes += readBytes;
        }
      } while (readBytes != -1);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,124 +21,114 @@
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.uniffle.common.RemoteStorageInfo;
import org.apache.uniffle.common.filesystem.HadoopFilesystemProvider;
import org.apache.uniffle.coordinator.LowestIOSampleCostSelectStorageStrategy.RankValue;

/**
* AppBalanceSelectStorageStrategy will consider the number of apps allocated on each remote path is balanced.
*/
public class AppBalanceSelectStorageStrategy implements SelectStorageStrategy {
public class AppBalanceSelectStorageStrategy extends AbstractSelectStorageStrategy {

private static final Logger LOG = LoggerFactory.getLogger(AppBalanceSelectStorageStrategy.class);
/**
* store appId -> remote path to make sure all shuffle data of the same application
* will be written to the same remote storage
*/
private final Map<String, RemoteStorageInfo> appIdToRemoteStorageInfo;
/**
* store remote path -> application count for assignment strategy
*/
private final Map<String, RankValue> remoteStoragePathCounter;
private final Map<String, RemoteStorageInfo> appIdToRemoteStorageInfo;
private final Map<String, RemoteStorageInfo> availableRemoteStorageInfo;

public AppBalanceSelectStorageStrategy() {
this.appIdToRemoteStorageInfo = Maps.newConcurrentMap();
this.remoteStoragePathCounter = Maps.newConcurrentMap();
this.availableRemoteStorageInfo = Maps.newHashMap();
private final Configuration hdfsConf;
private List<Map.Entry<String, RankValue>> uris;

public AppBalanceSelectStorageStrategy(
Map<String, RankValue> remoteStoragePathRankValue,
Map<String, RemoteStorageInfo> appIdToRemoteStorageInfo,
Map<String, RemoteStorageInfo> availableRemoteStorageInfo,
CoordinatorConf conf) {
super(remoteStoragePathRankValue, conf);
this.appIdToRemoteStorageInfo = appIdToRemoteStorageInfo;
this.availableRemoteStorageInfo = availableRemoteStorageInfo;
this.hdfsConf = new Configuration();
}

/**
* the strategy of pick remote storage is according to assignment count
*/
@Override
public RemoteStorageInfo pickRemoteStorage(String appId) {
if (appIdToRemoteStorageInfo.containsKey(appId)) {
return appIdToRemoteStorageInfo.get(appId);
}

// create list for sort
List<Map.Entry<String, RankValue>> sizeList =
Lists.newArrayList(remoteStoragePathCounter.entrySet()).stream().filter(Objects::nonNull)
.sorted(Comparator.comparingInt(entry -> entry.getValue().getAppNum().get())).collect(Collectors.toList());

for (Map.Entry<String, RankValue> entry : sizeList) {
String storagePath = entry.getKey();
if (availableRemoteStorageInfo.containsKey(storagePath)) {
appIdToRemoteStorageInfo.putIfAbsent(appId, availableRemoteStorageInfo.get(storagePath));
incRemoteStorageCounter(storagePath);
break;
}
}
return appIdToRemoteStorageInfo.get(appId);
}

@Override
@VisibleForTesting
public synchronized void incRemoteStorageCounter(String remoteStoragePath) {
RankValue counter = remoteStoragePathCounter.get(remoteStoragePath);
if (counter != null) {
counter.getAppNum().incrementAndGet();
} else {
// it may be happened when assignment remote storage
// and refresh remote storage at the same time
LOG.warn("Remote storage path lost during assignment: {} doesn't exist, reset it to 1",
remoteStoragePath);
remoteStoragePathCounter.put(remoteStoragePath, new RankValue(1));
public void sortPathByRankValue(String path, String test) {
RankValue rankValue = remoteStoragePathRankValue.get(path);
try {
FileSystem fs = HadoopFilesystemProvider.getFilesystem(new Path(path), hdfsConf);
fs.delete(new Path(test),true);
if (rankValue.getHealthy().get()) {
rankValue.setCostTime(new AtomicLong(0));
}
} catch (Exception e) {
rankValue.setCostTime(new AtomicLong(Long.MAX_VALUE));
LOG.error("Failed to sort, we will not use this remote path {}.", path, e);
}
uris = Lists.newCopyOnWriteArrayList(remoteStoragePathRankValue.entrySet()).stream()
.filter(Objects::nonNull).collect(Collectors.toList());
}

@Override
@VisibleForTesting
public synchronized void decRemoteStorageCounter(String storagePath) {
if (!StringUtils.isEmpty(storagePath)) {
RankValue atomic = remoteStoragePathCounter.get(storagePath);
if (atomic != null) {
double count = atomic.getAppNum().decrementAndGet();
if (count < 0) {
LOG.warn("Unexpected counter for remote storage: {}, which is {}, reset to 0",
storagePath, count);
atomic.getAppNum().set(0);
public void detectStorage() {
uris = Lists.newCopyOnWriteArrayList(remoteStoragePathRankValue.entrySet());
if (remoteStoragePathRankValue.size() > 1) {
for (Map.Entry<String, RankValue> uri : uris) {
if (uri.getKey().startsWith(ApplicationManager.REMOTE_PATH_SCHEMA.get(0))) {
RankValue rankValue = remoteStoragePathRankValue.get(uri.getKey());
rankValue.setHealthy(new AtomicBoolean(true));
Path remotePath = new Path(uri.getKey());
String rssTest = uri.getKey() + "/rssTest";
Path testPath = new Path(rssTest);
try {
FileSystem fs = HadoopFilesystemProvider.getFilesystem(remotePath, hdfsConf);
readAndWriteHdfsStorage(fs, testPath, uri.getKey(), rankValue);
} catch (Exception e) {
rankValue.setHealthy(new AtomicBoolean(false));
LOG.error("Storage read and write error, we will not use this remote path {}.", uri, e);
} finally {
sortPathByRankValue(uri.getKey(), rssTest);
}
}
} else {
LOG.warn("Can't find counter for remote storage: {}", storagePath);
remoteStoragePathCounter.putIfAbsent(storagePath, new RankValue(0));
}
if (remoteStoragePathCounter.get(storagePath).getAppNum().get() == 0
&& !availableRemoteStorageInfo.containsKey(storagePath)) {
remoteStoragePathCounter.remove(storagePath);
}
}
}

/**
* When choosing the AppBalance strategy, each time you select a path,
* you should know the number of the latest apps in different paths
*/
@Override
public synchronized void removePathFromCounter(String storagePath) {
RankValue atomic = remoteStoragePathCounter.get(storagePath);
if (atomic != null && atomic.getAppNum().get() == 0) {
remoteStoragePathCounter.remove(storagePath);
public synchronized RemoteStorageInfo pickStorage(String appId) {
boolean isUnhealthy =
uris.stream().noneMatch(rv -> rv.getValue().getCostTime().get() != Long.MAX_VALUE);
if (!isUnhealthy) {
// If there is only one unhealthy path, then filter that path
uris = uris.stream().filter(rv -> rv.getValue().getCostTime().get() != Long.MAX_VALUE).sorted(
Comparator.comparingInt(entry -> entry.getValue().getAppNum().get())).collect(Collectors.toList());
} else {
// If all paths are unhealthy, assign paths according to the number of apps
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add some logs and metrics when all paths are unhealthy?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will add.

uris = uris.stream().sorted(Comparator.comparingInt(
entry -> entry.getValue().getAppNum().get())).collect(Collectors.toList());
}
}

@Override
public Map<String, RemoteStorageInfo> getAppIdToRemoteStorageInfo() {
return appIdToRemoteStorageInfo;
}

@Override
public Map<String, RankValue> getRemoteStoragePathRankValue() {
return remoteStoragePathCounter;
}

@Override
public Map<String, RemoteStorageInfo> getAvailableRemoteStorageInfo() {
return availableRemoteStorageInfo;
LOG.info("The sorted remote path list is: {}", uris);
for (Map.Entry<String, RankValue> entry : uris) {
String storagePath = entry.getKey();
if (availableRemoteStorageInfo.containsKey(storagePath)) {
return appIdToRemoteStorageInfo.computeIfAbsent(appId, x -> availableRemoteStorageInfo.get(storagePath));
}
}
LOG.warn("No remote storage is available, we will default to the first.");
return availableRemoteStorageInfo.values().iterator().next();
}
}
Loading