Skip to content

Commit

Permalink
feature: report multi-hop fabric connections
Browse files Browse the repository at this point in the history
In addition to physical connections, we should also report multi-hop
logical connections (MDFI + XeLink), as they have positive bandwidth.

Use a BFS algorithm to try to find a path between fabric vertices that
are not directly connected. The BFS is modified to explore MDFI links
before XeLink links, because the KMD always tries to use the MDFI link
first and then goes to XeLink.

Multi-hop connections are bi-directional but might not be symmetric, so
for every pair of vertices A & B that are not directly connected, we
need to try to find both `A -> B` and `B -> A`.

Related-To: GSD-7126

Signed-off-by: Wenbin Lu <wenbin.lu@intel.com>
  • Loading branch information
lyu authored and Compute-Runtime-Automation committed Mar 6, 2024
1 parent a04c67e commit a0faad6
Show file tree
Hide file tree
Showing 15 changed files with 1,623 additions and 118 deletions.
2 changes: 1 addition & 1 deletion level_zero/core/source/device/device_imp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ void DeviceImp::getP2PPropertiesDirectFabricConnection(DeviceImp *peerDeviceImp,
ze_fabric_edge_exp_properties_t edgeProperties{};
fabricEdge->getProperties(&edgeProperties);

if (strcmp(edgeProperties.model, "XeLink") == 0) {
if (strstr(edgeProperties.model, "XeLink") != nullptr) {
bandwidthPropertiesDesc->logicalBandwidth = edgeProperties.bandwidth;
bandwidthPropertiesDesc->physicalBandwidth = edgeProperties.bandwidth;
bandwidthPropertiesDesc->bandwidthUnit = edgeProperties.bandwidthUnit;
Expand Down
26 changes: 21 additions & 5 deletions level_zero/core/source/driver/driver_handle_imp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,11 @@ DriverHandleImp::~DriverHandleImp() {
}
this->fabricEdges.clear();

for (auto &edge : this->fabricIndirectEdges) {
delete edge;
}
this->fabricIndirectEdges.clear();

if (this->svmAllocsManager) {
this->svmAllocsManager->trimUSMDeviceAllocCache();
delete this->svmAllocsManager;
Expand Down Expand Up @@ -903,7 +908,7 @@ void DriverHandleImp::initializeVertexes() {
this->fabricVertices.push_back(fabricVertex);
}

FabricEdge::createEdgesFromVertices(this->fabricVertices, this->fabricEdges);
FabricEdge::createEdgesFromVertices(this->fabricVertices, this->fabricEdges, this->fabricIndirectEdges);
}

ze_result_t DriverHandleImp::fabricVertexGetExp(uint32_t *pCount, ze_fabric_vertex_handle_t *phVertices) {
Expand Down Expand Up @@ -957,17 +962,20 @@ ze_result_t DriverHandleImp::fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA
bool updateEdges = false;

if (*pCount == 0) {
maxEdges = static_cast<uint32_t>(fabricEdges.size());
maxEdges = static_cast<uint32_t>(fabricEdges.size() + fabricIndirectEdges.size());
} else {
maxEdges = std::min<uint32_t>(*pCount, static_cast<uint32_t>(fabricEdges.size()));
maxEdges = std::min<uint32_t>(*pCount, static_cast<uint32_t>(fabricEdges.size() + fabricIndirectEdges.size()));
}

if (phEdges != nullptr) {
updateEdges = true;
}

for (const auto &edge : fabricEdges) {
// Fabric Connections are bi-directional
if (edgeUpdateIndex >= maxEdges) {
break;
}
// Direct physical fabric connections are bi-directional
if ((edge->vertexA == queryVertexA && edge->vertexB == queryVertexB) ||
(edge->vertexA == queryVertexB && edge->vertexB == queryVertexA)) {

Expand All @@ -976,11 +984,19 @@ ze_result_t DriverHandleImp::fabricEdgeGetExp(ze_fabric_vertex_handle_t hVertexA
}
++edgeUpdateIndex;
}
}

// Stop if the edges overflow the count
for (const auto &edge : fabricIndirectEdges) {
if (edgeUpdateIndex >= maxEdges) {
break;
}
// Logical multi-hop edges might not be symmetric
if (edge->vertexA == queryVertexA && edge->vertexB == queryVertexB) {
if (updateEdges == true) {
phEdges[edgeUpdateIndex] = edge->toHandle();
}
++edgeUpdateIndex;
}
}

*pCount = edgeUpdateIndex;
Expand Down
1 change: 1 addition & 0 deletions level_zero/core/source/driver/driver_handle_imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ struct DriverHandleImp : public DriverHandle {
std::vector<Device *> devices;
std::vector<FabricVertex *> fabricVertices;
std::vector<FabricEdge *> fabricEdges;
std::vector<FabricEdge *> fabricIndirectEdges;

std::mutex rtasLock;

Expand Down
29 changes: 1 addition & 28 deletions level_zero/core/source/fabric/fabric.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2023 Intel Corporation
* Copyright (C) 2022-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
Expand Down Expand Up @@ -121,31 +121,4 @@ FabricEdge *FabricEdge::create(FabricVertex *vertexA, FabricVertex *vertexB, ze_
return edge;
}

// Appends to `edges` one FabricEdge for every direct connection found between any two
// entries of the flattened vertex list (each vertex followed by its sub-vertices).
// Every unordered pair is examined once, always querying the interfaces of the entry
// that comes first in the flattened list.
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges) {

    // Flatten the hierarchy: each vertex is immediately followed by its own sub-vertices.
    std::vector<FabricVertex *> flatVertices = {};
    for (auto &rootVertex : vertices) {
        flatVertices.push_back(rootVertex);
        for (auto &childVertex : rootVertex->subVertices) {
            flatVertices.push_back(childVertex);
        }
    }

    // Walk all pairs (first, second) where `first` precedes `second` in the flat list.
    for (uint32_t firstIndex = 0; firstIndex < flatVertices.size(); firstIndex++) {
        for (uint32_t secondIndex = firstIndex + 1; secondIndex < flatVertices.size(); secondIndex++) {
            // One property struct per pair, shared by all interface queries for that pair.
            ze_fabric_edge_exp_properties_t pairProperties = {};

            for (auto const &interfaceEntry : flatVertices[firstIndex]->pFabricDeviceInterfaces) {
                if (!interfaceEntry.second->getEdgeProperty(flatVertices[secondIndex], pairProperties)) {
                    continue;
                }
                // Each interface that reports a connection contributes its own edge.
                edges.push_back(create(flatVertices[firstIndex], flatVertices[secondIndex], pairProperties));
            }
        }
    }
}

} // namespace L0
4 changes: 2 additions & 2 deletions level_zero/core/source/fabric/fabric.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
Expand Down Expand Up @@ -46,7 +46,7 @@ struct FabricEdge : _ze_fabric_edge_handle_t {
public:
virtual ~FabricEdge() = default;

static void createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges);
static void createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &indirectEdges);
static FabricEdge *create(FabricVertex *vertexA, FabricVertex *vertexB, ze_fabric_edge_exp_properties_t &properties);
ze_result_t getProperties(ze_fabric_edge_exp_properties_t *pEdgeProperties) const {
*pEdgeProperties = properties;
Expand Down
4 changes: 3 additions & 1 deletion level_zero/core/source/fabric/linux/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2022-2023 Intel Corporation
# Copyright (C) 2022-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
Expand All @@ -10,13 +10,15 @@ if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.h
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.cpp
)
else()
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf_stub.h
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf_stub.cpp
)
Expand Down
149 changes: 149 additions & 0 deletions level_zero/core/source/fabric/linux/fabric.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#include "level_zero/core/source/fabric/fabric.h"

#include "shared/source/helpers/debug_helpers.h"

#include <algorithm>
#include <cstring>
#include <deque>
#include <limits>
#include <map>
#include <string>
#include <vector>

namespace L0 {

// Builds the fabric edge lists for every vertex and sub-vertex found in `vertices`.
//
// vertices      - top-level fabric vertices; each one's `subVertices` are included as well.
// edges         - output; receives one edge per fabric device interface of vertex A that
//                 reports a connection to vertex B (each pair with A ahead of B is checked once).
// indirectEdges - output; receives one directed edge A -> B for every ordered pair that has no
//                 direct edge but for which the search below finds a path over direct edges.
//
// Both output vectors are only appended to; every edge is produced by create().
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &indirectEdges) {

// Get all vertices and sub-vertices
// (flattened so that each vertex is immediately followed by its own sub-vertices;
// the positions in this list are the vertex indices used by the maps below)
std::vector<FabricVertex *> allVertices = {};
for (auto &fabricVertex : vertices) {
allVertices.push_back(fabricVertex);
for (auto &fabricSubVertex : fabricVertex->subVertices) {
allVertices.push_back(fabricSubVertex);
}
}

// Get direct physical edges between all vertices
// adjacentVerticesMap: vertex index -> list of (neighbour index, pointer to the properties of
// the direct edge joining them). The pointers refer to the `properties` member of the
// FabricEdge objects whose pointers are stored in `edges`.
std::map<uint32_t, std::vector<std::pair<uint32_t, ze_fabric_edge_exp_properties_t *>>> adjacentVerticesMap;
// nonAdjacentVerticesMap: vertex index -> indices of vertices it has no direct edge to.
std::map<uint32_t, std::vector<uint32_t>> nonAdjacentVerticesMap;
for (uint32_t vertexAIndex = 0; vertexAIndex < allVertices.size(); vertexAIndex++) {
for (uint32_t vertexBIndex = vertexAIndex + 1; vertexBIndex < allVertices.size(); vertexBIndex++) {
bool isAdjacent = false;
auto vertexA = allVertices[vertexAIndex];
auto vertexB = allVertices[vertexBIndex];
// Shared by all interface queries for this pair; presumably filled in by
// getEdgeProperty() when it reports a connection - TODO confirm.
ze_fabric_edge_exp_properties_t edgeProperty = {};

for (auto const &fabricDeviceInterface : vertexA->pFabricDeviceInterfaces) {
bool isConnected =
fabricDeviceInterface.second->getEdgeProperty(vertexB, edgeProperty);
if (isConnected) {
edges.push_back(create(vertexA, vertexB, edgeProperty));
// Record the new edge in both directions so the search can traverse it either way.
adjacentVerticesMap[vertexAIndex].emplace_back(vertexBIndex, &edges.back()->properties);
adjacentVerticesMap[vertexBIndex].emplace_back(vertexAIndex, &edges.back()->properties);
isAdjacent = true;
}
}
if (!isAdjacent) {
// Pairs where B is one of A's own sub-vertices are skipped: no multi-hop search is
// attempted between a vertex and its sub-vertex.
auto &subVerticesOfA = vertexA->subVertices;
if (std::find(subVerticesOfA.begin(), subVerticesOfA.end(), vertexB) == subVerticesOfA.end()) {
// Registered in both directions so that both A -> B and B -> A are searched below.
nonAdjacentVerticesMap[vertexAIndex].push_back(vertexBIndex);
nonAdjacentVerticesMap[vertexBIndex].push_back(vertexAIndex);
}
}
}
}

// Find logical multi-hop edges between vertices not directly connected
for (const auto &[vertexAIndex, nonAdjacentVertices] : nonAdjacentVerticesMap) {
for (auto vertexBIndex : nonAdjacentVertices) {
// visited: discovered vertex index -> index of the vertex it was discovered from.
// The start vertex maps to itself. Used afterwards to walk the path back from B to A.
std::map<uint32_t, uint32_t> visited;
visited[vertexAIndex] = vertexAIndex;

// toVisit holds the vertices of the current search level.
std::deque<uint32_t> toVisit;
toVisit.push_back(vertexAIndex);

uint32_t currVertexIndex = vertexAIndex;

// Each iteration of this loop processes one level of a breadth-first search from A.
while (true) {
// Vertices discovered on this level, split by the model of the edge that reached them.
std::deque<uint32_t> toVisitIaf, toVisitMdfi;
while (!toVisit.empty()) {
currVertexIndex = toVisit.front();
toVisit.pop_front();
if (currVertexIndex == vertexBIndex) {
// Target reached; currVertexIndex == vertexBIndex signals success below.
break;
}

for (auto [vertexIndex, edgeProperty] : adjacentVerticesMap[currVertexIndex]) {
if (visited.find(vertexIndex) == visited.end()) {
if (strncmp(edgeProperty->model, "XeLink", 7) == 0) {
toVisitIaf.push_back(vertexIndex);
} else {
// Any model other than "XeLink" is expected to be "MDFI".
DEBUG_BREAK_IF(strncmp(edgeProperty->model, "MDFI", 5) != 0);
toVisitMdfi.push_back(vertexIndex);
}
// Mark as discovered immediately and remember the parent for back-tracking.
visited[vertexIndex] = currVertexIndex;
}
}
}

if (currVertexIndex != vertexBIndex) {
if (toVisitIaf.size() + toVisitMdfi.size() != 0) {
// Next level: vertices reached over MDFI are queued ahead of those reached over XeLink.
// NOTE(review): presumably mirrors the KMD routing preference (MDFI first, then
// XeLink) - confirm.
toVisit = toVisitMdfi;
toVisit.insert(toVisit.end(), toVisitIaf.begin(), toVisitIaf.end());
} else {
// Nothing new was discovered: B cannot be reached from A, so no indirect edge is added.
break;
}
} else {
// A path exists: build the properties of the logical edge A -> B.
std::string path = "";
ze_fabric_edge_exp_properties_t properties = {};
properties.stype = ZE_STRUCTURE_TYPE_FABRIC_EDGE_EXP_PROPERTIES;
properties.pNext = nullptr;
memset(properties.uuid.id, 0, ZE_MAX_UUID_SIZE);
memset(properties.model, 0, ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE);
// Starts at the maximum so that the minimum over the XeLink hops can be taken below.
// NOTE(review): if the path contains no XeLink hop the bandwidth stays at the
// uint32_t maximum - confirm this cannot happen or is intended.
properties.bandwidth = std::numeric_limits<uint32_t>::max();
properties.bandwidthUnit = ZE_BANDWIDTH_UNIT_BYTES_PER_NANOSEC;
// Latency is not computed for logical edges: maximum value with an unknown unit.
properties.latency = std::numeric_limits<uint32_t>::max();
properties.latencyUnit = ZE_LATENCY_UNIT_UNKNOWN;
properties.duplexity = ZE_FABRIC_EDGE_EXP_DUPLEXITY_FULL_DUPLEX;

// Walk from B back to A through `visited`, one hop per iteration.
while (true) {
const auto parentIndex = visited[currVertexIndex];
// Use the first direct edge listed from the parent to the current vertex.
ze_fabric_edge_exp_properties_t *currEdgeProperty = nullptr;
for (const auto &[vertexIndex, edgeProperty] : adjacentVerticesMap[parentIndex]) {
if (vertexIndex == currVertexIndex) {
currEdgeProperty = edgeProperty;
break;
}
}
UNRECOVERABLE_IF(currEdgeProperty == nullptr);
// Hops are visited last-to-first, so prepending yields the models in A -> B order.
path = std::string(currEdgeProperty->model) + path;
// Only XeLink hops lower the reported bandwidth; other hops are ignored here.
if ((strncmp(currEdgeProperty->model, "XeLink", 7) == 0) &&
(currEdgeProperty->bandwidth < properties.bandwidth)) {
properties.bandwidth = currEdgeProperty->bandwidth;
}

currVertexIndex = parentIndex;
if (currVertexIndex == vertexAIndex) {
// Truncate or zero-pad to one character less than the model buffer; the last byte
// keeps the zero written by the memset above, so the model stays NUL-terminated.
path.resize(ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE - 1, '\0');
path.copy(properties.model, path.size());
break;
} else {
// More hops remain: separate the hop models with '-'.
path = '-' + path;
}
}
indirectEdges.push_back(create(allVertices[vertexAIndex], allVertices[vertexBIndex], properties));
break;
}
}
}
}
}

} // namespace L0
3 changes: 2 additions & 1 deletion level_zero/core/source/fabric/windows/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2022-2023 Intel Corporation
# Copyright (C) 2022-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
Expand All @@ -8,6 +8,7 @@ if(WIN32)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.h
${CMAKE_CURRENT_SOURCE_DIR}/fabric_device_iaf.cpp
)
Expand Down
43 changes: 43 additions & 0 deletions level_zero/core/source/fabric/windows/fabric.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#include "level_zero/core/source/fabric/fabric.h"

#include <vector>

namespace L0 {

// Collects the direct physical edges between all vertices and sub-vertices into `edges`.
// The third (unnamed) parameter, the indirect-edge output, is never written by this variant.
void FabricEdge::createEdgesFromVertices(const std::vector<FabricVertex *> &vertices, std::vector<FabricEdge *> &edges, std::vector<FabricEdge *> &) {

    // Flatten: every vertex is followed directly by its sub-vertices.
    std::vector<FabricVertex *> flattened = {};
    for (auto *parent : vertices) {
        flattened.push_back(parent);
        for (auto *child : parent->subVertices) {
            flattened.push_back(child);
        }
    }

    // Examine each pair once, with `source` always preceding `target` in the flattened list.
    for (auto sourceIt = flattened.begin(); sourceIt != flattened.end(); ++sourceIt) {
        for (auto targetIt = sourceIt + 1; targetIt != flattened.end(); ++targetIt) {
            auto source = *sourceIt;
            auto target = *targetIt;
            // One property struct per pair, reused across all interfaces of `source`.
            ze_fabric_edge_exp_properties_t linkProperties = {};

            for (auto const &interfaceEntry : source->pFabricDeviceInterfaces) {
                if (!interfaceEntry.second->getEdgeProperty(target, linkProperties)) {
                    continue;
                }
                // Every interface reporting a connection yields its own edge.
                edges.push_back(create(source, target, linkProperties));
            }
        }
    }
}

} // namespace L0
Loading

0 comments on commit a0faad6

Please sign in to comment.