Showing 366 changed files with 109,008 additions and 7,479 deletions.

@@ -0,0 +1,55 @@
/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/runtime/cudaEvent.h"
#include <atomic>
#include <condition_variable>
#include <memory> // for std::unique_ptr
#include <mutex>
#include <vector>

namespace tensorrt_llm::batch_manager
{

// Tracks per-layer progress of the context (prefill) phase in disaggregated serving.
class ContextProgress
{
public:
    ContextProgress(int numLayers);

    // Record the CUDA event for layerIdx on the given stream and wake any waiters.
    void recordEvent(int layerIdx, cudaStream_t stream);

    // Block until the event for layerIdx has been recorded.
    void wait(int layerIdx);

    int getNumLayers() const
    {
        return mCudaEvents.size();
    }

    cudaEvent_t getEvent(int layerIdx)
    {
        return mCudaEvents.at(layerIdx).get();
    }

private:
    std::mutex mMutex;
    std::condition_variable mConditionVariable;
    std::unique_ptr<std::atomic_bool[]> mCudaEventsRecorded;
    std::vector<runtime::CudaEvent> mCudaEvents;
};

} // namespace tensorrt_llm::batch_manager
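
Only the synchronization surface is declared here. As a rough sketch of how the constructor, recordEvent, and wait might fit together on top of the declared members (illustrative only: it assumes runtime::CudaEvent is default-constructible and exposes a record(stream) wrapper around cudaEventRecord, neither of which is shown in this commit; error checking is omitted):

// contextProgress.cpp -- illustrative sketch, not the repository source
ContextProgress::ContextProgress(int numLayers)
    : mCudaEventsRecorded(std::make_unique<std::atomic_bool[]>(numLayers))
    , mCudaEvents(numLayers) // assumes runtime::CudaEvent is default-constructible
{
    for (int i = 0; i < numLayers; ++i)
    {
        mCudaEventsRecorded[i] = false;
    }
}

void ContextProgress::recordEvent(int layerIdx, cudaStream_t stream)
{
    // Enqueue the event on the stream, then publish it to waiting threads.
    mCudaEvents.at(layerIdx).record(stream); // assumed wrapper around cudaEventRecord
    {
        std::lock_guard<std::mutex> lock(mMutex);
        mCudaEventsRecorded[layerIdx] = true;
    }
    mConditionVariable.notify_all();
}

void ContextProgress::wait(int layerIdx)
{
    // Wait for the host side to record the event, then for the GPU work behind it.
    std::unique_lock<std::mutex> lock(mMutex);
    mConditionVariable.wait(lock, [&] { return mCudaEventsRecorded[layerIdx].load(); });
    lock.unlock();
    cudaEventSynchronize(getEvent(layerIdx));
}

Under this reading, the context executor would call recordEvent per layer as that layer's KV cache is produced, while the thread streaming the KV cache to the generation instance calls wait(layerIdx) before reading the layer, overlapping the transfer with the rest of the forward pass.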
164 changes: 164 additions & 0 deletions
cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
@@ -0,0 +1,164 @@
/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <memory> // for std::shared_ptr / std::unique_ptr
#include <optional>
#include <vector>

namespace tensorrt_llm::runtime
{
class TllmRuntime;
} // namespace tensorrt_llm::runtime

namespace tensorrt_llm::batch_manager
{

// Posts non-blocking MPI sends of one decoder step's outputs to a peer rank.
class DecoderStepAsyncSend
{
public:
    using BufferPtr = runtime::IBuffer::SharedPtr;

    DecoderStepAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, BufferPtr const& newOutputTokensHost,
        BufferPtr const& finished, BufferPtr const& sequenceLengthsHost, BufferPtr const& cumLogProbsHost,
        BufferPtr const& logProbsHost, BufferPtr const& cacheIndirectionOutput, BufferPtr const& acceptedCumSum,
        BufferPtr const& packedPaths, BufferPtr const& finishReasonsHost, int peer);

    ~DecoderStepAsyncSend();

private:
    // One request per buffer passed to the constructor.
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
    std::shared_ptr<mpi::MpiRequest> mRequest5;
    std::shared_ptr<mpi::MpiRequest> mRequest6;
    std::shared_ptr<mpi::MpiRequest> mRequest7;
    std::shared_ptr<mpi::MpiRequest> mRequest8;
    std::shared_ptr<mpi::MpiRequest> mRequest9;
};

// Posts non-blocking MPI sends of a single batch slot's decoded outputs to a peer rank.
class DecoderSlotAsyncSend
{
public:
    using TensorPtr = runtime::ITensor::SharedPtr;

    DecoderSlotAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, TensorPtr const& outputIdsView,
        TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView, TensorPtr const& logProbsView,
        bool returnLogProbs, int peer);

    ~DecoderSlotAsyncSend();

private:
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
};

// Decoder input/output buffers covering all in-flight requests, including
// draft-token buffers for speculative decoding.
class DecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    std::vector<TensorPtr> logits;
    TensorPtr slotOutputIds;     // [mMaxNumRequests, beamWidth, maxSeqLen], outputIds of all batch slots
    TensorPtr slotOutputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr cacheIndirectionInput;
    TensorPtr cacheIndirectionOutput;
    TensorPtr sequenceLengths;     // [mMaxNumRequests]
    TensorPtr sequenceLengthsHost; // [mMaxNumRequests] pinned host tensor
    TensorPtr finished;            // [mMaxNumRequests] pinned host tensor
    TensorPtr newOutputTokens;     // [maxTokensPerStep, mMaxNumRequests, beamWidth]
    TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
    TensorPtr cumLogProbs;         // [mMaxNumRequests, beamWidth]
    TensorPtr cumLogProbsHost;     // [mMaxNumRequests, beamWidth]
    TensorPtr logProbs;            // [mMaxNumRequests, beamWidth, maxSeqLen]
    TensorPtr logProbsHost;        // [mMaxNumRequests, beamWidth, maxSeqLen]
    TensorPtr finishReasonsHost;   // [mMaxNumRequests, beamWidth]

    class DraftBuffers
    {
    public:
        TensorPtr nextDraftTokensDevice;        // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr nextDraftTokensHost;          // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr prevDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr prevDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr acceptedLengthsCumSumDevice;  // [mMaxNumRequests+1]
        TensorPtr acceptedPackedPathsDevice;    // [mMaxNumRequests * maxAcceptedTokens]
        std::vector<std::vector<runtime::ITensor::SharedPtr>>
            predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]

        void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
            runtime::ModelConfig const& modelConfig);
    };

    DraftBuffers draftBuffers;
    runtime::ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers;
    runtime::EagleBuffers::Inputs eagleBuffers;
    std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;

    DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);

    std::unique_ptr<DecoderStepAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
        bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, int peer);

    void recv(std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, SizeType32 maxBeamWidth,
        bool useMedusa, int peer);
};

// Decoder buffers scoped to a single batch slot.
class SlotDecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    TensorPtr outputIds;           // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr outputIdsHost;       // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr sequenceLengthsHost; // [beamWidth]
    TensorPtr cumLogProbs;         // [beamWidth]
    TensorPtr cumLogProbsHost;     // [beamWidth]
    TensorPtr logProbs;            // [beamWidth, maxSeqLen]
    TensorPtr logProbsHost;        // [beamWidth, maxSeqLen]
    TensorPtr finishReasonsHost;   // [beamWidth]

    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::TllmRuntime const& runtime);

    static std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
        TensorPtr const& outputIdsView, TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView,
        TensorPtr const& logProbsView, bool returnLogProbs, int peer);

    std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
        TensorPtr const& sequenceLengthView, bool returnLogProbs, int peer);

    void recv(std::shared_ptr<mpi::MpiComm> const& commSession, TensorPtr const& sequenceLengthView,
        bool returnLogProbs, int peer);
};

} // namespace tensorrt_llm::batch_manager
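
The header does not show how the *AsyncSend classes use their stored mpi::MpiRequest handles, but their shape suggests a common RAII idiom for non-blocking MPI: post one send per buffer in the constructor and wait for all of them in the destructor, so the buffers remain valid until the sends complete. Below is a generic sketch of that idiom against the raw MPI C API; AsyncSender and its argument list are invented for illustration, and the real classes go through the mpi::MpiComm / mpi::MpiRequest wrappers instead.

#include <mpi.h>
#include <cstddef>
#include <utility>
#include <vector>

// Illustrative RAII wrapper, not the TensorRT-LLM implementation.
class AsyncSender
{
public:
    // Each entry pairs a buffer pointer with its size in bytes.
    AsyncSender(MPI_Comm comm, std::vector<std::pair<void const*, int>> const& buffers, int peer, int tagBase)
    {
        mRequests.reserve(buffers.size());
        for (std::size_t i = 0; i < buffers.size(); ++i)
        {
            // Post a non-blocking send; the call returns immediately.
            MPI_Request req;
            MPI_Isend(buffers[i].first, buffers[i].second, MPI_BYTE, peer,
                tagBase + static_cast<int>(i), comm, &req);
            mRequests.push_back(req);
        }
    }

    ~AsyncSender()
    {
        // Block until every posted send has completed.
        MPI_Waitall(static_cast<int>(mRequests.size()), mRequests.data(), MPI_STATUSES_IGNORE);
    }

private:
    std::vector<MPI_Request> mRequests;
};

Under this idiom, destroying the object returned by DecoderBuffers::asyncSend at the end of a decoder step would act as the synchronization point; the nine mRequest members of DecoderStepAsyncSend would then line up with the nine buffers passed to its constructor.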