Skip to content

Commit

Permalink
build: fix multiple definition issue (vectorch-ai#256)
Browse files Browse the repository at this point in the history
- [ ] @liutongxuan will help refactor the code to get rid of the duplications in
the following diffs
  • Loading branch information
guocuimi authored Jul 1, 2024
1 parent b3ff0a9 commit d711c55
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 110 deletions.
1 change: 1 addition & 0 deletions scalellm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pybind_extension(
csrc/sampling_params.cpp
csrc/output.cpp
csrc/llm_handler.cpp
csrc/vlm_handler.cpp
csrc/module.cpp
DEPS
:llm_handler
Expand Down
17 changes: 1 addition & 16 deletions scalellm/csrc/vlm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,6 @@ namespace py = pybind11;
using namespace pybind11::literals;

void init_vlm_handler(py::module_& m) {
py::enum_<Priority>(m, "Priority")
.value("DEFAULT", Priority::NORMAL)
.value("LOW", Priority::LOW)
.value("NORMAL", Priority::NORMAL)
.value("HIGH", Priority::HIGH)
.export_values();

py::class_<std::future<bool>>(m, "Future")
.def("wait",
&std::future<bool>::wait,
py::call_guard<py::gil_scoped_release>())
.def("get",
&std::future<bool>::get,
py::call_guard<py::gil_scoped_release>());

auto vlm_handler =
py::class_<VLMHandler>(m, "VLMHandler")
.def(py::init<const VLMHandler::Options&>(), py::arg("options"))
Expand Down Expand Up @@ -91,7 +76,7 @@ void init_vlm_handler(py::module_& m) {
"cuda_graph_batch_sizes={}, "
"max_tokens_per_batch={}, max_seqs_per_batch={}, "
"num_handling_threads={}, "
"image_input_type={}, image_token_id={},
"image_input_type={}, image_token_id={}, "
"image_input_shape={}, image_feature_size={})"_s.format(
self.model_path_,
self.devices_,
Expand Down
6 changes: 3 additions & 3 deletions src/engine/vlm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
#include "models/model_args.h"
#include "vlm_worker.h"

DEFINE_COUNTER(prepare_input_latency_seconds,
"Latency of preparing input in seconds");
// DEFINE_COUNTER(prepare_input_latency_seconds,
// "Latency of preparing input in seconds");

namespace llm {
namespace {
Expand Down Expand Up @@ -270,7 +270,7 @@ ModelOutput VLMEngine::execute_model(Batch& batch) {
Timer timer;
auto model_inputs = batch.prepare_model_input(options_.num_decoding_tokens(),
adjusted_batch_size);
COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());
// COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());

if (!model_inputs.token_ids.defined()) {
// empty input, just return
Expand Down
28 changes: 14 additions & 14 deletions src/engine/vlm_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@
#include "sampling/sampler.h"

// latency metrics
DEFINE_COUNTER_FAMILY(execution_latency_seconds,
"Execution latency in seconds");
DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
execution_latency_seconds,
{{"stage", "model"}});
DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
execution_latency_seconds,
{{"stage", "logits_processing"}});
DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
execution_latency_seconds,
{{"stage", "sampling"}});
// DEFINE_COUNTER_FAMILY(execution_latency_seconds,
// "Execution latency in seconds");
// DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
// execution_latency_seconds,
// {{"stage", "model"}});
// DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
// execution_latency_seconds,
// {{"stage", "logits_processing"}});
// DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
// execution_latency_seconds,
// {{"stage", "sampling"}});

namespace llm {

Expand Down Expand Up @@ -149,7 +149,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
}

at::cuda::getCurrentCUDAStream().synchronize();
COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());
// COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());

if (!driver_) {
return std::nullopt;
Expand All @@ -166,7 +166,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
sampling_params.unique_token_ids,
sampling_params.unique_token_counts,
sampling_params.unique_token_ids_lens);
COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());
// COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());

// set logits to output
output.logits = logits;
Expand All @@ -179,7 +179,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
auto sample_logits =
logits.index_select(/*dim=*/0, sampling_params.sample_idxes);
auto sample_output = sampler->forward(sample_logits);
COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());
// COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());

// set sample output to output
output.sample_output = sample_output;
Expand Down
154 changes: 77 additions & 77 deletions src/handlers/vlm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,86 +20,86 @@
#include "request/request.h"
#include "speculative/speculative_engine.h"

DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status");
DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}});
DEFINE_COUNTER_INSTANCE(request_cancelled,
request_status_total,
{{"code", "CANCELLED"}});
DEFINE_COUNTER_INSTANCE(request_unknown,
request_status_total,
{{"code", "UNKNOWN"}});
DEFINE_COUNTER_INSTANCE(request_invalid_argument,
request_status_total,
{{"code", "INVALID_ARGUMENT"}});
DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
request_status_total,
{{"code", "DEADLINE_EXCEEDED"}});
DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
request_status_total,
{{"code", "RESOURCE_EXHAUSTED"}});
DEFINE_COUNTER_INSTANCE(request_unauthenticated,
request_status_total,
{{"code", "UNAUTHENTICATED"}});
DEFINE_COUNTER_INSTANCE(request_unavailable,
request_status_total,
{{"code", "UNAVAILABLE"}});
DEFINE_COUNTER_INSTANCE(request_unimplemented,
request_status_total,
{{"code", "UNIMPLEMENTED"}});

DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
"Request handling latency in seconds");
DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "chat"}});
DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "completion"}});

DEFINE_COUNTER(tokenization_latency_seconds,
"Prompt tokenization latency in seconds");
DEFINE_COUNTER(chat_template_latency_seconds,
"Chat template latency in seconds");
// DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request
// status"); DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code",
// "OK"}}); DEFINE_COUNTER_INSTANCE(request_cancelled,
// request_status_total,
// {{"code", "CANCELLED"}});
// DEFINE_COUNTER_INSTANCE(request_unknown,
// request_status_total,
// {{"code", "UNKNOWN"}});
// DEFINE_COUNTER_INSTANCE(request_invalid_argument,
// request_status_total,
// {{"code", "INVALID_ARGUMENT"}});
// DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
// request_status_total,
// {{"code", "DEADLINE_EXCEEDED"}});
// DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
// request_status_total,
// {{"code", "RESOURCE_EXHAUSTED"}});
// DEFINE_COUNTER_INSTANCE(request_unauthenticated,
// request_status_total,
// {{"code", "UNAUTHENTICATED"}});
// DEFINE_COUNTER_INSTANCE(request_unavailable,
// request_status_total,
// {{"code", "UNAVAILABLE"}});
// DEFINE_COUNTER_INSTANCE(request_unimplemented,
// request_status_total,
// {{"code", "UNIMPLEMENTED"}});

// DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
// "Request handling latency in seconds");
// DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
// request_handling_latency_seconds,
// {{"type", "chat"}});
// DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
// request_handling_latency_seconds,
// {{"type", "completion"}});

// DEFINE_COUNTER(tokenization_latency_seconds,
// "Prompt tokenization latency in seconds");
// DEFINE_COUNTER(chat_template_latency_seconds,
// "Chat template latency in seconds");

namespace llm {
namespace {

#define CALLBACK_WITH_ERROR(CODE, MSG) callback(Status{CODE, MSG});

void log_request_status(StatusCode code) {
switch (code) {
case StatusCode::OK:
COUNTER_INC(request_ok);
break;
case StatusCode::CANCELLED:
COUNTER_INC(request_cancelled);
break;
case StatusCode::UNKNOWN:
COUNTER_INC(request_unknown);
break;
case StatusCode::INVALID_ARGUMENT:
COUNTER_INC(request_invalid_argument);
break;
case StatusCode::DEADLINE_EXCEEDED:
COUNTER_INC(request_deadline_exceeded);
break;
case StatusCode::RESOURCE_EXHAUSTED:
COUNTER_INC(request_resource_exhausted);
break;
case StatusCode::UNAUTHENTICATED:
COUNTER_INC(request_unauthenticated);
break;
case StatusCode::UNAVAILABLE:
COUNTER_INC(request_unavailable);
break;
case StatusCode::UNIMPLEMENTED:
COUNTER_INC(request_unimplemented);
break;
default:
COUNTER_INC(request_unknown);
break;
}
}
// void log_request_status(StatusCode code) {
// switch (code) {
// case StatusCode::OK:
// COUNTER_INC(request_ok);
// break;
// case StatusCode::CANCELLED:
// COUNTER_INC(request_cancelled);
// break;
// case StatusCode::UNKNOWN:
// COUNTER_INC(request_unknown);
// break;
// case StatusCode::INVALID_ARGUMENT:
// COUNTER_INC(request_invalid_argument);
// break;
// case StatusCode::DEADLINE_EXCEEDED:
// COUNTER_INC(request_deadline_exceeded);
// break;
// case StatusCode::RESOURCE_EXHAUSTED:
// COUNTER_INC(request_resource_exhausted);
// break;
// case StatusCode::UNAUTHENTICATED:
// COUNTER_INC(request_unauthenticated);
// break;
// case StatusCode::UNAVAILABLE:
// COUNTER_INC(request_unavailable);
// break;
// case StatusCode::UNIMPLEMENTED:
// COUNTER_INC(request_unimplemented);
// break;
// default:
// COUNTER_INC(request_unknown);
// break;
// }
// }

bool verify_params(const SamplingParams& sp, OutputCallback callback) {
if (sp.n == 0) {
Expand Down Expand Up @@ -220,7 +220,7 @@ std::future<bool> VLMHandler::schedule_async(torch::Tensor image,
stream,
[callback = std::move(callback)](const RequestOutput& output) {
if (output.status.has_value()) {
log_request_status(output.status.value().code());
// log_request_status(output.status.value().code());
}
return callback(output);
});
Expand All @@ -243,7 +243,7 @@ std::future<bool> VLMHandler::schedule(torch::Tensor image,
priority,
stream,
callback = std::move(callback)](size_t tid) mutable {
AUTO_COUNTER(completion_handling_latency_seconds);
// AUTO_COUNTER(completion_handling_latency_seconds);

// remove the pending request after scheduling
SCOPE_GUARD([this] { scheduler_->dec_pending_requests(); });
Expand Down Expand Up @@ -343,7 +343,7 @@ std::unique_ptr<Request> VLMHandler::create_request(size_t tid,
"Failed to encode prompt");
return nullptr;
}
COUNTER_ADD(tokenization_latency_seconds, timer.elapsed_seconds());
// COUNTER_ADD(tokenization_latency_seconds, timer.elapsed_seconds());

// encode the image, encode & projector
auto vision_engine = dynamic_cast<VisionEngine*>(engine_.get());
Expand Down

0 comments on commit d711c55

Please sign in to comment.