From 9ed1957cd307c8ff61dc04d11f932d5940ef39bd Mon Sep 17 00:00:00 2001 From: Vlad Kuznetsov Date: Tue, 20 Aug 2024 19:17:40 +0200 Subject: [PATCH 1/4] Add more introspection into PDisk's ChunkWrite and LogWrite (#7941) Co-authored-by: Vlad Kuznecov --- .../lwtrace_probes/blobstorage_probes.h | 22 +++++++++----- .../blobstorage_pdisk_blockdevice_async.cpp | 30 +++++++++++-------- .../pdisk/blobstorage_pdisk_completion.h | 1 + .../blobstorage_pdisk_completion_impl.cpp | 6 ++-- .../pdisk/blobstorage_pdisk_impl.cpp | 5 ++-- .../pdisk/blobstorage_pdisk_mon.cpp | 1 + .../blobstorage/pdisk/blobstorage_pdisk_mon.h | 1 + 7 files changed, 42 insertions(+), 24 deletions(-) diff --git a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h index 766e2d751483..375d0161d11d 100644 --- a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h +++ b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h @@ -207,13 +207,13 @@ struct TEventTypeField { NAMES("pdisk", "schedStep")) \ PROBE(PDiskChunkReadPieceAddToScheduler, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui32, ui64, ui64), \ - NAMES("pdisk", "pieceIdx", "size", "offset")) \ - PROBE(PDiskChunkReadPieceSendToDevice, GROUPS("PDisk", "PDiskRequest"), \ - TYPES(TPDiskIdField, ui64), \ - NAMES("pdisk", "size")) \ + NAMES("pdisk", "pieceIdx", "offset", "size")) \ + PROBE(PDiskChunkReadPiecesSendToDevice, GROUPS("PDisk", "PDiskRequest"), \ + TYPES(TPDiskIdField), \ + NAMES("pdisk")) \ PROBE(PDiskChunkReadPieceComplete, GROUPS("PDisk", "PDiskRequest"), \ - TYPES(TPDiskIdField, ui64, ui64), \ - NAMES("pdisk", "size", "relativeOffset")) \ + TYPES(TPDiskIdField, ui64, ui64, double), \ + NAMES("pdisk", "size", "relativeOffset", "deviceTimeMs")) \ PROBE(PDiskAddWritePieceToScheduler, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, double, ui64, bool, ui64, ui64), \ NAMES("pdisk", "reqId", "creationTimeSec", "owner", "isFast", "priorityClass", "size")) \ @@ -221,8 +221,8 @@ struct TEventTypeField { TYPES(TPDiskIdField, ui64, ui64, ui64, ui64), \ NAMES("pdisk", "owner", "chunkIdx", "pieceOffset", "pieceSize")) \ PROBE(PDiskLogWriteComplete, GROUPS("PDisk", "PDiskRequest"), \ - TYPES(TPDiskIdField, ui64, double, double, double, double, double, double), \ - NAMES("pdisk", "reqId", "creationTimeSec", "costMs", "responseTimeMs", "inputTimeMs", "scheduleTimeMs", "deviceTotalTimeMs")) \ + TYPES(TPDiskIdField, ui64, double, double, double, double, double, double, double), \ + NAMES("pdisk", "reqId", "creationTimeSec", "costMs", "responseTimeMs", "inputTimeMs", "scheduleTimeMs", "deviceTotalTimeMs", "deviceOnlyTimeMs")) \ PROBE(PDiskChunkResponseTime, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, ui64, double, ui64), \ NAMES("pdisk", "reqId", "priorityClass", "responseTimeMs", "sizeBytes")) \ @@ -235,6 +235,12 @@ struct TEventTypeField { PROBE(PDiskDeviceWriteDuration, GROUPS("PDisk"), \ TYPES(TPDiskIdField, double, ui64), \ NAMES("pdisk", "deviceTimeMs", "size")) \ + PROBE(PDiskDeviceGetFromDevice, GROUPS("PDisk"), \ + TYPES(), \ + NAMES()) \ + PROBE(PDiskDeviceGetFromWaiting, GROUPS("PDisk"), \ + TYPES(), \ + NAMES()) \ PROBE(PDiskDeviceTrimDuration, GROUPS("PDisk"), \ TYPES(TPDiskIdField, double, ui64), \ NAMES("pdisk", "trimTimeMs", "trimOffset")) \ diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp index 7f2792c639de..2582cf639cb2 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp @@ -334,8 +334,8 @@ class TRealBlockDevice : public IBlockDevice { class TSharedCallback : public ICallback { ui64 NextPossibleNoop = 0; ui64 EndOffset = 0; - ui64 PrevEventGotAtCycle = HPNow(); - ui64 PrevEstimationAtCycle = HPNow(); + NHPTimer::STime PrevEventGotAtCycle = HPNow(); + NHPTimer::STime PrevEstimationAtCycle = HPNow(); ui64 PrevEstimatedCostNs = 0; ui64 PrevActualCostNs = 0; @@ -366,29 +366,32 @@ class TRealBlockDevice : public IBlockDevice { void Exec(TAsyncIoOperationResult *event) { IAsyncIoOperation *op = event->Operation; // Add up the execution time of all the events - ui64 totalExecutionCycles = 0; - ui64 totalCostNs = 0; - ui64 eventGotAtCycle = HPNow(); + NHPTimer::STime eventGotAtCycle = HPNow(); AtomicSet(Device.Mon.LastDoneOperationTimestamp, eventGotAtCycle); TCompletionAction *completionAction = static_cast(op->GetCookie()); FillCompletionAction(completionAction, op, event->Result); + completionAction->GetTime = eventGotAtCycle; + LWTRACK(PDiskDeviceGetFromDevice, completionAction->Orbit); + if (completionAction->FlushAction) { + completionAction->FlushAction->GetTime = eventGotAtCycle; + LWTRACK(PDiskDeviceGetFromDevice, completionAction->FlushAction->Orbit); + } Device.QuitCounter.Decrement(); Device.IdleCounter.Decrement(); Device.FlightControl.MarkComplete(completionAction->OperationIdx); - ui64 startCycle = Max((ui64)completionAction->SubmitTime, PrevEventGotAtCycle); - ui64 durationCycles = (eventGotAtCycle > startCycle) ? eventGotAtCycle - startCycle : 0; - totalExecutionCycles = Max(totalExecutionCycles, durationCycles); - totalCostNs += completionAction->CostNs; + NHPTimer::STime startCycle = Max(completionAction->SubmitTime, (i64)PrevEventGotAtCycle); + NHPTimer::STime durationCycles = (eventGotAtCycle > startCycle) ? eventGotAtCycle - startCycle : 0; + NHPTimer::STime totalExecutionCycles = durationCycles; + NHPTimer::STime totalCostNs = completionAction->CostNs; - bool isSeekExpected = - ((ui64)completionAction->SubmitTime + Device.SeekCostNs / 25ull >= PrevEventGotAtCycle); + bool isSeekExpected = (completionAction->SubmitTime + (NHPTimer::STime)Device.SeekCostNs / 25ll >= PrevEventGotAtCycle); const ui64 opSize = op->GetSize(); Device.DecrementMonInFlight(op->GetType(), opSize); - if (opSize == 0) { + if (opSize == 0) { // Special case for flush operation, which is a read operation with 0 bytes size if (op->GetType() == IAsyncIoOperation::EType::PRead) { Y_ABORT_UNLESS(WaitingNoops[completionAction->OperationIdx % MaxWaitingNoops] == nullptr); WaitingNoops[completionAction->OperationIdx % MaxWaitingNoops] = completionAction; @@ -433,6 +436,9 @@ class TRealBlockDevice : public IBlockDevice { while (NextPossibleNoop < firstIncompleteIdx) { ui64 i = NextPossibleNoop % MaxWaitingNoops; if (WaitingNoops[i] && WaitingNoops[i]->OperationIdx == NextPossibleNoop) { + LWTRACK(PDiskDeviceGetFromWaiting, WaitingNoops[i]->Orbit); + double durationMs = HPMilliSecondsFloat(HPNow() - WaitingNoops[i]->GetTime); + Device.Mon.DeviceFlushDuration.Increment(durationMs); Device.CompletionThread->Schedule(WaitingNoops[i]); WaitingNoops[i] = nullptr; } diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion.h index d2599fdb9456..d066cd14fd27 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion.h @@ -12,6 +12,7 @@ namespace NKikimr::NPDisk { struct TCompletionAction { ui64 OperationIdx; NHPTimer::STime SubmitTime; + NHPTimer::STime GetTime; TCompletionAction *FlushAction = nullptr; ui64 CostNs = 0; NWilson::TTraceId TraceId; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp index 9039624714c2..8e9d5cb04a71 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp @@ -53,7 +53,8 @@ void TCompletionLogWrite::Exec(TActorSystem *actorSystem) { HPMilliSecondsFloat(now - evLog.CreationTime), HPMilliSecondsFloat(evLog.InputTime - evLog.CreationTime), HPMilliSecondsFloat(evLog.ScheduleTime - evLog.InputTime), - HPMilliSecondsFloat(now - evLog.ScheduleTime)); + HPMilliSecondsFloat(now - evLog.ScheduleTime), + HPMilliSecondsFloat(GetTime - SubmitTime)); if (evLog.Result->Results) { evLog.Result->Results.front().Orbit = std::move(evLog.Orbit); } @@ -275,7 +276,8 @@ void TCompletionChunkReadPart::Exec(TActorSystem *actorSystem) { endBadUserOffset = 0xffffffff; } - LWTRACK(PDiskChunkReadPieceComplete, Read->Orbit, PDisk->PDiskId, RawReadSize, CommonBufferOffset); + double deviceTimeMs = HPMilliSecondsFloat(GetTime - SubmitTime); + LWTRACK(PDiskChunkReadPieceComplete, Read->Orbit, PDisk->PDiskId, RawReadSize, CommonBufferOffset, deviceTimeMs); CumulativeCompletion->PartReadComplete(actorSystem); CumulativeCompletion = nullptr; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index fc0c597a99ca..8aa491f18c12 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -2268,6 +2268,7 @@ void TPDisk::ProcessChunkReadQueue() { bool isComplete = false; ui8 priorityClass = read->PriorityClass; NHPTimer::STime creationTime = read->CreationTime; + LWTRACK(PDiskChunkReadPiecesSendToDevice, read->Orbit, PDiskId); if (!read->IsReplied) { LOG_DEBUG_S(*ActorSystem, NKikimrServices::BS_PDISK, "PDiskId# " << (ui32)PDiskId << " ReqId# " << reqId @@ -3206,7 +3207,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { // Schedule small job. auto piece = new TChunkReadPiece(read, idx * smallJobSize, smallJobSize * Format.SectorSize, false, std::move(span)); - LWTRACK(PDiskChunkReadPieceAddToScheduler, read->Orbit, PDiskId, idx, idx * smallJobSize, + LWTRACK(PDiskChunkReadPieceAddToScheduler, read->Orbit, PDiskId, idx, idx * smallJobSize * Format.SectorSize, smallJobSize * Format.SectorSize); piece->EstimateCost(DriveModel); piece->SelfPointer = piece; @@ -3218,7 +3219,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { auto piece = new TChunkReadPiece(read, smallJobCount * smallJobSize, largeJobSize * Format.SectorSize, true, std::move(span)); LWTRACK(PDiskChunkReadPieceAddToScheduler, read->Orbit, PDiskId, smallJobCount, - smallJobCount * smallJobSize, largeJobSize * Format.SectorSize); + smallJobCount * smallJobSize * Format.SectorSize, largeJobSize * Format.SectorSize); piece->EstimateCost(DriveModel); piece->SelfPointer = piece; AddJobToForseti(cbs, piece, request->JobKind); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp index 1dd8385b8d8a..bb3bbdaa0bc9 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp @@ -142,6 +142,7 @@ TPDiskMon::TPDiskMon(const TIntrusivePtr<::NMonitoring::TDynamicCounters>& count HISTOGRAM_INIT(DeviceReadDuration, deviceReadDuration); HISTOGRAM_INIT(DeviceWriteDuration, deviceWriteDuration); HISTOGRAM_INIT(DeviceTrimDuration, deviceTrimDuration); + HISTOGRAM_INIT(DeviceFlushDuration, deviceFlushDuration); TRACKER_INIT_IF_EXTENDED(LogQueueTime, logQueueTime, Time in millisec); TRACKER_INIT_IF_EXTENDED(GetQueueSyncLog, getQueueSyncLog, Time in millisec); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h index 4b978bfcec03..c3336452d526 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h @@ -302,6 +302,7 @@ struct TPDiskMon { THistogram DeviceReadDuration; THistogram DeviceWriteDuration; THistogram DeviceTrimDuration; + THistogram DeviceFlushDuration; // using TDurationTracker = NMonitoring::TPercentileTrackerLg<5, 4, 15>; From 17138fe8321881cc6dde0ea058a4aeb516e04a65 Mon Sep 17 00:00:00 2001 From: Vlad Kuznetsov Date: Thu, 22 Aug 2024 18:40:49 +0200 Subject: [PATCH 2/4] PDisk LWTracing improvements (#8160) Co-authored-by: Vlad Kuznecov --- .../lwtrace_probes/blobstorage_probes.h | 4 ++-- .../blobstorage_pdisk_blockdevice_async.cpp | 6 +++++ .../blobstorage_pdisk_completion_impl.cpp | 3 ++- .../pdisk/blobstorage_pdisk_completion_impl.h | 1 + .../pdisk/blobstorage_pdisk_impl.cpp | 22 ++++++++++--------- .../pdisk/blobstorage_pdisk_impl.h | 2 +- .../pdisk/blobstorage_pdisk_impl_log.cpp | 20 ++++++++++++----- 7 files changed, 38 insertions(+), 20 deletions(-) diff --git a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h index 375d0161d11d..49ed0cacfe9e 100644 --- a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h +++ b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h @@ -214,10 +214,10 @@ struct TEventTypeField { PROBE(PDiskChunkReadPieceComplete, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, ui64, double), \ NAMES("pdisk", "size", "relativeOffset", "deviceTimeMs")) \ - PROBE(PDiskAddWritePieceToScheduler, GROUPS("PDisk", "PDiskRequest"), \ + PROBE(PDiskChunkWriteAddToScheduler, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, double, ui64, bool, ui64, ui64), \ NAMES("pdisk", "reqId", "creationTimeSec", "owner", "isFast", "priorityClass", "size")) \ - PROBE(PDiskChunkWritePieceSendToDevice, GROUPS("PDisk", "PDiskRequest"), \ + PROBE(PDiskChunkWriteLastPieceSendToDevice, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, ui64, ui64, ui64), \ NAMES("pdisk", "owner", "chunkIdx", "pieceOffset", "pieceSize")) \ PROBE(PDiskLogWriteComplete, GROUPS("PDisk", "PDiskRequest"), \ diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp index 2582cf639cb2..53577a05bd9b 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_blockdevice_async.cpp @@ -242,6 +242,9 @@ class TRealBlockDevice : public IBlockDevice { EIoResult ret = EIoResult::TryAgain; while (ret == EIoResult::TryAgain) { action->SubmitTime = HPNow(); + if (action->FlushAction) { + action->FlushAction->SubmitTime = action->SubmitTime; + } if (op->GetType() == IAsyncIoOperation::EType::PWrite) { PDISK_FAIL_INJECTION(1); @@ -563,6 +566,9 @@ class TRealBlockDevice : public IBlockDevice { EIoResult ret = EIoResult::TryAgain; while (ret == EIoResult::TryAgain) { action->SubmitTime = HPNow(); + if (action->FlushAction) { + action->FlushAction->SubmitTime = action->SubmitTime; + } ret = Device.IoContext->Submit(op, Device.SharedCallback.Get()); if (ret == EIoResult::Ok) { return true; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp index 8e9d5cb04a71..27e8fb697d4d 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp @@ -277,7 +277,8 @@ void TCompletionChunkReadPart::Exec(TActorSystem *actorSystem) { } double deviceTimeMs = HPMilliSecondsFloat(GetTime - SubmitTime); - LWTRACK(PDiskChunkReadPieceComplete, Read->Orbit, PDisk->PDiskId, RawReadSize, CommonBufferOffset, deviceTimeMs); + LWTRACK(PDiskChunkReadPieceComplete, Orbit, PDisk->PDiskId, RawReadSize, CommonBufferOffset, deviceTimeMs); + Read->Orbit.Join(Orbit); CumulativeCompletion->PartReadComplete(actorSystem); CumulativeCompletion = nullptr; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h index 5ae63cc4384f..feb626f6fc82 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.h @@ -111,6 +111,7 @@ class TCompletionChunkWrite : public TCompletionAction { Mon->IncrementResponseTime(PriorityClass, responseTimeMs, SizeBytes); } LWTRACK(PDiskChunkResponseTime, Orbit, PDiskId, ReqId.Id, PriorityClass, responseTimeMs, SizeBytes); + Event->Orbit = std::move(Orbit); actorSystem->Send(Recipient, Event.Release()); if (Mon) { Mon->GetWriteCounter(PriorityClass)->CountResponse(); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index 8aa491f18c12..40479eddd76d 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -843,9 +843,6 @@ bool TPDisk::ChunkWritePiece(TChunkWrite *evChunkWrite, ui32 pieceShift, ui32 pi guard.Release(); - LWTRACK(PDiskChunkWritePieceSendToDevice, evChunkWrite->Orbit, PDiskId, evChunkWrite->Owner, chunkIdx, - pieceShift, pieceSize); - ui32 bytesAvailable = pieceSize; Y_ABORT_UNLESS(evChunkWrite->BytesWritten == pieceShift); const ui32 count = evChunkWrite->PartsPtr->Size(); @@ -904,6 +901,9 @@ bool TPDisk::ChunkWritePiece(TChunkWrite *evChunkWrite, ui32 pieceShift, ui32 pi LOG_INFO(*ActorSystem, NKikimrServices::BS_PDISK, "PDiskId# %" PRIu32 " chunkIdx# %" PRIu32 " was zero-padded after writing", (ui32)PDiskId, (ui32)chunkIdx); } + LWTRACK(PDiskChunkWriteLastPieceSendToDevice, evChunkWrite->Orbit, PDiskId, evChunkWrite->Owner, chunkIdx, + pieceShift, pieceSize); + auto traceId = evChunkWrite->SpanStack.GetTraceId(); evChunkWrite->Completion->Orbit = std::move(evChunkWrite->Orbit); writer.Flush(evChunkWrite->ReqId, &traceId, evChunkWrite->Completion.Release()); @@ -946,7 +946,7 @@ void TPDisk::SendChunkReadError(const TIntrusivePtr& read, TStringSt } TPDisk::EChunkReadPieceResult TPDisk::ChunkReadPiece(TIntrusivePtr &read, ui64 pieceCurrentSector, - ui64 pieceSizeLimit, ui64 *reallyReadDiskBytes, NWilson::TTraceId traceId) { + ui64 pieceSizeLimit, ui64 *reallyReadDiskBytes, NWilson::TTraceId traceId, NLWTrace::TOrbit&& orbit) { if (read->IsReplied) { return ReadPieceResultOk; } @@ -1015,6 +1015,8 @@ TPDisk::EChunkReadPieceResult TPDisk::ChunkReadPiece(TIntrusivePtr & THolder completion(new TCompletionChunkReadPart(this, read, bytesToRead, payloadBytesToRead, payloadOffset, read->FinalCompletion, isTheLastPart, Cfg->UseT1ha0HashInFooter, std::move(span))); completion->CostNs = DriveModel.TimeForSizeNs(bytesToRead, read->ChunkIdx, TDriveModel::OP_TYPE_READ); + LWTRACK(PDiskChunkReadPiecesSendToDevice, orbit, PDiskId); + completion->Orbit = std::move(orbit); Y_ABORT_UNLESS(bytesToRead <= completion->GetBuffer()->Size()); ui8 *data = completion->GetBuffer()->Data(); BlockDevice->PreadAsync(data, bytesToRead, readOffset, completion.Release(), @@ -2268,7 +2270,6 @@ void TPDisk::ProcessChunkReadQueue() { bool isComplete = false; ui8 priorityClass = read->PriorityClass; NHPTimer::STime creationTime = read->CreationTime; - LWTRACK(PDiskChunkReadPiecesSendToDevice, read->Orbit, PDiskId); if (!read->IsReplied) { LOG_DEBUG_S(*ActorSystem, NKikimrServices::BS_PDISK, "PDiskId# " << (ui32)PDiskId << " ReqId# " << reqId @@ -2280,7 +2281,7 @@ void TPDisk::ProcessChunkReadQueue() { ui64 currentLimit = Min(bufferSize, piece->PieceSizeLimit - size); ui64 reallyReadDiskBytes; EChunkReadPieceResult result = ChunkReadPiece(read, piece->PieceCurrentSector + size / Format.SectorSize, - currentLimit, &reallyReadDiskBytes, piece->SpanStack.GetTraceId()); + currentLimit, &reallyReadDiskBytes, piece->SpanStack.GetTraceId(), std::move(piece->Orbit)); isComplete = (result != ReadPieceResultInProgress); // Read pieces is sliced previously and it is expected that ChunkReadPiece will read exactly // currentLimit bytes @@ -2942,7 +2943,6 @@ bool TPDisk::PreprocessRequest(TRequestBase *request) { auto result = std::make_unique(NKikimrProto::OK, ev.ChunkIdx, ev.Cookie, GetStatusFlags(ev.Owner, ev.OwnerGroupType), TString()); - result->Orbit = std::move(ev.Orbit); ++state.OperationsInProgress; ++ownerData.InFlight->ChunkWrites; @@ -3186,7 +3186,7 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { TChunkWritePiece *piece = new TChunkWritePiece(whole, smallJobCount * smallJobSize, largeJobSize, std::move(span)); piece->EstimateCost(DriveModel); AddJobToForseti(cbs, piece, request->JobKind); - LWTRACK(PDiskAddWritePieceToScheduler, request->Orbit, PDiskId, request->ReqId.Id, + LWTRACK(PDiskChunkWriteAddToScheduler, request->Orbit, PDiskId, request->ReqId.Id, HPSecondsFloat(HPNow() - request->CreationTime), request->Owner, request->IsFast, request->PriorityClass, whole->TotalSize); } else if (request->GetType() == ERequestType::RequestChunkRead) { @@ -3207,7 +3207,8 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { // Schedule small job. auto piece = new TChunkReadPiece(read, idx * smallJobSize, smallJobSize * Format.SectorSize, false, std::move(span)); - LWTRACK(PDiskChunkReadPieceAddToScheduler, read->Orbit, PDiskId, idx, idx * smallJobSize * Format.SectorSize, + read->Orbit.Fork(piece->Orbit); + LWTRACK(PDiskChunkReadPieceAddToScheduler, piece->Orbit, PDiskId, idx, idx * smallJobSize * Format.SectorSize, smallJobSize * Format.SectorSize); piece->EstimateCost(DriveModel); piece->SelfPointer = piece; @@ -3218,7 +3219,8 @@ void TPDisk::PushRequestToForseti(TRequestBase *request) { span.Attribute("is_last_piece", true); auto piece = new TChunkReadPiece(read, smallJobCount * smallJobSize, largeJobSize * Format.SectorSize, true, std::move(span)); - LWTRACK(PDiskChunkReadPieceAddToScheduler, read->Orbit, PDiskId, smallJobCount, + read->Orbit.Fork(piece->Orbit); + LWTRACK(PDiskChunkReadPieceAddToScheduler, piece->Orbit, PDiskId, smallJobCount, smallJobCount * smallJobSize * Format.SectorSize, largeJobSize * Format.SectorSize); piece->EstimateCost(DriveModel); piece->SelfPointer = piece; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.h index ffc694049664..cbe7f6972988 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.h @@ -288,7 +288,7 @@ class TPDisk : public IPDisk { void SendChunkReadError(const TIntrusivePtr& read, TStringStream& errorReason, NKikimrProto::EReplyStatus status); EChunkReadPieceResult ChunkReadPiece(TIntrusivePtr &read, ui64 pieceCurrentSector, ui64 pieceSizeLimit, - ui64 *reallyReadBytes, NWilson::TTraceId traceId); + ui64 *reallyReadBytes, NWilson::TTraceId traceId, NLWTrace::TOrbit&& orbit); void SplitChunkJobSize(ui32 totalSize, ui32 *outSmallJobSize, ui32 *outLargeJObSize, ui32 *outSmallJobCount); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Chunk locking diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp index fe4778e6c2ef..c5e0fbfbd980 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp @@ -19,23 +19,31 @@ class TLogFlushCompletionAction : public TCompletionAction { : EndChunkIdx(endChunkIdx) , EndSectorIdx(endSectorIdx) , CommonLogger(commonLogger) - , CompletionLogWrite(completionLogWrite) { } + , CompletionLogWrite(completionLogWrite) + { + Orbit = std::move(completionLogWrite->Orbit); + } + + void SetUpCompletionLogWrite() { + CompletionLogWrite->SubmitTime = SubmitTime; + CompletionLogWrite->GetTime = GetTime; + CompletionLogWrite->SetResult(Result); + CompletionLogWrite->SetErrorReason(ErrorReason); + CompletionLogWrite->Orbit = std::move(Orbit); + } void Exec(TActorSystem *actorSystem) override { CommonLogger->FirstUncommitted = TFirstUncommitted(EndChunkIdx, EndSectorIdx); - CompletionLogWrite->SetResult(Result); - CompletionLogWrite->SetErrorReason(ErrorReason); + SetUpCompletionLogWrite(); CompletionLogWrite->Exec(actorSystem); delete this; } void Release(TActorSystem *actorSystem) override { - CompletionLogWrite->SetResult(Result); - CompletionLogWrite->SetErrorReason(ErrorReason); + SetUpCompletionLogWrite(); CompletionLogWrite->Release(actorSystem); - delete this; } }; From a81c3e39b18b8b177318a616820a5d000fe03f83 Mon Sep 17 00:00:00 2001 From: Vlad Kuznetsov Date: Thu, 12 Sep 2024 18:40:05 +0200 Subject: [PATCH 3/4] Limit PDisk's batches and improve LWTrace around PDisk's event loop (#9099) Co-authored-by: Vlad Kuznecov --- .../lwtrace_probes/blobstorage_probes.h | 11 +++++++---- .../pdisk/blobstorage_pdisk_completion_impl.cpp | 7 +++++-- .../pdisk/blobstorage_pdisk_impl.cpp | 17 +++++++++++++++++ .../pdisk/blobstorage_pdisk_impl_log.cpp | 2 +- .../blobstorage/pdisk/blobstorage_pdisk_mon.cpp | 1 + .../blobstorage/pdisk/blobstorage_pdisk_mon.h | 8 +++++++- 6 files changed, 38 insertions(+), 8 deletions(-) diff --git a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h index 49ed0cacfe9e..bf91ace04e2b 100644 --- a/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h +++ b/ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h @@ -221,8 +221,8 @@ struct TEventTypeField { TYPES(TPDiskIdField, ui64, ui64, ui64, ui64), \ NAMES("pdisk", "owner", "chunkIdx", "pieceOffset", "pieceSize")) \ PROBE(PDiskLogWriteComplete, GROUPS("PDisk", "PDiskRequest"), \ - TYPES(TPDiskIdField, ui64, double, double, double, double, double, double, double), \ - NAMES("pdisk", "reqId", "creationTimeSec", "costMs", "responseTimeMs", "inputTimeMs", "scheduleTimeMs", "deviceTotalTimeMs", "deviceOnlyTimeMs")) \ + TYPES(TPDiskIdField, ui64, double, double, double, double, double, double, double, ui64), \ + NAMES("pdisk", "reqId", "creationTimeSec", "costMs", "responseTimeMs", "inputTimeMs", "scheduleTimeMs", "deviceTotalTimeMs", "deviceOnlyTimeMs", "batchSize")) \ PROBE(PDiskChunkResponseTime, GROUPS("PDisk", "PDiskRequest"), \ TYPES(TPDiskIdField, ui64, ui64, double, ui64), \ NAMES("pdisk", "reqId", "priorityClass", "responseTimeMs", "sizeBytes")) \ @@ -261,8 +261,11 @@ struct TEventTypeField { TYPES(ui32, ui64, ui64), \ NAMES("chunkIdx", "size", "offset")) \ PROBE(PDiskUpdateCycleDetails, GROUPS("PDisk"), \ - TYPES(float, float, float, float, float), \ - NAMES("entireUpdateMs", "inputQueueMs", "schedulingMs", "processingMs", "waitingMs")) \ + TYPES(ui32, float, float, float, float, float), \ + NAMES("pdisk", "entireUpdateMs", "inputQueueMs", "schedulingMs", "processingMs", "waitingMs")) \ + PROBE(PDiskEnqueueAllDetails, GROUPS("PDisk"), \ + TYPES(ui64, size_t, size_t, size_t, double), \ + NAMES("pdisk", "initialQueueSize", "processedReqs", "pushedToForsetiReqs", "spentTimeMs")) \ PROBE(DSProxyGetEnqueue, GROUPS("DSProxy", "LWTrackStart"), TYPES(), NAMES()) \ PROBE(DSProxyGetBootstrap, GROUPS("DSProxy"), TYPES(), NAMES()) \ PROBE(DSProxyGetHandle, GROUPS("DSProxy", "LWTrackStart"), TYPES(), NAMES()) \ diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp index 27e8fb697d4d..8809e9868fbd 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_completion_impl.cpp @@ -2,6 +2,8 @@ #include "blobstorage_pdisk_impl.h" #include "blobstorage_pdisk_sectorrestorator.h" +constexpr size_t MAX_RESULTS_PER_BATCH = 50; // It took ~0.25ms in VDisk's handler to process such batch + namespace NKikimr { namespace NPDisk { @@ -54,7 +56,8 @@ void TCompletionLogWrite::Exec(TActorSystem *actorSystem) { HPMilliSecondsFloat(evLog.InputTime - evLog.CreationTime), HPMilliSecondsFloat(evLog.ScheduleTime - evLog.InputTime), HPMilliSecondsFloat(now - evLog.ScheduleTime), - HPMilliSecondsFloat(GetTime - SubmitTime)); + HPMilliSecondsFloat(GetTime - SubmitTime), + batch ? batch->Result->Results.size() : 0); if (evLog.Result->Results) { evLog.Result->Results.front().Orbit = std::move(evLog.Orbit); } @@ -66,7 +69,7 @@ void TCompletionLogWrite::Exec(TActorSystem *actorSystem) { } if (evLog.Result->Status == NKikimrProto::OK) { if (batch) { - if (batch->Sender == evLog.Sender) { + if (batch->Sender == evLog.Sender && batch->Result->Results.size() < MAX_RESULTS_PER_BATCH) { batch->Result->Results.push_back(std::move(evLog.Result->Results[0])); } else { sendResponse(batch); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index 40479eddd76d..617948c820e1 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -13,6 +13,8 @@ #include +constexpr size_t MAX_REQS_PER_CYCLE = 200; // 200 requests take ~0.2ms in EnqueueAll function + namespace NKikimr { namespace NPDisk { @@ -3409,7 +3411,14 @@ void TPDisk::ProcessYardInitSet() { } void TPDisk::EnqueueAll() { + TInstant start = TInstant::Now(); + TGuard guard(StateMutex); + size_t initialQueueSize = InputQueue.GetWaitingSize(); + size_t processedReqs = 0; + size_t pushedToForsetiReqs = 0; + + while (InputQueue.GetWaitingSize() > 0) { TRequestBase* request = InputQueue.Pop(); AtomicSub(InputQueueCost, request->Cost); @@ -3452,9 +3461,17 @@ void TPDisk::EnqueueAll() { } else { if (PreprocessRequest(request)) { PushRequestToForseti(request); + ++pushedToForsetiReqs; } } + ++processedReqs; + if (processedReqs >= MAX_REQS_PER_CYCLE) { + break; + } } + + double spentTimeMs = (TInstant::Now() - start).MillisecondsFloat(); + LWPROBE(PDiskEnqueueAllDetails, PCtx->PDiskId, initialQueueSize, processedReqs, pushedToForsetiReqs, spentTimeMs); } void TPDisk::Update() { diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp index c5e0fbfbd980..51430b496d62 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_log.cpp @@ -908,8 +908,8 @@ void TPDisk::LogWrite(TLogWrite &evLog, TVector &logChunksToCommit) { LOG_ERROR(*ActorSystem, NKikimrServices::BS_PDISK, "%s", str.Str().c_str()); evLog.Result.Reset(new NPDisk::TEvLogResult(NKikimrProto::OUT_OF_SPACE, NotEnoughDiskSpaceStatusFlags(evLog.Owner, evLog.OwnerGroupType), str.Str())); - evLog.Result->Results.push_back(NPDisk::TEvLogResult::TRecord(evLog.Lsn, evLog.Cookie)); Y_ABORT_UNLESS(evLog.Result.Get()); + evLog.Result->Results.push_back(NPDisk::TEvLogResult::TRecord(evLog.Lsn, evLog.Cookie)); return; } if (!CommonLogger->NextChunks.empty()) { diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp index bb3bbdaa0bc9..7061b4fa49ab 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.cpp @@ -120,6 +120,7 @@ TPDiskMon::TPDiskMon(const TIntrusivePtr<::NMonitoring::TDynamicCounters>& count COUNTER_INIT(DeviceGroup, DeviceIoErrors, true); COUNTER_INIT_IF_EXTENDED(DeviceGroup, DeviceWaitTimeMs, true); + UpdateDurationTracker.SetPDiskId(PDiskId); UpdateDurationTracker.SetCounter(DeviceGroup->GetCounter("PDiskThreadBusyTimeNs", true)); // queue subgroup diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h index c3336452d526..08a2f1d816ca 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h @@ -157,6 +157,8 @@ struct TPDiskMon { ::NMonitoring::TDynamicCounters::TCounterPtr PDiskThreadBusyTimeNs; + ui32 PDiskId = 0; + public: NMonitoring::TPercentileTrackerLg<5, 4, 15> UpdateCycleTime; @@ -165,6 +167,10 @@ struct TPDiskMon { : BeginUpdateAt(HPNow()) {} + void SetPDiskId(ui32 pdiskId) { + PDiskId = pdiskId; + } + void SetCounter(const ::NMonitoring::TDynamicCounters::TCounterPtr& pDiskThreadBusyTimeNs) { PDiskThreadBusyTimeNs = pDiskThreadBusyTimeNs; } @@ -208,7 +214,7 @@ struct TPDiskMon { float schedulingMs = HPMilliSecondsFloat(ProcessingStartAt - SchedulingStartAt); float processingMs = HPMilliSecondsFloat(WaitingStartAt - ProcessingStartAt); float waitingMs = HPMilliSecondsFloat(updateEndedAt - WaitingStartAt); - GLOBAL_LWPROBE(BLOBSTORAGE_PROVIDER, PDiskUpdateCycleDetails, entireUpdateMs, inputQueueMs, + GLOBAL_LWPROBE(BLOBSTORAGE_PROVIDER, PDiskUpdateCycleDetails, PDiskId, entireUpdateMs, inputQueueMs, schedulingMs, processingMs, waitingMs); } BeginUpdateAt = updateEndedAt; From 4704f4f4bf7ee651423e6ef4760b603064afd3ec Mon Sep 17 00:00:00 2001 From: Vlad Kuznecov Date: Thu, 12 Sep 2024 16:49:07 +0000 Subject: [PATCH 4/4] fix build due to usage of PCtx --- ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp index 617948c820e1..0443d97eb633 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl.cpp @@ -3471,7 +3471,7 @@ void TPDisk::EnqueueAll() { } double spentTimeMs = (TInstant::Now() - start).MillisecondsFloat(); - LWPROBE(PDiskEnqueueAllDetails, PCtx->PDiskId, initialQueueSize, processedReqs, pushedToForsetiReqs, spentTimeMs); + LWPROBE(PDiskEnqueueAllDetails, PDiskId, initialQueueSize, processedReqs, pushedToForsetiReqs, spentTimeMs); } void TPDisk::Update() {