@@ -1185,6 +1185,14 @@ struct vk_staging_memcpy {
11851185 size_t n;
11861186};
11871187
// Record of one host-side fill that has been postponed: the destination
// pointer, the fill value and the byte count, stored exactly as they were
// handed to the constructor.
struct vk_staging_memset {
    vk_staging_memset(void * dst_ptr, uint32_t fill_val, size_t byte_count)
        : dst(dst_ptr),
          val(fill_val),
          n(byte_count) {}

    void *   dst;  // start of the region to fill
    uint32_t val;  // fill value as supplied by the caller
    size_t   n;    // number of bytes to fill
};
1195+
11881196struct vk_context_struct {
11891197 vk_submission * s;
11901198 std::vector<vk_sequence> seqs;
@@ -1193,6 +1201,7 @@ struct vk_context_struct {
11931201
11941202 std::vector<vk_staging_memcpy> in_memcpys;
11951203 std::vector<vk_staging_memcpy> out_memcpys;
1204+ std::vector<vk_staging_memset> memsets;
11961205
11971206 vk_command_pool * p {};
11981207};
@@ -5194,6 +5203,14 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
51945203 }
51955204}
51965205
5206+ static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr) {
5207+ if (memsets == nullptr) {
5208+ memset(dst, val, size);
5209+ } else {
5210+ memsets->emplace_back(dst, val, size);
5211+ }
5212+ }
5213+
51975214static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
51985215 if (device->sync_staging == nullptr || device->sync_staging->size < size) {
51995216 VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
@@ -5389,6 +5406,10 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
53895406 memcpy(cpy.dst, cpy.src, cpy.n);
53905407 }
53915408
5409+ for (auto& mset : subctx->memsets) {
5410+ memset(mset.dst, mset.val, mset.n);
5411+ }
5412+
53925413 ggml_vk_submit(subctx, dst->device->fence);
53935414 VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
53945415 dst->device->device.resetFences({ dst->device->fence });
@@ -5528,6 +5549,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
55285549static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
55295550 VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
55305551
5552+ if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
5553+ dst->device->uma) {
5554+ deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
5555+ return;
5556+ }
5557+
5558+ // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
55315559 ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
55325560}
55335561
@@ -11174,6 +11202,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1117411202 memcpy(cpy.dst, cpy.src, cpy.n);
1117511203 }
1117611204
11205+ for (auto& mset : subctx->memsets) {
11206+ memset(mset.dst, mset.val, mset.n);
11207+ }
11208+
1117711209 if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
1117811210 ggml_vk_submit(subctx, ctx->almost_ready_fence);
1117911211 ctx->almost_ready_fence_pending = true;
@@ -11196,6 +11228,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1119611228 }
1119711229 subctx->in_memcpys.clear();
1119811230 subctx->out_memcpys.clear();
11231+ subctx->memsets.clear();
1119911232 }
1120011233
1120111234 return true;
0 commit comments