From 2caca8fc7aad9ea9a6ea3ed26ed146b1e5f06fab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 6 Jan 2025 09:14:37 +0100
Subject: [PATCH 1/7] block: use page_to_phys in bvec_phys

Use page_to_phys instead of open coding it now that it is available in
an architecture independent way.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20250106081437.798213-1-hch@lst.de
Signed-off-by: Jens Axboe
---
 include/linux/bvec.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index f41c7f0ef91ed5..ba8f52d48b9445 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -286,12 +286,7 @@ static inline void *bvec_virt(struct bio_vec *bvec)
  */
 static inline phys_addr_t bvec_phys(const struct bio_vec *bvec)
 {
-	/*
-	 * Note this open codes page_to_phys because page_to_phys is defined in
-	 * <asm/io.h>, which we don't want to pull in here. If it ever moves to
-	 * a sensible place we should start using it.
-	 */
-	return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset;
+	return page_to_phys(bvec->bv_page) + bvec->bv_offset;
 }
 
 #endif /* __LINUX_BVEC_H */

From b7175e24d6acf79d9f3af9ce9d3d50de1fa748ec Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 6 Jan 2025 09:15:29 +0100
Subject: [PATCH 2/7] block: add a dma mapping iterator

blk_rq_map_sg is a maze of nested loops. Untangle it by creating an
iterator that returns [paddr,len] tuples for DMA mapping, and then
implement the DMA logic on top of this. This not only removes code at
the source level, but also generates nicer binary code:

$ size block/blk-merge.o.*
   text    data     bss     dec     hex filename
  10001     432       0   10433    28c1 block/blk-merge.o.new
  10317     468       0   10785    2a21 block/blk-merge.o.old

Last but not least it will be used as a building block for a new DMA
mapping helper that doesn't rely on struct scatterlist.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20250106081609.798289-1-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 177 ++++++++++++++++++----------------------------
 1 file changed, 70 insertions(+), 107 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index e01383c6e534b0..15cd231d560cb4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -473,137 +473,100 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
 	return nr_phys_segs;
 }
 
-static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
-		struct scatterlist *sglist)
-{
-	if (!*sg)
-		return sglist;
+struct phys_vec {
+	phys_addr_t	paddr;
+	u32		len;
+};
 
-	/*
-	 * If the driver previously mapped a shorter list, we could see a
-	 * termination bit prematurely unless it fully inits the sg table
-	 * on each mapping. We KNOW that there must be more entries here
-	 * or the driver would be buggy, so force clear the termination bit
-	 * to avoid doing a full sg_init_table() in drivers for each command.
- */ - sg_unmark_end(*sg); - return sg_next(*sg); -} - -static unsigned blk_bvec_map_sg(struct request_queue *q, - struct bio_vec *bvec, struct scatterlist *sglist, - struct scatterlist **sg) +static bool blk_map_iter_next(struct request *req, + struct req_iterator *iter, struct phys_vec *vec) { - unsigned nbytes = bvec->bv_len; - unsigned nsegs = 0, total = 0; - - while (nbytes > 0) { - unsigned offset = bvec->bv_offset + total; - unsigned len = get_max_segment_size(&q->limits, - bvec_phys(bvec) + total, nbytes); - struct page *page = bvec->bv_page; - - /* - * Unfortunately a fair number of drivers barf on scatterlists - * that have an offset larger than PAGE_SIZE, despite other - * subsystems dealing with that invariant just fine. For now - * stick to the legacy format where we never present those from - * the block layer, but the code below should be removed once - * these offenders (mostly MMC/SD drivers) are fixed. - */ - page += (offset >> PAGE_SHIFT); - offset &= ~PAGE_MASK; - - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, page, len, offset); + unsigned int max_size; + struct bio_vec bv; - total += len; - nbytes -= len; - nsegs++; + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; } - return nsegs; -} - -static inline int __blk_bvec_map_sg(struct bio_vec bv, - struct scatterlist *sglist, struct scatterlist **sg) -{ - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); - return 1; -} - -/* only try to merge bvecs into one sg if they are from two bios */ -static inline bool -__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, - struct bio_vec *bvprv, struct scatterlist **sg) -{ - - int nbytes = bvec->bv_len; - - if (!*sg) + if (!iter->iter.bi_size) return false; - if ((*sg)->length + nbytes > queue_max_segment_size(q)) - return false; + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); - if (!biovec_phys_mergeable(q, bvprv, bvec)) - return false; + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. 
+ */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } - (*sg)->length += nbytes; + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + vec->len = bv.bv_len; return true; } -static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist, - struct scatterlist **sg) +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) { - struct bio_vec bvec, bvprv = { NULL }; - struct bvec_iter iter; - int nsegs = 0; - bool new_bio = false; - - for_each_bio(bio) { - bio_for_each_bvec(bvec, bio, iter) { - /* - * Only try to merge bvecs from two bios given we - * have done bio internal merge when adding pages - * to bio - */ - if (new_bio && - __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg)) - goto next_bvec; - - if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) - nsegs += __blk_bvec_map_sg(bvec, sglist, sg); - else - nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg); - next_bvec: - new_bio = false; - } - if (likely(bio->bi_iter.bi_size)) { - bvprv = bvec; - new_bio = true; - } - } + if (!*sg) + return sglist; - return nsegs; + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); } /* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. */ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { + struct req_iterator iter = { + .bio = rq->bio, + .iter = rq->bio->bi_iter, + }; + struct phys_vec vec; int nsegs = 0; - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } if (*last_sg) sg_mark_end(*last_sg); From 6783811569aef24b949992bd5c4e6eaac02a0c30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:08 +0100 Subject: [PATCH 3/7] block: better split mq vs non-mq code in add_disk_fwnode Add a big conditional for blk-mq vs not mq at the beginning of add_disk_fwnode so that elevator_init_mq is only called for blk-mq disks, and add checks that the right methods or set or not set based on the queue type. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106083531.799976-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5da3c9a64e6432..befb7a516bcf2c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -400,21 +400,23 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, struct device *ddev = disk_to_dev(disk); int ret; - /* Only makes sense for bio-based to set ->poll_bio */ - if (queue_is_mq(disk->queue) && disk->fops->poll_bio) - return -EINVAL; - - /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. - */ - elevator_init_mq(disk->queue); + if (queue_is_mq(disk->queue)) { + /* + * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. + */ + if (disk->fops->submit_bio || disk->fops->poll_bio) + return -EINVAL; - /* Mark bdev as having a submit_bio, if needed */ - if (disk->fops->submit_bio) + /* + * Initialize the I/O scheduler code and pick a default one if + * needed. + */ + elevator_init_mq(disk->queue); + } else { + if (!disk->fops->submit_bio) + return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); + } /* * If the driver provides an explicit major number it also must provide From 68ed45122249083bf45593ed635474282583352c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:09 +0100 Subject: [PATCH 4/7] block: remove blk_mq_init_bitmaps The little work done in blk_mq_init_bitmaps is easier done in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250106083531.799976-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 38 ++++++++++++-------------------------- block/blk-mq.h | 3 --- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cafcf11ee8bee..ab4a66791a2063 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -544,30 +544,12 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, - unsigned int queue_depth, unsigned int reserved, - int node, int alloc_policy) -{ - unsigned int depth = queue_depth - reserved; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - - if (bt_alloc(bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(breserved_tags, reserved, round_robin, node)) - goto free_bitmap_tags; - - return 0; - -free_bitmap_tags: - sbitmap_queue_free(bitmap_tags); - return -ENOMEM; -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) { + unsigned int depth = total_tags - reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -582,14 +564,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto out_free_tags; + if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) + goto out_free_bitmap_tags; - if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, - total_tags, 
reserved_tags, node,
-			alloc_policy) < 0) {
-		kfree(tags);
-		return NULL;
-	}
 	return tags;
+
+out_free_bitmap_tags:
+	sbitmap_queue_free(&tags->bitmap_tags);
+out_free_tags:
+	kfree(tags);
+	return NULL;
 }
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 89a20fffa4b1ce..3bb9ea80f9b6b6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -165,9 +165,6 @@ struct blk_mq_alloc_data {
 struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
 		unsigned int reserved_tags, int node, int alloc_policy);
 void blk_mq_free_tags(struct blk_mq_tags *tags);
-int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
-		struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
-		unsigned int reserved, int node, int alloc_policy);
 
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
 unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,

From e7602bb4f3a1234df8b75728ac3260bcb8242612 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 6 Jan 2025 09:35:10 +0100
Subject: [PATCH 5/7] block: remove BLK_MQ_F_NO_SCHED

The only queues that really can't support a scheduler are those that do
not have a gendisk associated with them, and thus can't be used for
non-passthrough commands. In addition to those, null_blk can optionally
set the flag, which is a bit odd. Replace the null_blk usage with
BLK_MQ_F_NO_SCHED_BY_DEFAULT to keep the expected semantics, and then
remove BLK_MQ_F_NO_SCHED as the non-disk queues never call into
elevator_init_mq or blk_register_queue, which add the sysfs attributes.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20250106083531.799976-4-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-mq-debugfs.c        |  1 -
 block/bsg-lib.c               |  2 +-
 block/elevator.c              | 20 --------------------
 drivers/block/null_blk/main.c |  4 ++--
 drivers/nvme/host/apple.c     |  1 -
 drivers/nvme/host/core.c      |  1 -
 drivers/ufs/core/ufshcd.c     |  1 -
 include/linux/blk-mq.h        |  2 --
 8 files changed, 3 insertions(+), 29 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4b6b20ccdb5399..64b3c333aa476f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -185,7 +185,6 @@ static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(STACKING),
 	HCTX_FLAG_NAME(TAG_HCTX_SHARED),
 	HCTX_FLAG_NAME(BLOCKING),
-	HCTX_FLAG_NAME(NO_SCHED),
 	HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
 };
 #undef HCTX_FLAG_NAME
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 32da4a4429ce55..93523d8f819509 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -381,7 +381,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	set->queue_depth = 128;
 	set->numa_node = NUMA_NO_NODE;
 	set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
-	set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
+	set->flags = BLK_MQ_F_BLOCKING;
 	if (blk_mq_alloc_tag_set(set))
 		goto out_tag_set;
 
diff --git a/block/elevator.c b/block/elevator.c
index be6e994256ac8a..b81216c48b6bcb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -547,14 +547,6 @@ void elv_unregister(struct elevator_type *e)
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
-static inline bool elv_support_iosched(struct request_queue *q)
-{
-	if (!queue_is_mq(q) ||
-	    (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
-		return false;
-	return true;
-}
-
 /*
  * For single queue devices, default to using mq-deadline. If we have multiple
  * queues or mq-deadline is not available, default to "none".
  */
@@ -580,9 +572,6 @@ void elevator_init_mq(struct request_queue *q) struct elevator_type *e; int err; - if (!elv_support_iosched(q)) - return; - WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) @@ -714,9 +703,6 @@ void elv_iosched_load_module(struct gendisk *disk, const char *buf, struct elevator_type *found; const char *name; - if (!elv_support_iosched(disk->queue)) - return; - strscpy(elevator_name, buf, sizeof(elevator_name)); name = strstrip(elevator_name); @@ -734,9 +720,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, char elevator_name[ELV_NAME_MAX]; int ret; - if (!elv_support_iosched(disk->queue)) - return count; - strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) @@ -751,9 +734,6 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; - if (!elv_support_iosched(q)) - return sprintf(name, "none\n"); - if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 178e62cd9a9f28..d94ef37480bd45 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1792,7 +1792,7 @@ static int null_init_global_tag_set(void) tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; if (g_no_sched) - tag_set.flags |= BLK_MQ_F_NO_SCHED; + tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (g_shared_tag_bitmap) tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (g_blocking) @@ -1817,7 +1817,7 @@ static int null_setup_tagset(struct nullb *nullb) nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; if (nullb->dev->no_sched) - nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; + nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (nullb->dev->shared_tag_bitmap) nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (nullb->dev->blocking) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 83c60468542c88..1de11b722f049a 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1251,7 +1251,6 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv) anv->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; anv->admin_tagset.numa_node = NUMA_NO_NODE; anv->admin_tagset.cmd_size = sizeof(struct apple_nvme_iod); - anv->admin_tagset.flags = BLK_MQ_F_NO_SCHED; anv->admin_tagset.driver_data = &anv->adminq; ret = blk_mq_alloc_tag_set(&anv->admin_tagset); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 42283d268500d6..c2250ddef5a299 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4564,7 +4564,6 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, /* Reserved for fabric connect and keep alive */ set->reserved_tags = 2; set->numa_node = ctrl->numa_node; - set->flags = BLK_MQ_F_NO_SCHED; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8a01e4393159d5..fd53c9f402c342 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10412,7 +10412,6 @@ static int ufshcd_add_scsi_host(struct ufs_hba *hba) .nr_hw_queues = 1, .queue_depth = hba->nutmrs, .ops = &ufshcd_tmf_ops, - .flags = BLK_MQ_F_NO_SCHED, }; err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 
6340293511c999..f2ff0ffa05351d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -676,8 +676,6 @@ enum {
 	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 4,
-	/* Do not allow an I/O scheduler to be configured. */
-	BLK_MQ_F_NO_SCHED	= 1 << 5,
 
 	/*
 	 * Select 'none' during queue registration in case of a single hwq

From ce32496ec1abe866225f2e2005ceda68cf4c7bf4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 6 Jan 2025 09:35:11 +0100
Subject: [PATCH 6/7] block: simplify tag allocation policy selection

Use a plain BLK_MQ_F_* flag to select round-robin tag allocation
instead of overlaying an enum with just two possible values into the
flags space. Doing so allows adding a BLK_MQ_F_MAX sentinel for
simplified overflow checking in the messy debugfs helpers.

Signed-off-by: Christoph Hellwig
Reviewed-by: John Garry
Link: https://lore.kernel.org/r/20250106083531.799976-5-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-mq-debugfs.c                 | 25 ++++---------------------
 block/blk-mq-tag.c                     |  5 ++---
 block/blk-mq.c                         |  3 +--
 block/blk-mq.h                         |  2 +-
 drivers/ata/ahci.h                     |  2 +-
 drivers/ata/pata_macio.c               |  2 +-
 drivers/ata/sata_mv.c                  |  2 +-
 drivers/ata/sata_nv.c                  |  4 ++--
 drivers/ata/sata_sil24.c               |  1 -
 drivers/scsi/hisi_sas/hisi_sas_v3_hw.c |  2 +-
 drivers/scsi/scsi_lib.c                |  4 ++--
 include/linux/blk-mq.h                 | 22 +++++++---------------
 include/linux/libata.h                 |  4 ++--
 include/scsi/scsi_host.h               |  6 ++++--
 14 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 64b3c333aa476f..adf5f0697b6b04 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -172,19 +172,13 @@ static int hctx_state_show(void *data, struct seq_file *m)
 	return 0;
 }
 
-#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name
-static const char *const alloc_policy_name[] = {
-	BLK_TAG_ALLOC_NAME(FIFO),
-	BLK_TAG_ALLOC_NAME(RR),
-};
-#undef BLK_TAG_ALLOC_NAME
-
 #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
 static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
 	HCTX_FLAG_NAME(STACKING),
 	HCTX_FLAG_NAME(TAG_HCTX_SHARED),
 	HCTX_FLAG_NAME(BLOCKING),
+	HCTX_FLAG_NAME(TAG_RR),
 	HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
 };
 #undef HCTX_FLAG_NAME
@@ -192,22 +186,11 @@ static const char *const hctx_flag_name[] = {
 static int hctx_flags_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
-	const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
 
-	BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
-			BLK_MQ_F_ALLOC_POLICY_START_BIT);
-	BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
+	BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX));
 
-	seq_puts(m, "alloc_policy=");
-	if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
-	    alloc_policy_name[alloc_policy])
-		seq_puts(m, alloc_policy_name[alloc_policy]);
-	else
-		seq_printf(m, "%d", alloc_policy);
-	seq_puts(m, " ");
-	blk_flags_show(m,
-		       hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
-		       hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+	blk_flags_show(m, hctx->flags, hctx_flag_name,
+		       ARRAY_SIZE(hctx_flag_name));
 	seq_puts(m, "\n");
 	return 0;
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ab4a66791a2063..b9f417d980b46d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -545,11 +545,10 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
 }
 
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
-				     unsigned int reserved_tags,
-				     int node, int
alloc_policy) + unsigned int reserved_tags, unsigned int flags, int node) { unsigned int depth = total_tags - reserved_tags; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 17f10683d640f7..2e6132f778fd95 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3476,8 +3476,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; diff --git a/block/blk-mq.h b/block/blk-mq.h index 3bb9ea80f9b6b6..c872bbbe641185 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -163,7 +163,7 @@ struct blk_mq_alloc_data { }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, - unsigned int reserved_tags, int node, int alloc_policy); + unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 8f40f75ba08cff..06781bdde0d287 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -396,7 +396,7 @@ extern const struct attribute_group *ahci_sdev_groups[]; .shost_groups = ahci_shost_groups, \ .sdev_groups = ahci_sdev_groups, \ .change_queue_depth = ata_scsi_change_queue_depth, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure extern struct ata_port_operations ahci_ops; diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index f2f36e55a1f4d2..4b01bb6880b031 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -935,7 +935,7 @@ static const struct scsi_host_template pata_macio_sht = { .device_configure = pata_macio_device_configure, .sdev_groups = ata_common_sdev_groups, .can_queue = ATA_DEF_QUEUE, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; static struct ata_port_operations pata_macio_ops = { diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index b8f363370e1aa4..21c72650f9ccdf 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -672,7 +672,7 @@ static const struct scsi_host_template mv6_sht = { .dma_boundary = MV_DMA_BOUNDARY, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, .device_configure = ata_scsi_device_configure }; diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 36d99043ef50ff..823cce5ea1e901 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -385,7 +385,7 @@ static const struct scsi_host_template nv_adma_sht = { .device_configure = nv_adma_device_configure, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; static const struct scsi_host_template nv_swncq_sht = { @@ -396,7 +396,7 @@ static const struct scsi_host_template nv_swncq_sht = { .device_configure = nv_swncq_device_configure, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; /* diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c 
index 72c03cbdaff43b..935b13e79decc4 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -378,7 +378,6 @@ static const struct scsi_host_template sil24_sht = {
 	.can_queue		= SIL24_MAX_CMDS,
 	.sg_tablesize		= SIL24_MAX_SGE,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
-	.tag_alloc_policy	= BLK_TAG_ALLOC_FIFO,
 	.sdev_groups		= ata_ncq_sdev_groups,
 	.change_queue_depth	= ata_scsi_change_queue_depth,
 	.device_configure	= ata_scsi_device_configure
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index 79129c9777048c..35501d0aa65559 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -3345,7 +3345,7 @@ static const struct scsi_host_template sht_v3_hw = {
 	.slave_alloc = hisi_sas_slave_alloc,
 	.shost_groups = host_v3_hw_groups,
 	.sdev_groups = sdev_groups_v3_hw,
-	.tag_alloc_policy = BLK_TAG_ALLOC_RR,
+	.tag_alloc_policy_rr = true,
 	.host_reset = hisi_sas_host_reset,
 	.host_tagset = 1,
 	.mq_poll = queue_complete_v3_hw,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5cf124e130974c..51c496ca93806e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2065,8 +2065,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	tag_set->queue_depth = shost->can_queue;
 	tag_set->cmd_size = cmd_size;
 	tag_set->numa_node = dev_to_node(shost->dma_dev);
-	tag_set->flags |=
-		BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
+	if (shost->hostt->tag_alloc_policy_rr)
+		tag_set->flags |= BLK_MQ_F_TAG_RR;
 	if (shost->queuecommand_may_block)
 		tag_set->flags |= BLK_MQ_F_BLOCKING;
 	tag_set->driver_data = shost;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f2ff0ffa05351d..a0a9007cc1e36f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -296,13 +296,6 @@ enum blk_eh_timer_return {
 	BLK_EH_RESET_TIMER,
 };
 
-/* Keep alloc_policy_name[] in sync with the definitions below */
-enum {
-	BLK_TAG_ALLOC_FIFO,	/* allocate starting from 0 */
-	BLK_TAG_ALLOC_RR,	/* allocate starting from last allocated tag */
-	BLK_TAG_ALLOC_MAX
-};
-
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
  * block device
@@ -677,20 +670,19 @@ enum {
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 4,
 
+	/*
+	 * Alloc tags on a round-robin basis instead of the first available one.
+	 */
+	BLK_MQ_F_TAG_RR		= 1 << 5,
+
 	/*
 	 * Select 'none' during queue registration in case of a single hwq
 	 * or shared hwqs instead of 'mq-deadline'.
 	 */
 	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 6,
-	BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
-	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
+
+	BLK_MQ_F_MAX = 1 << 7,
 };
 
-#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
-	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
-		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
-#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
-	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
-		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
 #define BLK_MQ_MAX_DEPTH	(10240)
 #define BLK_MQ_NO_HCTX_IDX	(-1U)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c1a85d46eba6d5..be5183d75736dc 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1467,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 #define ATA_SUBBASE_SHT(drv_name)			\
 	__ATA_BASE_SHT(drv_name),			\
 	.can_queue		= ATA_DEF_QUEUE,	\
-	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,	\
+	.tag_alloc_policy_rr	= true,			\
 	.device_configure	= ata_scsi_device_configure
 
 #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd)		\
 	__ATA_BASE_SHT(drv_name),			\
 	.can_queue		= drv_qd,		\
-	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,	\
+	.tag_alloc_policy_rr	= true,			\
 	.device_configure	= ata_scsi_device_configure
 
 #define ATA_BASE_SHT(drv_name)				\
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 2b4ab0369ffbe9..02823d6af37de6 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -438,8 +438,10 @@ struct scsi_host_template {
 	 */
 	short cmd_per_lun;
 
-	/* If use block layer to manage tags, this is tag allocation policy */
-	int tag_alloc_policy;
+	/*
+	 * Allocate tags starting from the last allocated tag.
+	 */
+	bool tag_alloc_policy_rr : 1;
 
 	/*
 	 * Track QUEUE_FULL events and reduce queue depth on demand.

From 844b8cdc681612ff24df62cdefddeab5772fadf1 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 3 Jan 2025 17:28:59 +0800
Subject: [PATCH 7/7] nbd: don't allow reconnect after disconnect

The following process can cause an nbd_config UAF:

1) grab nbd_config temporarily;

2) nbd_genl_disconnect() flushes all recv_work() and releases the
   initial reference:

   nbd_genl_disconnect
    nbd_disconnect_and_put
     nbd_disconnect
      flush_workqueue(nbd->recv_workq)
     if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, ...))
      nbd_config_put
      -> due to step 1), reference is still not zero

3) nbd_genl_reconfigure() queues recv_work() again;

   nbd_genl_reconfigure
    config = nbd_get_config_unlocked(nbd)
    if (!config)
     -> succeed
    if (!test_bit(NBD_RT_BOUND, ...))
     -> succeed
    nbd_reconnect_socket
     queue_work(nbd->recv_workq, &args->work)

4) step 1) releases the reference;

5) Finally, recv_work() will trigger the UAF:

   recv_work
    nbd_config_put(nbd)
    -> nbd_config is freed
    atomic_dec(&config->recv_threads)
    -> UAF

Fix the problem by clearing NBD_RT_BOUND in nbd_genl_disconnect(), so
that nbd_genl_reconfigure() will fail.
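For reference, the fix works by arming the existing NBD_RT_BOUND check
near the top of nbd_genl_reconfigure(), which looks roughly like this
(a simplified sketch of drivers/block/nbd.c; locking context and other
error paths trimmed):

    mutex_lock(&nbd->config_lock);
    config = nbd->config;
    if (!test_bit(NBD_RT_BOUND, &config->runtime_flags)) {
            /* device was never (or is no longer) bound: bail out */
            dev_err(nbd_to_dev(nbd),
                    "not configured, cannot reconfigure\n");
            mutex_unlock(&nbd->config_lock);
            return -EINVAL;
    }

Once nbd_disconnect_and_put() clears NBD_RT_BOUND under
nbd->config_lock, a racing reconfigure takes the error branch above
instead of requeuing recv_work() on a config that is about to be freed.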
Fixes: b7aa3d39385d ("nbd: add a reconfigure netlink command") Reported-by: syzbot+6b0df248918b92c33e6a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/675bfb65.050a0220.1a2d0d.0006.GAE@google.com/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250103092859.3574648-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b1a5af69a66d78..259bd57fc52935 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2179,6 +2179,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) flush_workqueue(nbd->recv_workq); nbd_clear_que(nbd); nbd->task_setup = NULL; + clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags); mutex_unlock(&nbd->config_lock); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,