
Commit 5a409b4

XiaoNi87 authored and shligit committed
MD: fix lock contention for flush bios
There is lock contention when many processes send flush bios to an md device, e.g. when creating many LVs on one RAID device and running mkfs.xfs on each LV. Flush requests are currently handled sequentially: each one must wait for mddev->flush_bio to become NULL, and otherwise blocks on mddev->lock. This patch removes mddev->flush_bio and handles flush bios asynchronously.

I did a test with the command dbench -s 128 -t 300. This is the test result:

=================Without the patch============================
Operation                Count    AvgLat    MaxLat
--------------------------------------------------
Flush                    11165   167.595  5879.560
Close                   107469     1.391  2231.094
LockX                      384     0.003     0.019
Rename                    5944     2.141  1856.001
ReadX                   208121     0.003     0.074
WriteX                   98259  1925.402 15204.895
Unlink                   25198    13.264  3457.268
UnlockX                    384     0.001     0.009
FIND_FIRST               47111     0.012     0.076
SET_FILE_INFORMATION     12966     0.007     0.065
QUERY_FILE_INFORMATION   27921     0.004     0.085
QUERY_PATH_INFORMATION  124650     0.005     5.766
QUERY_FS_INFORMATION     22519     0.003     0.053
NTCreateX               141086     4.291  2502.812

Throughput 3.7181 MB/sec (sync open)  128 clients  128 procs  max_latency=15204.905 ms

=================With the patch============================
Operation                Count    AvgLat    MaxLat
--------------------------------------------------
Flush                     4500   174.134   406.398
Close                    48195     0.060   467.062
LockX                      256     0.003     0.029
Rename                    2324     0.026     0.360
ReadX                    78846     0.004     0.504
WriteX                   66832   562.775  1467.037
Unlink                    5516     3.665  1141.740
UnlockX                    256     0.002     0.019
FIND_FIRST               16428     0.015     0.313
SET_FILE_INFORMATION      6400     0.009     0.520
QUERY_FILE_INFORMATION   17865     0.003     0.089
QUERY_PATH_INFORMATION   47060     0.078   416.299
QUERY_FS_INFORMATION      7024     0.004     0.032
NTCreateX                55921     0.854  1141.452

Throughput 11.744 MB/sec (sync open)  128 clients  128 procs  max_latency=1467.041 ms

The test was done on a RAID1 device backed by two rotational disks.

V5: V4 is more complicated than the version with a memory pool, so revert to the memory pool version.

V4: Use the address of fbio as a hash to choose a free flush info.

V3: Shaohua suggested mempool is overkill. In v3 the memory is allocated when the RAID device is created, and a simple bitmap records which resources are free. Also fix a bug from v2: flush_pending must be set to 1 first.

V2: Neil pointed out two problems: a counting error, and the wrong return value when memory allocation fails.
1. Counting error: it is only safe to call rdev_dec_pending() on rdevs for which atomic_inc(&rdev->nr_pending) was previously called. If an rdev is added to the list between the start and end of the flush, something bad happens. bio_chain is no longer used; instead, each flush bio gets its own callback function.
2. Returning an I/O error when kmalloc fails is wrong; use the mempool suggested by Neil.
3. Fixed some places pointed out by Guoqing.

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
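The core of the change is a completion-counting pattern: each flush request gets its own flush_info whose flush_pending counter starts at 1 (the submitter's own reference), is incremented once per member device before each per-device flush bio is submitted, and the request completes in whichever context drops the counter to zero, whether an I/O callback or the submitter itself. A minimal userspace sketch of that pattern, in plain C with stdatomic rather than kernel atomics (all names here are illustrative, not from the patch):

#include <stdatomic.h>
#include <stdio.h>

struct flush_ctx {
	atomic_int pending;	/* starts at 1: the submitter's own reference */
	int ndisks;
};

/* Runs exactly once, in whichever context drops pending to zero. */
static void finish_request(struct flush_ctx *ctx)
{
	printf("all %d per-disk flushes done; completing request\n",
	       ctx->ndisks);
}

/* The role md_end_flush() plays: called once per per-disk completion. */
static void end_one_flush(struct flush_ctx *ctx)
{
	if (atomic_fetch_sub(&ctx->pending, 1) == 1)
		finish_request(ctx);
}

int main(void)
{
	struct flush_ctx ctx = { .ndisks = 3 };

	atomic_init(&ctx.pending, 1);

	/* Fan out: take one reference per disk before submitting. */
	for (int i = 0; i < ctx.ndisks; i++)
		atomic_fetch_add(&ctx.pending, 1);

	/* In the kernel these completions arrive asynchronously. */
	for (int i = 0; i < ctx.ndisks; i++)
		end_one_flush(&ctx);

	/* Drop the submitter's reference; it may be the last one. */
	end_one_flush(&ctx);
	return 0;
}

Starting the counter at 1 is what makes the fan-out safe: the request cannot complete while the submitter is still iterating over devices, which is exactly what the V3 note ("set flush_pending to 1 at first") refers to.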
1 parent 448ec63 commit 5a409b4

File tree

drivers/md/md.c
drivers/md/md.h

2 files changed: +120 −60 lines

drivers/md/md.c

Lines changed: 105 additions & 53 deletions
@@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
 			mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+	kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+	return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+	kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+	struct mddev *mddev = fi->mddev;
+	struct bio *bio = fi->bio;
+
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	md_handle_request(mddev, bio);
+
+	mempool_free(fi, mddev->flush_pool);
+}
 
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
 {
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
+	struct flush_bio *fb = fbio->bi_private;
+	struct md_rdev *rdev = fb->rdev;
+	struct flush_info *fi = fb->fi;
+	struct bio *bio = fi->bio;
+	struct mddev *mddev = fi->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
-	bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+	mempool_free(fb, mddev->flush_bio_pool);
+	bio_put(fbio);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct flush_info *fi;
+
+	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+	fi->bio = bio;
+	fi->mddev = mddev;
+	atomic_set(&fi->flush_pending, 1);
 
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
+			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
+
+			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+			fb->fi = fi;
+			fb->rdev = rdev;
+
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
 			bio_set_dev(bi, rdev->bdev);
+			bi->bi_end_io = md_end_flush;
+			bi->bi_private = fb;
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
+
+			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
+
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
 
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	mddev->flush_bio = NULL;
-	wake_up(&mddev->sb_wait);
-
-	if (bio->bi_iter.bi_size == 0)
-		/* an empty barrier - all done */
-		bio_endio(bio);
-	else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		if (bio->bi_iter.bi_size == 0)
+			/* an empty barrier - all done */
+			bio_endio(bio);
+		else {
+			INIT_WORK(&fi->flush_work, submit_flushes);
+			queue_work(md_wq, &fi->flush_work);
+		}
 	}
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio,
-			    mddev->lock);
-	mddev->flush_bio = bio;
-	spin_unlock_irq(&mddev->lock);
-
-	INIT_WORK(&mddev->flush_work, submit_flushes);
-	queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -5510,6 +5530,22 @@ int md_run(struct mddev *mddev)
 			goto abort;
 		}
 	}
+	if (mddev->flush_pool == NULL) {
+		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+						flush_info_free, mddev);
+		if (!mddev->flush_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
+	if (mddev->flush_bio_pool == NULL) {
+		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+						flush_bio_free, mddev);
+		if (!mddev->flush_bio_pool) {
+			err = -ENOMEM;
+			goto abort;
+		}
+	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5669,6 +5705,14 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -5889,6 +5933,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
+	if (mddev->flush_bio_pool) {
+		mempool_destroy(mddev->flush_bio_pool);
+		mddev->flush_bio_pool = NULL;
+	}
+	if (mddev->flush_pool) {
+		mempool_destroy(mddev->flush_pool);
+		mddev->flush_pool = NULL;
+	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
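A side effect of the mempool approach worth noting (an inference from the code above, not a claim in the commit message): mempool_alloc() with GFP_NOIO sleeps until an element is returned to the pool instead of failing, so md_flush_request() can always make progress under memory pressure, and the number of flushes in flight is naturally bounded by the pool size (NR_FLUSH_INFOS, i.e. 8). A rough userspace analogue of that blocking, fixed-reserve behavior (a pthreads sketch, not the kernel mempool API; the real mempool also tries a normal allocation before falling back to its reserve):

#include <pthread.h>
#include <stdlib.h>

#define POOL_SIZE 8	/* mirrors NR_FLUSH_INFOS */

struct pool {
	pthread_mutex_t lock;
	pthread_cond_t nonempty;
	void *slot[POOL_SIZE];
	int nfree;
};

static void pool_init(struct pool *p, size_t objsize)
{
	pthread_mutex_init(&p->lock, NULL);
	pthread_cond_init(&p->nonempty, NULL);
	for (p->nfree = 0; p->nfree < POOL_SIZE; p->nfree++)
		p->slot[p->nfree] = calloc(1, objsize);
}

/* Like mempool_alloc(pool, GFP_NOIO): blocks rather than returning NULL. */
static void *pool_alloc(struct pool *p)
{
	void *obj;

	pthread_mutex_lock(&p->lock);
	while (p->nfree == 0)
		pthread_cond_wait(&p->nonempty, &p->lock);
	obj = p->slot[--p->nfree];
	pthread_mutex_unlock(&p->lock);
	return obj;
}

/* Like mempool_free(): may wake a waiter blocked in pool_alloc(). */
static void pool_free(struct pool *p, void *obj)
{
	pthread_mutex_lock(&p->lock);
	p->slot[p->nfree++] = obj;
	pthread_cond_signal(&p->nonempty);
	pthread_mutex_unlock(&p->lock);
}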

drivers/md/md.h

Lines changed: 15 additions & 7 deletions
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+	struct bio			*bio;
+	struct mddev			*mddev;
+	struct work_struct		flush_work;
+	atomic_t			flush_pending;
+};
+struct flush_bio {
+	struct flush_info *fi;
+	struct md_rdev *rdev;
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -457,13 +470,8 @@ struct mddev {
 					 * metadata and bitmap writes
 					 */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	struct work_struct flush_work;
+	mempool_t			*flush_pool;
+	mempool_t			*flush_bio_pool;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
