From c6ea70604249bc357ce09e9f8e16c29df0fb2fa2 Mon Sep 17 00:00:00 2001 From: "dougmill@linux.vnet.ibm.com" Date: Tue, 16 Aug 2022 15:07:13 +0100 Subject: block: sed-opal: Add ioctl to return device status Provide a mechanism to retrieve basic status information about the device, including the "supported" flag indicating whether SED-OPAL is supported. The information returned is from the various feature descriptors received during the discovery0 step, and so this ioctl does nothing more than perform the discovery0 step and then save the information received. See "struct opal_status" and OPAL_FL_* bits for the status information currently returned. This is necessary to be able to check whether a device is OPAL enabled, set up, locked or unlocked from userspace programs like systemd-cryptsetup and libcryptsetup. Right now we just have to assume the user 'knows' or blindly attempt setup/lock/unlock operations. Signed-off-by: Douglas Miller Tested-by: Luca Boccassi Reviewed-by: Scott Bauer Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816140713.84893-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 5 +++ block/sed-opal.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index b486b3ec7dc4..7152aa1f1a49 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -39,7 +39,12 @@ enum opal_response_token { #define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 +/* FC_LOCKING features */ +#define LOCKING_SUPPORTED_MASK 0x01 +#define LOCKING_ENABLED_MASK 0x02 +#define LOCKED_MASK 0x04 #define MBR_ENABLED_MASK 0x10 +#define MBR_DONE_MASK 0x20 #define TINY_ATOM_DATA_MASK 0x3F #define TINY_ATOM_SIGNED 0x40 diff --git a/block/sed-opal.c b/block/sed-opal.c index 9700197000f2..2c5327a0543a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -74,8 +74,7 @@ struct parsed_resp { }; struct opal_dev { - bool supported; - bool mbr_enabled; + u32 flags; void *data; sec_send_recv *send_recv; @@ -280,6 +279,30 @@ static bool check_tper(const void *data) return true; } +static bool check_lcksuppt(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_SUPPORTED_MASK); +} + +static bool check_lckenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_ENABLED_MASK); +} + +static bool check_locked(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKED_MASK); +} + static bool check_mbrenabled(const void *data) { const struct d0_locking_features *lfeat = data; @@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data) return !!(sup_feat & MBR_ENABLED_MASK); } +static bool check_mbrdone(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_DONE_MASK); +} + static bool check_sum(const void *data) { const struct d0_single_user_mode *sum = data; @@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev) u32 hlen = be32_to_cpu(hdr->length); print_buffer(dev->resp, hlen); - dev->mbr_enabled = false; + dev->flags &= OPAL_FL_SUPPORTED; if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { pr_debug("Discovery length overflows buffer 
(%zu+%u)/%u\n", @@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev) check_geometry(dev, body); break; case FC_LOCKING: - dev->mbr_enabled = check_mbrenabled(body->features); + if (check_lcksuppt(body->features)) + dev->flags |= OPAL_FL_LOCKING_SUPPORTED; + if (check_lckenabled(body->features)) + dev->flags |= OPAL_FL_LOCKING_ENABLED; + if (check_locked(body->features)) + dev->flags |= OPAL_FL_LOCKED; + if (check_mbrenabled(body->features)) + dev->flags |= OPAL_FL_MBR_ENABLED; + if (check_mbrdone(body->features)) + dev->flags |= OPAL_FL_MBR_DONE; break; case FC_ENTERPRISE: case FC_DATASTORE: @@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev) mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = opal_discovery0_step(dev); - dev->supported = !ret; + if (!ret) + dev->flags |= OPAL_FL_SUPPORTED; mutex_unlock(&dev->dev_lock); return ret; @@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) INIT_LIST_HEAD(&dev->unlk_lst); mutex_init(&dev->dev_lock); + dev->flags = 0; dev->data = data; dev->send_recv = send_recv; if (check_opal_support(dev) != 0) { @@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return false; mutex_lock(&dev->dev_lock); @@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) was_failure = true; } - if (dev->mbr_enabled) { + if (dev->flags & OPAL_FL_MBR_ENABLED) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) pr_debug("Failed to set MBR Done in S3 resume\n"); @@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev, return ret; } +static int opal_get_status(struct opal_dev *dev, void __user *data) +{ + struct opal_status sts = {0}; + + /* + * check_opal_support() error is not fatal, + * !dev->supported is a valid condition + */ + if (!check_opal_support(dev)) + sts.flags = dev->flags; + if (copy_to_user(data, &sts, sizeof(sts))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + return 0; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return -EACCES; if (!dev) return -ENOTSUPP; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return -ENOTSUPP; - p = memdup_user(arg, _IOC_SIZE(cmd)); - if (IS_ERR(p)) - return PTR_ERR(p); + if (cmd & IOC_IN) { + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + } switch (cmd) { case IOC_OPAL_SAVE: @@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GENERIC_TABLE_RW: ret = opal_generic_read_write_table(dev, p); break; + case IOC_OPAL_GET_STATUS: + ret = opal_get_status(dev, arg); + break; default: break; } - kfree(p); + if (cmd & IOC_IN) + kfree(p); return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); -- cgit From a4e1d0b76e7b32c0839e72679c530445172a2564 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Aug 2022 10:00:43 -0700 Subject: block: Change the return type of blk_mq_map_queues() into void Since blk_mq_map_queues() and the .map_queues() callbacks always return 0, change their return type into void. Most callers ignore the returned value anyway. Cc: Christoph Hellwig Cc: Jason Wang Cc: Keith Busch Cc: Martin K. Petersen Cc: Doug Gilbert Cc: Michael S. 
Tsirkin Signed-off-by: Bart Van Assche Reviewed-by: John Garry Acked-by: Md Haris Iqbal Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220815170043.19489-3-bvanassche@acm.org [axboe: fold in fix from Bart] Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 4 +--- block/blk-mq-pci.c | 7 +++---- block/blk-mq-rdma.c | 6 +++--- block/blk-mq-virtio.c | 7 ++++--- block/blk-mq.c | 10 ++++------ 5 files changed, 15 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3db84d3197f1..9c2fce1a7b50 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -32,7 +32,7 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_map_queues(struct blk_mq_queue_map *qmap) +void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { unsigned int *map = qmap->mq_map; unsigned int nr_queues = qmap->nr_queues; @@ -70,8 +70,6 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap) map[cpu] = map[first_sibling]; } } - - return 0; } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index b595a94c4d16..a90b88fd1332 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -23,8 +23,8 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset) { const struct cpumask *mask; unsigned int queue, cpu; @@ -38,11 +38,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; fallback: WARN_ON_ONCE(qmap->nr_queues > 1); blk_mq_clear_mq_map(qmap); - return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 14f968e58b8f..29c1f4d6eb04 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c @@ -21,7 +21,7 @@ * @set->nr_hw_queues, or @dev does not provide an affinity mask for a * vector, we fallback to the naive mapping. */ -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec) { const struct cpumask *mask; @@ -36,9 +36,9 @@ int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, map->mq_map[cpu] = map->queue_offset + queue; } - return 0; + return; fallback: - return blk_mq_map_queues(map); + blk_mq_map_queues(map); } EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 7b8a42c35102..6589f076a096 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -21,7 +21,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. 
*/ -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -39,8 +39,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; + fallback: - return blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 3c1e6b6d991d..4b90d2d8cfb0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4190,7 +4190,7 @@ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) return 0; } -static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) +static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations @@ -4220,10 +4220,10 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); - return set->ops->map_queues(set); + set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } @@ -4322,9 +4322,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; } - ret = blk_mq_update_queue_map(set); - if (ret) - goto out_free_mq_map; + blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) -- cgit From c2090eabacd72910a5edf57cf42004097a13ddad Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 16 Aug 2022 09:56:29 +0800 Subject: block, bfq: remove unused functions While doing code coverage testing (with CONFIG_BFQ_CGROUP_DEBUG disabled), we found that some functions don't have any callers, so remove them.
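The pattern applied here is worth making explicit: when a helper is only ever called from code compiled under a config option, the empty stub for the !CONFIG build can be deleted and the prototype moved under the same #ifdef, so a stray caller becomes a build error instead of a silent no-op. A minimal sketch of the idea with hypothetical names (the real change, visible in the diff below, does this for several bfqg_stats_* helpers under CONFIG_BFQ_CGROUP_DEBUG):

/* before: every debug helper needs an empty stub for !CONFIG builds */
#ifdef CONFIG_FOO_DEBUG
void foo_stats_update(struct foo *f);
#else
static inline void foo_stats_update(struct foo *f) { }
#endif

/* after: a helper with no callers outside the debug code loses its
 * stub, and the guarded prototype makes any misuse fail at compile time
 */
#ifdef CONFIG_FOO_DEBUG
void foo_stats_update(struct foo *f);
#endif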
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220816015631.1323948-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 5 ----- block/bfq-iosched.h | 13 ++++++++----- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 30b15a9a47c4..144bca006463 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -254,17 +254,12 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, #else /* CONFIG_BFQ_CGROUP_DEBUG */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf) { } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ad8e513d7e87..f81ab3c8fa3c 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -993,20 +993,23 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - blk_opf_t opf); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -void bfqg_stats_update_idle_time(struct bfq_group *bfqg); void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_CGROUP_DEBUG +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + blk_opf_t opf); +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +void bfqg_stats_update_idle_time(struct bfq_group *bfqg); +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); +#endif + void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -- cgit From 1e3cc2125d7cc7d492b2e6e52d09c1e17ba573c3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 16 Aug 2022 09:56:30 +0800 Subject: block, bfq: remove useless checking in bfq_put_queue() 'bfqq->bfqd' is guaranteed to be set in bfq_init_queue(), and it never changes afterwards.
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220816015631.1323948-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c740b41fe0a4..f39067389b2b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5255,9 +5255,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) struct hlist_node *n; struct bfq_group *bfqg = bfqq_group(bfqq); - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", - bfqq, bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); bfqq->ref--; if (bfqq->ref) @@ -5321,7 +5319,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_del_init(&item->woken_list_node); } - if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) + if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; kmem_cache_free(bfq_pool, bfqq); -- cgit From d322f355e9368045ff89b7a3df9324fe0c33839b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 16 Aug 2022 09:56:31 +0800 Subject: block, bfq: remove useless parameter for bfq_add/del_bfqq_busy() 'bfqd' can be accessed through 'bfqq->bfqd', there is no need to pass it as a parameter separately. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220816015631.1323948-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- block/bfq-iosched.h | 5 ++--- block/bfq-wf2q.c | 9 ++++++--- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f39067389b2b..7ea427817f7f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->service_from_backlogged = 0; bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); + bfq_add_bfqq_busy(bfqq); /* * Expire in-service queue if preemption may be needed for @@ -2419,7 +2419,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); /* * bfqq emptied. 
In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -3098,7 +3098,7 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); bfq_reassign_last_bfqq(bfqq, NULL); @@ -3908,7 +3908,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, true); + bfq_del_bfqq_busy(bfqq, true); } else { bfq_requeue_bfqq(bfqd, bfqq, true); /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index f81ab3c8fa3c..64ee618064ba 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1080,9 +1080,8 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration); -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); +void bfq_add_bfqq_busy(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 983413cdefad..8fc3da4c23bb 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1651,9 +1651,10 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, * the service tree. As a special case, it can be invoked during an * expiration. */ -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "del from busy"); bfq_clear_bfqq_busy(bfqq); @@ -1674,8 +1675,10 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* * Called when an inactive queue receives a new request. */ -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +void bfq_add_bfqq_busy(struct bfq_queue *bfqq) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "add to busy"); bfq_activate_bfqq(bfqd, bfqq); -- cgit From f5d632d15e9e0a037339601680d82bb840f85d10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:39:04 -0600 Subject: block: shrink rq_map_data a bit We don't need full ints for several of these members. Change the page_order and nr_entries to unsigned shorts, and the true/false from_user and null_mapped to booleans. This shrinks the struct from 32 to 24 bytes on 64-bit archs. 
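For reference, a sketch of the layout change on a 64-bit arch (the field set is from the kernel header of this era; treat the exact ordering as illustrative):

/* before: 8 + 4 + 4 + 8 + 4 + 4 = 32 bytes */
struct rq_map_data {
        struct page **pages;    /* 8 bytes */
        int page_order;         /* 4 bytes */
        int nr_entries;         /* 4 bytes */
        unsigned long offset;   /* 8 bytes */
        int null_mapped;        /* 4 bytes */
        int from_user;          /* 4 bytes */
};

/* after: 8 + 8 + 2 + 2 + 1 + 1 (+ 2 tail padding) = 24 bytes */
struct rq_map_data {
        struct page **pages;            /* 8 bytes */
        unsigned long offset;           /* 8 bytes */
        unsigned short page_order;      /* 2 bytes */
        unsigned short nr_entries;      /* 2 bytes */
        bool null_mapped;               /* 1 byte */
        bool from_user;                 /* 1 byte */
};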
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 7196a6b64c80..379c52d2f2d1 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -158,7 +158,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { - nr_pages = 1 << map_data->page_order; + nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { -- cgit From 8af870aa5b84729d7a39f84226be6f84e4943d8f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:43:09 -0600 Subject: block: enable bio caching use for passthru IO bdev based polled O_DIRECT is currently quite a bit faster than passthru on the same device, and one of the reaons is that we're not able to use the bio caching for passthru IO. If REQ_POLLED is set on the request, use the fs bio set for grabbing a bio from the caches, if available. This saves 5-6% of CPU over head for polled passthru IO. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-map.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 379c52d2f2d1..16153fdc478f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -231,6 +231,16 @@ out_bmd: return ret; } +static void bio_map_put(struct bio *bio) +{ + if (bio->bi_opf & REQ_ALLOC_CACHE) { + bio_put(bio); + } else { + bio_uninit(bio); + kfree(bio); + } +} + static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { @@ -243,10 +253,19 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(nr_vecs, gfp_mask); - if (!bio) - return -ENOMEM; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + if (rq->cmd_flags & REQ_POLLED) { + blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, + &fs_bio_set); + if (!bio) + return -ENOMEM; + } else { + bio = bio_kmalloc(nr_vecs, gfp_mask); + if (!bio) + return -ENOMEM; + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + } while (iov_iter_count(iter)) { struct page **pages; @@ -305,8 +324,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_uninit(bio); - kfree(bio); + bio_map_put(bio); return ret; } @@ -611,8 +629,7 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_uninit(next_bio); - kfree(next_bio); + bio_map_put(next_bio); } return ret; -- cgit From e88811bc43b971e06fa82d4e421e21b3c999c1a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:44:34 -0600 Subject: block: use on-stack page vec for <= UIO_FASTIOV Avoid a kmalloc+kfree for each page array, if we only have a few pages that are mapped. An alloc+free for each IO is quite expensive, and it's pretty pointless if we're only dealing with 1 or a few vecs. Use UIO_FASTIOV like we do in other spots to set a sane limit for how big of an IO we want to avoid allocations for. 
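The commit message describes a generic pattern: keep a small fixed-size array on the stack and fall back to a heap allocation only when the request exceeds it. A standalone sketch with hypothetical names (the real code, shown in the diff below, pairs stack_pages[UIO_FASTIOV] with iov_iter_get_pages_alloc2() and a matching kvfree()):

#define FASTVEC 8       /* plays the role of UIO_FASTIOV */

static int process_pages(unsigned int nr)
{
        struct page *stack_pages[FASTVEC];
        struct page **pages = stack_pages;
        int ret;

        /* heap allocation only for the uncommon large case */
        if (nr > ARRAY_SIZE(stack_pages)) {
                pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
                if (!pages)
                        return -ENOMEM;
        }

        ret = fill_pages(pages, nr);    /* hypothetical consumer */

        /* free only what was actually allocated */
        if (pages != stack_pages)
                kfree(pages);
        return ret;
}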
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-map.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 16153fdc478f..f3768876d618 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -268,12 +268,19 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, } while (iov_iter_count(iter)) { - struct page **pages; + struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; size_t offs, added = 0; int npages; - bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); + if (nr_vecs <= ARRAY_SIZE(stack_pages)) { + pages = stack_pages; + bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, + nr_vecs, &offs); + } else { + bytes = iov_iter_get_pages_alloc2(iter, &pages, + LONG_MAX, &offs); + } if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -309,7 +316,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, */ while (j < npages) put_page(pages[j++]); - kvfree(pages); + if (pages != stack_pages) + kvfree(pages); /* couldn't stuff something into bio? */ if (bytes) { iov_iter_revert(iter, bytes); -- cgit From 16ede66973c84f890c03584f79158dd5b2d725f5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 25 Aug 2022 07:53:12 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220825145312.1217900-1-kbusch@fb.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8e3b36d1cb57..9eb968e14d31 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev); + sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); -- cgit From 12c5b70c1897288ee6c841b5cc3ff4d27d511bd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Sep 2022 10:40:11 -0600 Subject: block: enable per-cpu bio caching for the fs bio set This is useful for polled IO on a file, or for polled IO with the io_uring passthrough mechanism. If bio allocations are done with REQ_POLLED for those cases, then initializing the bio set with BIOSET_PERCPU_CACHE enables the local per-cpu cache which eliminates allocations (and frees) of bio structs when possible. 
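A hedged sketch of how a submitter benefits from this: the per-cpu cache is only used when the bioset was created with BIOSET_PERCPU_CACHE (as init_bio() now does for fs_bio_set) and the allocation carries REQ_ALLOC_CACHE. The bio_alloc_bioset() signature is as in this kernel era; the surrounding details are assumptions:

struct bio *bio;

/* polled I/O that can recycle bios through the per-cpu cache */
bio = bio_alloc_bioset(bdev, nr_vecs,
                       REQ_OP_READ | REQ_POLLED | REQ_ALLOC_CACHE,
                       GFP_KERNEL, &fs_bio_set);
if (!bio)
        return -ENOMEM;
/* ... add pages, submit, poll for completion ... */
bio_put(bio);   /* a cache-allocated bio returns to the per-cpu list */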
Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea2..d3154d8beed7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1754,7 +1754,8 @@ static int __init init_bio(void) cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) -- cgit From bce1b56c73826fec8caf6187f0c922ede397a5a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 4 Sep 2022 06:39:25 -0600 Subject: Revert "sbitmap: fix batched wait_cnt accounting" This reverts commit 16ede66973c84f890c03584f79158dd5b2d725f5. This is causing issues with CPU stalls on my test box, revert it for now until we understand what is going on. It looks like infinite looping off sbitmap_queue_wake_up(), but hard to tell with a lot of CPUs hitting this issue and the console scrolling infinitely. Link: https://lore.kernel.org/linux-block/e742813b-ce5c-0d58-205b-1626f639b1bd@kernel.dk/ Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9eb968e14d31..8e3b36d1cb57 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev, 1); + sbitmap_queue_wake_up(bt_prev); ws = bt_wait_ptr(bt, data->hctx); } while (1); -- cgit From 2d8f7a3b9fb31d2566b24fd94d5a533f9322c53c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 3 Sep 2022 14:28:26 +0800 Subject: blk-throttle: clean up codes that can't be reached While doing code coverage testing while CONFIG_BLK_DEV_THROTTLING_LOW is disabled, we found that there are many codes can never be reached. This patch move such codes inside "#ifdef CONFIG_BLK_DEV_THROTTLING_LOW". Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220903062826.1099085-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 90 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9f5fe62afff9..667b2958471a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1673,6 +1673,40 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_free_fn = throtl_pd_free, }; +void blk_throtl_cancel_bios(struct request_queue *q) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. 
+ */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) { unsigned long rtime = jiffies, wtime = jiffies; @@ -1777,39 +1811,6 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } -void blk_throtl_cancel_bios(struct request_queue *q) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - spin_lock_irq(&q->queue_lock); - /* - * queue_lock is held, rcu lock is not needed here technically. - * However, rcu lock is still held to emphasize that following - * path need RCU protection and to prevent warning from lockdep. - */ - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - /* - * Set the flag to make sure throtl_pending_timer_fn() won't - * stop until all throttled bios are dispatched. - */ - blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; - /* - * Update disptime after setting the above flag to make sure - * throtl_select_dispatch() won't exit without dispatching. - */ - tg_update_disptime(tg); - - throtl_schedule_pending_timer(sq, jiffies + 1); - } - rcu_read_unlock(); - spin_unlock_irq(&q->queue_lock); -} - static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { @@ -2005,7 +2006,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) tg->checked_last_finish_time = last_finish_time; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; @@ -2086,6 +2086,28 @@ static void throtl_update_latency_buckets(struct throtl_data *td) static inline void throtl_update_latency_buckets(struct throtl_data *td) { } + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + return false; +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ +} #endif bool __blk_throtl_bio(struct bio *bio) -- cgit From 91e5adda5cf4e3bf21c46eaa2ae7ee2cb6058126 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 5 Sep 2022 14:32:53 +0800 Subject: block/blk-map: Remove set but unused variable 'added' The variable added is not effectively used in the function, so delete it. block/blk-map.c:273:16: warning: variable 'added' set but not used. 
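This warning class is easy to reproduce; a minimal hypothetical example (flagged by GCC's -Wunused-but-set-variable, which kernel W=1 builds enable):

static size_t consume(const char *src, size_t len)
{
        size_t added = 0;       /* written below but never read: dead stores */
        size_t done = 0;

        while (done < len) {
                size_t n = consume_chunk(src + done, len - done); /* hypothetical */

                added += n;     /* the warning points here */
                done += n;
        }
        return done;
}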
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2049 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220905063253.120082-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-map.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index f3768876d618..7693f8e3c454 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -270,7 +270,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; - size_t offs, added = 0; + size_t offs; int npages; if (nr_vecs <= ARRAY_SIZE(stack_pages)) { @@ -306,7 +306,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - added += n; bytes -= n; offs = 0; } -- cgit From 6d5e8d21e8997b0efa409e46db22a27b5cbba6aa Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 5 Sep 2022 18:19:50 +0800 Subject: blk-mq: remove unneeded needs_restart check If code reaches here, needs_restart must be true. Remove this unneeded needs_restart check. No functional change intended. Signed-off-by: Miaohe Lin Link: https://lore.kernel.org/r/20220905101950.4606-1-linmiaohe@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b90d2d8cfb0..29bb48de5bda 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1991,7 +1991,7 @@ out: if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && needs_resource) + else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); -- cgit From bdb7d420c6f6d2618d4c907cd7742c3195c425e2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 5 Sep 2022 18:27:54 +0800 Subject: block: remove unneeded return value of bio_check_ro() bio_check_ro() always return false now. Remove this unneeded return value and cleanup the sole caller. No functional change intended. 
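Both cleanups above are the same move: drop a condition that control flow already guarantees. For the needs_restart case, reaching the else branch means the first test failed, so needs_restart must be true; a compressed view of the reasoning, using the code from the diff above:

/* before: the else-if repeats what the failed first test implies */
if (!needs_restart ||
    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
        blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && needs_resource)  /* needs_restart is always true here */
        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

/* after: the redundant conjunct is gone, behavior unchanged */
if (!needs_restart ||
    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
        blk_mq_run_hw_queue(hctx, true);
else if (needs_resource)
        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);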
Signed-off-by: Miaohe Lin Link: https://lore.kernel.org/r/20220905102754.1942-1-linmiaohe@huawei.com Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a0d1104c5590..fe6b27e3a513 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -487,18 +487,15 @@ static int __init fail_make_request_debugfs(void) late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio) +static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; + return; pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); /* Older lvm-tools actually trigger this */ - return false; } - - return false; } static noinline int should_fail_bio(struct bio *bio) @@ -722,8 +719,7 @@ void submit_bio_noacct(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - if (unlikely(bio_check_ro(bio))) - goto end_io; + bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; -- cgit From 4acb83417cadfdcbe64215f9d0ddcf3132af808e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Sep 2022 11:40:22 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220909184022.1709476-1-kbusch@fb.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8e3b36d1cb57..9eb968e14d31 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev); + sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); -- cgit From 320fb0f91e55ba248d4bad106b408e59099cfa89 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:37 +0800 Subject: blk-throttle: fix that io throttle can only work for single bio Test scripts: cd /sys/fs/cgroup/blkio/ echo "8:0 1024" > blkio.throttle.write_bps_device echo $$ > cgroup.procs dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct & dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct & Test result: 10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s 10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s The problem is that the second bio is finished after 10s instead of 20s. Root cause: 1) second bio will be flagged: __blk_throtl_bio while (true) { ... 
if (sq->nr_queued[rw]) -> some bio is throttled already break }; bio_set_flag(bio, BIO_THROTTLED); -> flag the bio 2) flagged bio will be dispatched without waiting: throtl_dispatch_tg tg_may_dispatch tg_with_in_bps_limit if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) *wait = 0; -> wait time is zero return true; commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") support to count split bios for iops limit, thus it adds flagged bio checking in tg_with_in_bps_limit() so that split bios will only count once for bps limit, however, it introduce a new problem that io throttle won't work if multiple bios are throttled. In order to fix the problem, handle iops/bps limit in different ways: 1) for iops limit, there is no flag to record if the bio is throttled, and iops is always applied. 2) for bps limit, original bio will be flagged with BIO_BPS_THROTTLED, and io throttle will ignore bio with the flag. Noted this patch also remove the code to set flag in __bio_clone(), it's introduced in commit 111be8839817 ("block-throttle: avoid double charge"), and author thinks split bio can be resubmited and throttled again, which is wrong because split bio will continue to dispatch from caller. Fixes: 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") Cc: Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bio.c | 2 -- block/blk-throttle.c | 20 ++++++-------------- block/blk-throttle.h | 2 +- 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d3154d8beed7..c88459a9507d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -760,8 +760,6 @@ EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); - if (bio_flagged(bio_src, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 667b2958471a..a5e495b67827 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -811,7 +811,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { if (wait) *wait = 0; return true; @@ -921,22 +921,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_THROTTLED)) { + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { tg->bytes_disp[rw] += bio_size; tg->last_bytes_disp[rw] += bio_size; } tg->io_disp[rw]++; tg->last_io_disp[rw]++; - - /* - * BIO_THROTTLED is used to prevent the same bio to be throttled - * more than once as a throttled bio will go through blk-throtl the - * second time when it eventually gets issued. Set it when a bio - * is being charged to a tg. 
- */ - if (!bio_flagged(bio, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); } /** @@ -1026,6 +1017,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); + bio_set_flag(bio, BIO_BPS_THROTTLED); /* * If our parent is another tg, we just need to transfer @bio to @@ -2181,8 +2173,10 @@ again: qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); - if (!tg) + if (!tg) { + bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; + } } /* out-of-limit, queue to @tg */ @@ -2211,8 +2205,6 @@ again: } out_unlock: - bio_set_flag(bio, BIO_THROTTLED); - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index c1b602996127..ee7299e6dea9 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -175,7 +175,7 @@ static inline bool blk_throtl_bio(struct bio *bio) struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); /* no need to throttle bps any more if the bio has been throttled */ - if (bio_flagged(bio, BIO_THROTTLED) && + if (bio_flagged(bio, BIO_BPS_THROTTLED) && !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) return false; -- cgit From 8d6bbaada2e0a65f9012ac4c2506460160e7237a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:38 +0800 Subject: blk-throttle: prevent overflow while calculating wait time There is a problem found by code review in tg_with_in_bps_limit() that 'bps_limit * jiffy_elapsed_rnd' might overflow. Fix the problem by calling mul_u64_u64_div_u64() instead. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a5e495b67827..353bbd0886ca 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -806,7 +806,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes, tmp; + u64 bytes_allowed, extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); @@ -824,10 +824,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - - tmp = bps_limit * jiffy_elapsed_rnd; - do_div(tmp, HZ); - bytes_allowed = tmp; + bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd, + (u64)HZ); if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) -- cgit From 681cd46fff8cd81e387747c7850f2e730d3e0b74 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:39 +0800 Subject: blk-throttle: factor out code to calculate ios/bytes_allowed No functional changes, new apis will be used in later patches to calculate wait time for throttled bios when new configuration is submitted. Noted this patch also rename tg_with_in_iops/bps_limit() to tg_within_iops/bps_limit(). 
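To make the helper semantics concrete, a worked example with illustrative numbers (not from the patch), following tg_within_bps_limit() of this era: with bps_limit = 1 MiB/s and HZ = 1000, the budget after 250 jiffies and the extra wait for an oversized bio come out as:

/* calculate_bytes_allowed(1048576, 250) with HZ = 1000: */
u64 bytes_allowed = mul_u64_u64_div_u64(1048576, 250, 1000);    /* 262144 */

/* a queued 512 KiB bio with bytes_disp == 0 overshoots the budget by
 * extra_bytes = 524288 - 262144 = 262144 bytes, so the wait comes to
 * div64_u64(262144 * HZ, 1048576) = 250 more jiffies
 */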
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 59 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 353bbd0886ca..89e76d52ee56 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -754,33 +754,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw], tg->slice_end[rw], jiffies); } -static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned int calculate_io_allowed(u32 iops_limit, + unsigned long jiffy_elapsed) { - bool rw = bio_data_dir(bio); unsigned int io_allowed; - unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; - } - - jiffy_elapsed = jiffies - tg->slice_start[rw]; - - /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - /* - * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ - tmp = (u64)iops_limit * jiffy_elapsed_rnd; + tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -788,6 +775,32 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, else io_allowed = tmp; + return io_allowed; +} + +static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) +{ + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); +} + +static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + + jiffy_elapsed = jiffies - tg->slice_start[rw]; + + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; @@ -802,8 +815,8 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, return false; } -static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes; @@ -824,9 +837,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd, - (u64)HZ); - + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) *wait = 0; @@ -895,8 +906,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if 
(tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && - tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { + if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && + tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) *wait = 0; return true; -- cgit From a880ae93e5b5bb5d8d5500077a391e3f5ec7715c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:40 +0800 Subject: blk-throttle: fix io hung due to configuration updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If new configuration is submitted while a bio is throttled, then new waiting time is recalculated regardless that the bio might already wait for some time: tg_conf_updated throtl_start_new_slice tg_update_disptime throtl_schedule_next_dispatch Then io hung can be triggered by always submmiting new configuration before the throttled bio is dispatched. Fix the problem by respecting the time that throttled bio already waited. In order to do that, add new fields to record how many bytes/io are waited, and use it to calculate wait time for throttled bio under new configuration. Some simple test: 1) cd /sys/fs/cgroup/blkio/ echo $$ > cgroup.procs echo "8:0 2048" > blkio.throttle.write_bps_device { sleep 2 echo "8:0 1024" > blkio.throttle.write_bps_device } & dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct 2) cd /sys/fs/cgroup/blkio/ echo $$ > cgroup.procs echo "8:0 1024" > blkio.throttle.write_bps_device { sleep 4 echo "8:0 2048" > blkio.throttle.write_bps_device } & dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct test results: io finish time before this patch with this patch 1) 10s 6s 2) 8s 6s Signed-off-by: Yu Kuai Reviewed-by: Michal Koutný Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++------ block/blk-throttle.h | 9 ++++++++ 2 files changed, 61 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 89e76d52ee56..d392f355977e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -639,6 +639,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; /* * Previous slice has expired. 
We must have trimmed it after last @@ -656,12 +658,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->slice_end[rw], jiffies); } -static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, + bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + if (clear_carryover) { + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; + } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -783,6 +790,41 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } +static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +{ + unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); + + /* + * If config is updated while bios are still throttled, calculate and + * accumulate how many bytes/ios are waited across changes. And + * carryover_bytes/ios will be used to calculate new wait time under new + * configuration. + */ + if (bps_limit != U64_MAX) + tg->carryover_bytes[rw] += + calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + tg->bytes_disp[rw]; + if (iops_limit != UINT_MAX) + tg->carryover_ios[rw] += + calculate_io_allowed(iops_limit, jiffy_elapsed) - + tg->io_disp[rw]; +} + +static void tg_update_carryover(struct throtl_grp *tg) +{ + if (tg->service_queue.nr_queued[READ]) + __tg_update_carryover(tg, READ); + if (tg->service_queue.nr_queued[WRITE]) + __tg_update_carryover(tg, WRITE); + + /* see comments in struct throtl_grp for meaning of these fields. */ + throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, + tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], + tg->carryover_ios[READ], tg->carryover_ios[WRITE]); +} + static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, u32 iops_limit, unsigned long *wait) { @@ -800,7 +842,8 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; @@ -837,7 +880,8 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) *wait = 0; @@ -898,7 +942,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * slice and it should be extended instead. 
*/ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw); + throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) @@ -1322,8 +1366,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ - throtl_start_new_slice(tg, READ); - throtl_start_new_slice(tg, WRITE); + throtl_start_new_slice(tg, READ, false); + throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1351,6 +1395,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, v = U64_MAX; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; @@ -1537,6 +1582,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, return ret; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index ee7299e6dea9..66b4292b1b92 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -121,6 +121,15 @@ struct throtl_grp { uint64_t last_bytes_disp[2]; unsigned int last_io_disp[2]; + /* + * The following two fields are updated when new configuration is + * submitted while some bios are still throttled, they record how many + * bytes/ios are waited already in previous configuration, and they will + * be used to calculate wait time under new configuration. + */ + uint64_t carryover_bytes[2]; + unsigned int carryover_ios[2]; + unsigned long last_check_time; unsigned long latency_target; /* us */ -- cgit From 7e9c5c54d440bd6402ffdba4dc4f3df5bfe64ea4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 Aug 2022 18:16:35 +0800 Subject: blk-throttle: use 'READ/WRITE' instead of '0/1' Make the code easier to read, like everywhere else. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220827101637.1775111-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d392f355977e..27d46a7d309b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -329,8 +329,8 @@ static struct bio *throtl_pop_queued(struct list_head *queued, /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { - INIT_LIST_HEAD(&sq->queued[0]); - INIT_LIST_HEAD(&sq->queued[1]); + INIT_LIST_HEAD(&sq->queued[READ]); + INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } @@ -1151,7 +1151,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[0] || sq->nr_queued[1]) + if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); if (nr_disp >= THROTL_QUANTUM) -- cgit From 8c25ed0cb9d2e349ebebfeacf7ce1ae015afe54d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 Aug 2022 18:16:36 +0800 Subject: blk-throttle: calling throtl_dequeue/enqueue_tg in pairs It's a little weird to call throtl_dequeue_tg() unconditionally in throtl_select_dispatch(), since it will be called in tg_update_disptime() again if some bio is still throttled. 
Thus call it later if there are no throttled bio. There are no functional changes. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220827101637.1775111-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 27d46a7d309b..4c12df8f6bae 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1146,13 +1146,13 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) if (time_before(jiffies, tg->disptime)) break; - throtl_dequeue_tg(tg); - nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); + else + throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; -- cgit From c013710e1a7eba8e33da9380a068fe1cec017226 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 Aug 2022 18:16:37 +0800 Subject: blk-throttle: cleanup tg_update_disptime() tg_update_disptime() only need to adjust postion for 'tg' in 'parent_sq', there is no need to call throtl_enqueue/dequeue_tg(), since they will set/clear flag THROTL_TG_PENDING and increase/decrease nr_pending, which is useless. By the way, clear the flag/decrease nr_pending while there are still throttled bios is not good for debugging. There are no functional changes. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220827101637.1775111-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4c12df8f6bae..55f2d985cfbb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -520,7 +520,6 @@ static void throtl_rb_erase(struct rb_node *n, { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); - --parent_sq->nr_pending; } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) @@ -572,7 +571,11 @@ static void throtl_enqueue_tg(struct throtl_grp *tg) static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { - throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + struct throtl_service_queue *parent_sq = + tg->service_queue.parent_sq; + + throtl_rb_erase(&tg->rb_node, parent_sq); + --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } @@ -1034,9 +1037,9 @@ static void tg_update_disptime(struct throtl_grp *tg) disptime = jiffies + min_wait; /* Update dispatch time */ - throtl_dequeue_tg(tg); + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; - throtl_enqueue_tg(tg); + tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; -- cgit From e88480871b8d5a6bd14be2817063363202d282b9 Mon Sep 17 00:00:00 2001 From: Ping-Xiang Chen Date: Wed, 14 Sep 2022 00:42:37 -0700 Subject: block: fix comment typo in submit_bio of block-core.c. This patch fix a comment typo in block-core.c. 
Signed-off-by: Ping-Xiang Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220914074237.31621-1-p.x.chen@uci.edu Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fe6b27e3a513..fa609569c3d2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -810,7 +810,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) -- cgit From 118f3663fbc658e9ad6165e129076981c7b685c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:42:00 +0100 Subject: block: remove PSI accounting from the bio layer PSI accounting is now done by the VM code, where it should have been since the beginning. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 8 -------- block/blk-core.c | 17 ----------------- 2 files changed, 25 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c88459a9507d..7cb7d2ff139b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1063,9 +1063,6 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1274,9 +1271,6 @@ out: * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1292,8 +1286,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); diff --git a/block/blk-core.c b/block/blk-core.c index fa609569c3d2..052444c9b594 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -825,22 +824,6 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. 
- */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - - psi_memstall_enter(&pflags); - submit_bio_noacct(bio); - psi_memstall_leave(&pflags); - return; - } - submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); -- cgit From a7609c68f7a117a77b3049c2353f555854be69e9 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 19 Sep 2022 09:28:25 +0800 Subject: blk-iocost: Remove unnecessary (void*) conversions The key pointer is void and hence does not need an explicit cast. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20220919012825.2936-1-zeming@nfschina.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7936e5f5821c..b473efd89b86 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1430,7 +1430,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); - struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; -- cgit From 8c5035dfbb9475b67c82b3fdb7351236525bf52b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 13 Sep 2022 18:57:49 +0800 Subject: blk-wbt: call rq_qos_add() after wb_normal is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our test found a problem that wbt inflight counter is negative, which will cause io hang(noted that this problem doesn't exist in mainline): t1: device create t2: issue io add_disk blk_register_queue wbt_enable_default wbt_init rq_qos_add // wb_normal is still 0 /* * in mainline, disk can't be opened before * bdev_add(), however, in old kernels, disk * can be opened before blk_register_queue(). */ blkdev_issue_flush // disk size is 0, however, it's not checked submit_bio_wait submit_bio blk_mq_submit_bio rq_qos_throttle wbt_wait bio_to_wbt_flags rwb_enabled // wb_normal is 0, inflight is not increased wbt_queue_depth_changed(&rwb->rqos); wbt_update_limits // wb_normal is initialized rq_qos_track wbt_track rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); // wb_normal is not 0,wbt_flags will be set t3: io completion blk_mq_free_request rq_qos_done wbt_done wbt_is_tracked // return true __wbt_done wbt_rqw_done atomic_dec_return(&rqw->inflight); // inflight is decreased commit 8235b5c1e8c1 ("block: call bdev_add later in device_add_disk") can avoid this problem, however it's better to fix this problem in wbt: 1) Lower kernel can't backport this patch due to lots of refactor. 2) Root cause is that wbt call rq_qos_add() before wb_normal is initialized. 
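Condensed, the corrected wbt_init() ordering looks like this (a sketch assembled from the hunks below, error handling omitted): make wb_normal valid first, then publish the rqos.

  rwb->min_lat_nsec = wbt_default_latency_nsec(q);
  wbt_queue_depth_changed(&rwb->rqos);    /* wbt_update_limits(): wb_normal set */
  wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
  ret = rq_qos_add(q, &rwb->rqos);        /* rqos only now visible to wbt_wait() */

Once rq_qos_add() has run, a concurrent submitter can observe the rqos, so every limit it reads must already be initialized.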
Fixes: e34cbd307477 ("blk-wbt: add general throttling mechanism") Cc: Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20220913105749.3086243-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index a9982000b667..246467926253 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -843,6 +843,10 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_queue_depth_changed(&rwb->rqos); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); /* * Assign rwb and add the stats callback. @@ -853,11 +857,6 @@ int wbt_init(struct request_queue *q) blk_stat_add_callback(q, rwb->cb); - rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); - return 0; err_free: -- cgit From 9713a67067897a9e372c52124f72f8e00b2e6031 Mon Sep 17 00:00:00 2001 From: Li Jinlin Date: Fri, 16 Sep 2022 10:32:41 +0800 Subject: block/blk-rq-qos: delete useless enmu RQ_QOS_IOPRIO Since blk-ioprio handing was converted from a rqos policy to a direct call, RQ_QOS_IOPRIO is not used anymore, just delete it. Signed-off-by: Li Jinlin Reviewed-by: Jan Kara Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220916023241.32926-1-lijinlin3@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 -- block/blk-rq-qos.h | 1 - 2 files changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 8559cea7f300..2bb6f8cdc3b5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -805,8 +805,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; - case RQ_QOS_IOPRIO: - return "ioprio"; } return "unknown"; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 08b856570ad1..1ef1f7d4bc3c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,7 +17,6 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, - RQ_QOS_IOPRIO, }; struct rq_wait { -- cgit From 85496749904016f36b69332f73a1cf3ecfee828f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Sep 2022 17:53:08 +0800 Subject: blk-throttle: remove THROTL_TG_HAS_IOPS_LIMIT Currently, "tg->has_rules" and "tg->flags & THROTL_TG_HAS_IOPS_LIMIT" both try to bypass bios that don't need to be throttled, however, they are a little redundant and both not perfect: 1) "tg->has_rules" only distinguish read and write, but not iops and bps limit. 2) "tg->flags & THROTL_TG_HAS_IOPS_LIMIT" only check if iops limit exist, read and write is not distinguished, and bps limit is not checked. tg->has_rules will extended to distinguish bps and iops in the following patch. There is no need to keep the flag. 
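For reference, the bypass short-cut that the flag enabled is the hunk removed from blk_throtl_bio() below; note how it ignores the bio's direction entirely:

  /* removed: bypass bps for an already-throttled bio only if *no* iops limit */
  if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
      !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
          return false;

A READ-only iops limit, for example, was enough to force already bps-throttled WRITE bios through the slow path.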
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921095309.1481289-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++-------------- block/blk-throttle.h | 8 +------- 2 files changed, 3 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 55f2d985cfbb..a062539d84d0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -420,24 +420,12 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; - int has_iops_limit = 0; - - for (rw = READ; rw <= WRITE; rw++) { - unsigned int iops_limit = tg_iops_limit(tg, rw); + for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || (td->limit_valid[td->limit_index] && (tg_bps_limit(tg, rw) != U64_MAX || - iops_limit != UINT_MAX)); - - if (iops_limit != UINT_MAX) - has_iops_limit = 1; - } - - if (has_iops_limit) - tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; - else - tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; + tg_iops_limit(tg, rw) != UINT_MAX)); } static void throtl_pd_online(struct blkg_policy_data *pd) diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 66b4292b1b92..3994b89dfa11 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -55,8 +55,7 @@ struct throtl_service_queue { enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ - THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ + THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; enum { @@ -183,11 +182,6 @@ static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); - /* no need to throttle bps any more if the bio has been throttled */ - if (bio_flagged(bio, BIO_BPS_THROTTLED) && - !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) - return false; - if (!tg->has_rules[bio_data_dir(bio)]) return false; -- cgit From 81c7a63abc7c0be572b4f853e913ce93a34f6e1b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Sep 2022 17:53:09 +0800 Subject: blk-throttle: improve bypassing bios checkings "tg->has_rules" is extended to "tg->has_rules_iops/bps", thus bios that don't need to be throttled can be checked accurately. With this patch, bio will be throttled if: 1) Bio is read/write, and corresponding read/write iops limit exist. 2) If corresponding doesn't exist, corresponding bps limit exist and bio is not throttled before. 
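The resulting bypass decision, as added to blk-throttle.h below, reduces to the following (with rw = bio_data_dir(bio)):

  if (tg->has_rules_iops[rw])                   /* iops is always counted */
          return true;
  if (tg->has_rules_bps[rw] &&
      !bio_flagged(bio, BIO_BPS_THROTTLED))     /* bps limit, not yet charged */
          return true;
  return false;                                 /* nothing applies: bypass */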
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921095309.1481289-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 13 +++++++++---- block/blk-throttle.h | 22 +++++++++++++++++++--- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a062539d84d0..78316955e30f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -421,11 +421,16 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_data *td = tg->td; int rw; - for (rw = READ; rw <= WRITE; rw++) - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + for (rw = READ; rw <= WRITE; rw++) { + tg->has_rules_iops[rw] = + (parent_tg && parent_tg->has_rules_iops[rw]) || (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + tg_iops_limit(tg, rw) != UINT_MAX); + tg->has_rules_bps[rw] = + (parent_tg && parent_tg->has_rules_bps[rw]) || + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX)); + } } static void throtl_pd_online(struct blkg_policy_data *pd) diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 3994b89dfa11..69f00012d616 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -98,7 +98,8 @@ struct throtl_grp { unsigned int flags; /* are there any throtl rules between this group and td? */ - bool has_rules[2]; + bool has_rules_bps[2]; + bool has_rules_iops[2]; /* internally used bytes per second rate limits */ uint64_t bps[2][LIMIT_CNT]; @@ -178,11 +179,26 @@ void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct request_queue *q); -static inline bool blk_throtl_bio(struct bio *bio) + +static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + int rw = bio_data_dir(bio); + + /* iops limit is always counted */ + if (tg->has_rules_iops[rw]) + return true; + + if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) + return true; + + return false; +} + +static inline bool blk_throtl_bio(struct bio *bio) +{ - if (!tg->has_rules[bio_data_dir(bio)]) + if (!blk_should_throtl(bio)) return false; return __blk_throtl_bio(bio); -- cgit From f168420c62e7a106961f2489a89f6ade84fe3f27 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Wed, 21 Sep 2022 11:32:03 +0800 Subject: blk-mq: don't redirect completion for hctx withs only one ctx mapping High-performance NVMe devices usually support a large hw queues, which ensures a 1:1 mapping of hctx and ctx. In this case there will be no remote request, so we don't need to care about it. Signed-off-by: Liu Song Link: https://lore.kernel.org/r/1663731123-81536-1-git-send-email-liusong@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 29bb48de5bda..496085132899 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1093,10 +1093,12 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locally, it's pointless - * to redirect the completion. + * For request which hctx has only one ctx mapping, + * or a polled request, always complete locally, + * it's pointless to redirect the completion. 
*/ - if (rq->cmd_flags & REQ_POLLED) + if (rq->mq_hctx->nr_ctx == 1 || + rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { -- cgit From 33dc62796cb657a633050138a86253fb2a553713 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:45 +0200 Subject: blk-cgroup: fix error unwinding in blkcg_init_queue When blk_throtl_init fails, we need to call blk_ioprio_exit. Switch to proper goto based unwinding to fix this. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 869af9d72bcf..3a88f8c011d2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1297,17 +1297,18 @@ int blkcg_init_queue(struct request_queue *q) ret = blk_throtl_init(q); if (ret) - goto err_destroy_all; + goto err_ioprio_exit; ret = blk_iolatency_init(q); - if (ret) { - blk_throtl_exit(q); - blk_ioprio_exit(q); - goto err_destroy_all; - } + if (ret) + goto err_throtl_exit; return 0; +err_throtl_exit: + blk_throtl_exit(q); +err_ioprio_exit: + blk_ioprio_exit(q); err_destroy_all: blkg_destroy_all(q); return ret; -- cgit From 928f6f00a91ecbef6cb1fe59474831ceaf205290 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:46 +0200 Subject: blk-cgroup: remove blk_queue_root_blkg Just open code it in the only caller and drop the unused !BLK_CGROUP stub. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 +-- block/blk-cgroup.h | 13 ------------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3a88f8c011d2..4180de4cbb3e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -915,8 +915,7 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); - struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev_get_queue(bdev)); + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d2724d1dd7c9..c1fb00a1dfc0 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -268,17 +268,6 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, return __blkg_lookup(blkcg, q, false); } -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). 
- */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; -} - /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -507,8 +496,6 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_exit_queue(struct request_queue *q) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -- cgit From 79fcc5be93e5b17a2a5d36553f7a5c1ad9e953b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:47 +0200 Subject: blk-cgroup: remove open coded blkg_lookup instances Use blkg_lookup instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +++--- block/blk-cgroup.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4180de4cbb3e..b9a1dcee5a24 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -324,7 +324,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), q); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -412,7 +412,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { - blkg = __blkg_lookup(parent, q, false); + blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -724,7 +724,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c1fb00a1dfc0..30396cad50e9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -362,8 +362,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants @@ -377,8 +377,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) bool __blkcg_punt_bio_submit(struct bio *bio); -- cgit From 4a69f325aa43847e0827fbfe4b3623307b0c9baa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:48 +0200 Subject: blk-cgroup: cleanup the blkg_lookup family of functions Add a fully inlined blkg_lookup as the extra two checks aren't going to generated a lot more code vs the call to the slowpath routine, and open code the hint 
update in the two callers that care. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 38 +++++++++++++++----------------------- block/blk-cgroup.h | 39 ++++++++++++--------------------------- 2 files changed, 27 insertions(+), 50 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b9a1dcee5a24..d1216760d025 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -263,29 +263,13 @@ err_free: return NULL; } -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) +static void blkg_update_hint(struct blkcg *blkcg, struct blkcg_gq *blkg) { - struct blkcg_gq *blkg; - - /* - * Hint didn't match. Look up from the radix tree. Note that the - * hint can only be updated under queue_lock as otherwise @blkg - * could have already been removed from blkg_tree. The caller is - * responsible for grabbing queue_lock if @update_hint. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) { - if (update_hint) { - lockdep_assert_held(&q->queue_lock); - rcu_assign_pointer(blkcg->blkg_hint, blkg); - } - return blkg; - } + lockdep_assert_held(&blkg->q->queue_lock); - return NULL; + if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); } -EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as @@ -397,9 +381,11 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup(blkcg, q, true); - if (blkg) + blkg = blkg_lookup(blkcg, q); + if (blkg) { + blkg_update_hint(blkcg, blkg); goto found; + } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -621,12 +607,18 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, const struct blkcg_policy *pol, struct request_queue *q) { + struct blkcg_gq *blkg; + WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); + + blkg = blkg_lookup(blkcg, q); + if (blkg) + blkg_update_hint(blkcg, blkg); + return blkg; } /** diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 30396cad50e9..91b7ae0773be 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -178,8 +178,6 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); int blkcg_init_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -227,22 +225,21 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) } /** - * __blkg_lookup - internal version of blkg_lookup() + * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. + * Lookup blkg for the @blkcg - @q pair. 
+ + * Must be called in a RCU critical section. */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + WARN_ON_ONCE(!rcu_read_lock_held()); + if (blkcg == &blkcg_root) return q->root_blkg; @@ -250,22 +247,10 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, if (blkg && blkg->q == q) return blkg; - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q != q) + blkg = NULL; + return blkg; } /** -- cgit From f753526e327bc849c445c084d0f374e992038ae9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:49 +0200 Subject: blk-cgroup: remove blkg_lookup_check The combinations of an error check with an ERR_PTR return and a lookup with a NULL return leads to ugly handling of the return values in the callers. Just open coding the check and the lookup is much simpler. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d1216760d025..1306112d7648 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -602,25 +602,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/* Performs queue bypass and policy enabled checks then looks up blkg. 
*/ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) -{ - struct blkcg_gq *blkg; - - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - - blkg = blkg_lookup(blkcg, q); - if (blkg) - blkg_update_hint(blkcg, blkg); - return blkg; -} - /** * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer @@ -697,14 +678,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(blkcg, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; goto fail_unlock; } - if (blkg) + blkg = blkg_lookup(blkcg, q); + if (blkg) { + blkg_update_hint(blkcg, blkg); goto success; + } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -740,14 +723,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(pos, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); + ret = -EOPNOTSUPP; goto fail_preloaded; } + blkg = blkg_lookup(pos, q); if (blkg) { + blkg_update_hint(pos, blkg); blkg_free(new_blkg); } else { blkg = blkg_create(pos, q, new_blkg); -- cgit From 9823538fb7efe66ce987a1e4c0e0f3dc882623c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:50 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_init_queue and blkcg_exit_queue Pass the gendisk to blkcg_init_disk and blkcg_exit_disk as part of moving the blk-cgroup infrastructure to be gendisk based. Also remove the rather pointless kerneldoc comments for these internal functions with a single caller each. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 25 +++++-------------------- block/blk-cgroup.h | 8 ++++---- block/genhd.c | 5 +++-- 3 files changed, 12 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1306112d7648..4ca6933a7c3f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1230,18 +1230,9 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from blk_alloc_queue(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int blkcg_init_queue(struct request_queue *q) +int blkcg_init_disk(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; int ret; @@ -1294,16 +1285,10 @@ err_unlock: return PTR_ERR(blkg); } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_exit_queue(). Responsible for exiting blkcg part. 
- */ -void blkcg_exit_queue(struct request_queue *q) +void blkcg_exit_disk(struct gendisk *disk) { - blkg_destroy_all(q); - blk_throtl_exit(q); + blkg_destroy_all(disk->queue); + blk_throtl_exit(disk->queue); } static void blkcg_bind(struct cgroup_subsys_state *root_css) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 91b7ae0773be..aa2b286bc825 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -178,8 +178,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); +int blkcg_init_disk(struct gendisk *disk); +void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); @@ -481,8 +481,8 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } +static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, diff --git a/block/genhd.c b/block/genhd.c index d36fabf0abc1..f1af045fac2f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1150,7 +1150,8 @@ static void disk_release(struct device *dev) !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); - blkcg_exit_queue(disk->queue); + blkcg_exit_disk(disk); + bioset_exit(&disk->bio_split); disk_release_events(disk); @@ -1363,7 +1364,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - if (blkcg_init_queue(q)) + if (blkcg_init_disk(disk)) goto out_erase_part0; rand_initialize_disk(disk); -- cgit From b0dde3f5d628f76f461fb650e2cebfac3460cff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:51 +0200 Subject: blk-ioprio: pass a gendisk to blk_ioprio_init and blk_ioprio_exit Pass the gendisk to blk_ioprio_init and blk_ioprio_exit as part of moving the blk-cgroup infrastructure to be gendisk based. 
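The conversion is mechanical, as the blk-ioprio.c hunks below show: the entry points take the gendisk and dereference disk->queue internally, e.g.

  int blk_ioprio_init(struct gendisk *disk)
  {
          return blkcg_activate_policy(disk->queue, &ioprio_policy);
  }

so the caller, blkcg_init_disk(), stays purely gendisk based while the policy core still operates on the request_queue for now.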
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- block/blk-ioprio.c | 8 ++++---- block/blk-ioprio.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4ca6933a7c3f..89974fd0db3d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1257,7 +1257,7 @@ int blkcg_init_disk(struct gendisk *disk) if (preloaded) radix_tree_preload_end(); - ret = blk_ioprio_init(q); + ret = blk_ioprio_init(disk); if (ret) goto err_destroy_all; @@ -1274,7 +1274,7 @@ int blkcg_init_disk(struct gendisk *disk) err_throtl_exit: blk_throtl_exit(q); err_ioprio_exit: - blk_ioprio_exit(q); + blk_ioprio_exit(disk); err_destroy_all: blkg_destroy_all(q); return ret; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index c00060a02c6e..8bb6b8eba4ce 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -202,14 +202,14 @@ void blkcg_set_ioprio(struct bio *bio) bio->bi_ioprio = prio; } -void blk_ioprio_exit(struct request_queue *q) +void blk_ioprio_exit(struct gendisk *disk) { - blkcg_deactivate_policy(q, &ioprio_policy); + blkcg_deactivate_policy(disk->queue, &ioprio_policy); } -int blk_ioprio_init(struct request_queue *q) +int blk_ioprio_init(struct gendisk *disk) { - return blkcg_activate_policy(q, &ioprio_policy); + return blkcg_activate_policy(disk->queue, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index 5a1eb550e178..b6afb8e80de0 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -9,15 +9,15 @@ struct request_queue; struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO -int blk_ioprio_init(struct request_queue *q); -void blk_ioprio_exit(struct request_queue *q); +int blk_ioprio_init(struct gendisk *disk); +void blk_ioprio_exit(struct gendisk *disk); void blkcg_set_ioprio(struct bio *bio); #else -static inline int blk_ioprio_init(struct request_queue *q) +static inline int blk_ioprio_init(struct gendisk *disk) { return 0; } -static inline void blk_ioprio_exit(struct request_queue *q) +static inline void blk_ioprio_exit(struct gendisk *disk) { } static inline void blkcg_set_ioprio(struct bio *bio) -- cgit From 16fac1b5912b778a30d8863dbc928bef25c8d307 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:52 +0200 Subject: blk-iolatency: pass a gendisk to blk_iolatency_init Pass the gendisk to blk_iolatency_init as part of moving the blk-cgroup infrastructure to be gendisk based. 
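Note the fixup folded in by Jens: the !CONFIG_BLK_CGROUP_IOLATENCY stub in blk.h has to stay a static inline definition mirroring the real prototype (sketch):

  #ifdef CONFIG_BLK_CGROUP_IOLATENCY
  int blk_iolatency_init(struct gendisk *disk);
  #else
  static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }
  #endif

so that every file including blk.h gets a self-contained no-op rather than build breakage or unused-function warnings.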
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-9-hch@lst.de [axboe: missed inline for blk_iolatency_init() and !CONFIG_BLK_CGROUP_IOLATENCY] Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-iolatency.c | 3 ++- block/blk.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 89974fd0db3d..82a117ff54de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1265,7 +1265,7 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_ioprio_exit; - ret = blk_iolatency_init(q); + ret = blk_iolatency_init(disk); if (ret) goto err_throtl_exit; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index e285152345a2..c6f61fe88b87 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -756,8 +756,9 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) } } -int blk_iolatency_init(struct request_queue *q) +int blk_iolatency_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; struct rq_qos *rqos; int ret; diff --git a/block/blk.h b/block/blk.h index d7142c4d2fef..9f714c942d32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -389,9 +389,9 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_CGROUP_IOLATENCY -extern int blk_iolatency_init(struct request_queue *q); +int blk_iolatency_init(struct gendisk *disk); #else -static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; #endif #ifdef CONFIG_BLK_DEV_ZONED -- cgit From 9df3e65139b923dfe98f76b7057882c7afb2d3e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:53 +0200 Subject: blk-iocost: simplify ioc_name Just directly dereference the disk name instead of going through multiple hoops to find the same value. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-iocost.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index b473efd89b86..d0d9b2d17508 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -664,17 +664,13 @@ static struct ioc *q_to_ioc(struct request_queue *q) return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } -static const char *q_name(struct request_queue *q) -{ - if (blk_queue_registered(q)) - return kobject_name(q->kobj.parent); - else - return ""; -} - static const char __maybe_unused *ioc_name(struct ioc *ioc) { - return q_name(ioc->rqos.q); + struct gendisk *disk = ioc->rqos.q->disk; + + if (!disk) + return ""; + return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) -- cgit From 57b64554977e28ab84d33d298032872a8047a557 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:54 +0200 Subject: blk-iocost: pass a gendisk to blk_iocost_init Pass the gendisk to blk_iocost_init as part of moving the blk-cgroup infrastructure to be gendisk based. 
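Same pattern again (see the hunk below): take the disk at the API boundary and keep a local queue pointer inside, so the function body needs no further churn:

  static int blk_iocost_init(struct gendisk *disk)
  {
          struct request_queue *q = disk->queue;
          /* ... unchanged setup of the ioc and rqos ... */
  }

with the two cgroup file handlers now calling blk_iocost_init(bdev->bd_disk).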
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d0d9b2d17508..6fb1c24b3504 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2828,8 +2828,9 @@ static struct rq_qos_ops ioc_rqos_ops = { .exit = ioc_rqos_exit, }; -static int blk_iocost_init(struct request_queue *q) +static int blk_iocost_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct ioc *ioc; struct rq_qos *rqos; int i, cpu, ret; @@ -3178,7 +3179,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(bdev_get_queue(bdev)); @@ -3345,7 +3346,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(bdev_get_queue(bdev)); if (!ioc) { - ret = blk_iocost_init(bdev_get_queue(bdev)); + ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(bdev_get_queue(bdev)); -- cgit From 3657647e33dff916a2d2d9df926d9bca3907d34f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:55 +0200 Subject: blk-iocost: cleanup ioc_qos_write Use a local disk variable instead of retrieving the disk and request_queue over and over by various means. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-12-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-iocost.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6fb1c24b3504..c0f69bc99db9 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3167,6 +3167,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct block_device *bdev; + struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; @@ -3177,12 +3178,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev_get_queue(bdev)); + disk = bdev->bd_disk; + ioc = q_to_ioc(disk->queue); if (!ioc) { - ret = blk_iocost_init(bdev->bd_disk); + ret = blk_iocost_init(disk); if (ret) goto err; - ioc = q_to_ioc(bdev_get_queue(bdev)); + ioc = q_to_ioc(disk->queue); } spin_lock_irq(&ioc->lock); @@ -3259,11 +3261,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); if (enable) { - blk_stat_enable_accounting(ioc->rqos.q); - blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else { - blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } -- cgit From e13793bae65919cd3e6a7827f8d30f4dbb8584ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:56 +0200 Subject: blk-throttle: pass a gendisk to blk_throtl_init and blk_throtl_exit Pass the gendisk to blk_throtl_init and blk_throtl_exit as part of moving the blk-cgroup infrastructure to be gendisk based. 
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +++--- block/blk-throttle.c | 7 +++++-- block/blk-throttle.h | 8 ++++---- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 82a117ff54de..3dfd78f1312d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1261,7 +1261,7 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_destroy_all; - ret = blk_throtl_init(q); + ret = blk_throtl_init(disk); if (ret) goto err_ioprio_exit; @@ -1272,7 +1272,7 @@ int blkcg_init_disk(struct gendisk *disk) return 0; err_throtl_exit: - blk_throtl_exit(q); + blk_throtl_exit(disk); err_ioprio_exit: blk_ioprio_exit(disk); err_destroy_all: @@ -1288,7 +1288,7 @@ err_unlock: void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk->queue); - blk_throtl_exit(disk->queue); + blk_throtl_exit(disk); } static void blkcg_bind(struct cgroup_subsys_state *root_css) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 78316955e30f..4879aeca7509 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2351,8 +2351,9 @@ void blk_throtl_bio_endio(struct bio *bio) } #endif -int blk_throtl_init(struct request_queue *q) +int blk_throtl_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int ret; @@ -2394,8 +2395,10 @@ int blk_throtl_init(struct request_queue *q) return ret; } -void blk_throtl_exit(struct request_queue *q) +void blk_throtl_exit(struct gendisk *disk) { + struct request_queue *q = disk->queue; + BUG_ON(!q->td); del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 69f00012d616..1c9def91088f 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -168,14 +168,14 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct request_queue *q) { return 0; } -static inline void blk_throtl_exit(struct request_queue *q) { } +static inline int blk_throtl_init(struct gendisk *disk) { return 0; } +static inline void blk_throtl_exit(struct gendisk *disk) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct request_queue *q) { } #else /* CONFIG_BLK_DEV_THROTTLING */ -int blk_throtl_init(struct request_queue *q); -void blk_throtl_exit(struct request_queue *q); +int blk_throtl_init(struct gendisk *disk); +void blk_throtl_exit(struct gendisk *disk); void blk_throtl_register_queue(struct request_queue *q); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct request_queue *q); -- cgit From 5f6dc7522ac2e1701c92f20b9a1a664736787728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:57 +0200 Subject: blk-throttle: pass a gendisk to blk_throtl_register_queue Pass the gendisk to blk_throtl_register_queue as part of moving the blk-cgroup infrastructure to be gendisk based. 
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- block/blk-throttle.c | 3 ++- block/blk-throttle.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e1f009aba6fd..e71b3b43927c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -844,7 +844,7 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); - blk_throtl_register_queue(q); + blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&q->kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4879aeca7509..def2775fa38e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2408,8 +2408,9 @@ void blk_throtl_exit(struct gendisk *disk) kfree(q->td); } -void blk_throtl_register_queue(struct request_queue *q) +void blk_throtl_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int i; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 1c9def91088f..69b7f05314e4 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -170,13 +170,13 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) #ifndef CONFIG_BLK_DEV_THROTTLING static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } -static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct request_queue *q) { } #else /* CONFIG_BLK_DEV_THROTTLING */ int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); -void blk_throtl_register_queue(struct request_queue *q); +void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct request_queue *q); -- cgit From cad9266abcef586aa95f6f4095781e3e55473f2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:58 +0200 Subject: blk-throttle: pass a gendisk to blk_throtl_cancel_bios Pass the gendisk to blk_throtl_cancel_bios as part of moving the blk-cgroup infrastructure to be gendisk based. 
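Taken together with the two preceding patches, every throttling entry point used from outside blk-throttle.c is now disk based; the resulting blk-throttle.h surface is roughly:

  int blk_throtl_init(struct gendisk *disk);
  void blk_throtl_exit(struct gendisk *disk);
  void blk_throtl_register(struct gendisk *disk);
  void blk_throtl_cancel_bios(struct gendisk *disk);
  bool __blk_throtl_bio(struct bio *bio);       /* per-bio fast path, unchanged */

Only the per-bio fast path keeps operating on the bio itself.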
Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-15-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 ++- block/blk-throttle.h | 4 ++-- block/genhd.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index def2775fa38e..847721dc2b2b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1716,8 +1716,9 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_free_fn = throtl_pd_free, }; -void blk_throtl_cancel_bios(struct request_queue *q) +void blk_throtl_cancel_bios(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 69b7f05314e4..ef4b7a4de987 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -172,13 +172,13 @@ static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } -static inline void blk_throtl_cancel_bios(struct request_queue *q) { } +static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); -void blk_throtl_cancel_bios(struct request_queue *q); +void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_should_throtl(struct bio *bio) { diff --git a/block/genhd.c b/block/genhd.c index f1af045fac2f..d6a21803a57e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -626,7 +626,7 @@ void del_gendisk(struct gendisk *disk) pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_throtl_cancel_bios(disk->queue); + blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); -- cgit From 00ad6991bbae116b7c83f68754edd6f4d5e65e01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:04:59 +0200 Subject: blk-cgroup: pass a gendisk to blkg_destroy_all Pass the gendisk to blkg_destroy_all as part of moving the blk-cgroup infrastructure to be gendisk based. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-16-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3dfd78f1312d..c2d5ca2eb92e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -462,14 +462,9 @@ static void blkg_destroy(struct blkcg_gq *blkg) percpu_ref_kill(&blkg->refcnt); } -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. 
- */ -static void blkg_destroy_all(struct request_queue *q) +static void blkg_destroy_all(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; @@ -1276,7 +1271,7 @@ err_throtl_exit: err_ioprio_exit: blk_ioprio_exit(disk); err_destroy_all: - blkg_destroy_all(q); + blkg_destroy_all(disk); return ret; err_unlock: spin_unlock_irq(&q->queue_lock); @@ -1287,7 +1282,7 @@ err_unlock: void blkcg_exit_disk(struct gendisk *disk) { - blkg_destroy_all(disk->queue); + blkg_destroy_all(disk); blk_throtl_exit(disk); } -- cgit From de185b56e8a62822d4e1cdb3e068b38ca709aa47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:00 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_schedule_throttle Pass the gendisk to blkcg_schedule_throttle as part of moving the blk-cgroup infrastructure to be gendisk based. Remove the unused !BLK_CGROUP stub while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-17-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 +++++--- block/blk-iocost.c | 4 ++-- block/blk-iolatency.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c2d5ca2eb92e..fc82057db962 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1792,13 +1792,13 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @gendisk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1807,8 +1807,10 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. 
*/ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { + struct request_queue *q = disk->queue; + if (unlikely(current->flags & PF_KTHREAD)) return; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c0f69bc99db9..495396425bad 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2636,7 +2636,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2737,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->q->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c6f61fe88b87..571fa95aafe9 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q, use_memdelay); + blkcg_schedule_throttle(rqos->q->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are -- cgit From 99e603874366be1115b40ecbc0e25847186d84ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:01 +0200 Subject: blk-cgroup: pass a gendisk to the blkg allocation helpers Prepare for storing the blkcg information in the gendisk instead of the request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-18-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc82057db962..94af5f3f3620 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -202,19 +202,19 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with + * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. 
*/ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; @@ -225,10 +225,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; - if (!blk_get_queue(q)) + if (!blk_get_queue(disk->queue)) goto err_free; - blkg->q = q; + blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); @@ -243,11 +243,11 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); if (!pd) goto err_free; @@ -275,17 +275,16 @@ static void blkg_update_hint(struct blkcg *blkcg, struct blkcg_gq *blkg) * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; - lockdep_assert_held(&q->queue_lock); + lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ - if (blk_queue_dying(q)) { + if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } @@ -298,7 +297,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -308,7 +307,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = blkg_lookup(blkcg_parent(blkcg), q); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -326,10 +325,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* insert */ spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -358,19 +357,20 @@ err_free_blkg: /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. 
This function - * should be called under RCU read lock and takes @q->queue_lock. + * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) + struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; @@ -408,7 +408,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); + blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; @@ -652,6 +652,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) { struct block_device *bdev; + struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; @@ -659,8 +660,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, bdev = blkcg_conf_open_bdev(&input); if (IS_ERR(bdev)) return PTR_ERR(bdev); - - q = bdev_get_queue(bdev); + disk = bdev->bd_disk; + q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab @@ -703,7 +704,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; @@ -729,7 +730,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg_update_hint(pos, blkg); blkg_free(new_blkg); } else { - blkg = blkg_create(pos, q, new_blkg); + blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; @@ -1234,7 +1235,7 @@ int blkcg_init_disk(struct gendisk *disk) INIT_LIST_HEAD(&q->blkg_list); - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -1243,7 +1244,7 @@ int blkcg_init_disk(struct gendisk *disk) /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; @@ -1860,8 +1861,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), - bdev_get_queue(bio->bi_bdev)); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; -- cgit From 568ec936bf1384fc15873908c96a9aeb62536edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 09:58:15 +0200 Subject: block: replace blk_queue_nowait with bdev_nowait Replace blk_queue_nowait with a bdev_nowait helper that takes the block_device, given that the I/O submission path should not have to look into the request_queue.
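[note: the helper itself is added in include/linux/blkdev.h, which falls outside this 'block'-limited view. Judging from the call site below, it is presumably a one-liner along these lines — a sketch, assuming NOWAIT support is still tracked as a queue flag:

static inline bool bdev_nowait(struct block_device *bdev)
{
	/* QUEUE_FLAG_NOWAIT stays on the queue; only the lookup moves */
	return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}

With that in place, submit_bio_noacct() only needs the bio's block_device, as the hunk below shows.]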
Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220927075815.269694-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 052444c9b594..dc1fa454ae30 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -713,7 +713,7 @@ void submit_bio_noacct(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) -- cgit From 8237c01f1696bc53c470493bf1fe092a107648a6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Sep 2022 08:56:52 -0700 Subject: blk-mq: use quiesced elevator switch when reinitializing queues The hctx's run_work may be racing with the elevator switch when reinitializing hardware queues. The queue is merely frozen in this context, but that only prevents requests from allocating and doesn't stop the hctx work from running. The work may get an elevator pointer that's being torn down, and can result in use-after-free errors and kernel panics (example below). Use the quiesced elevator switch instead, and make the previous one static since it is now only used locally. nvme nvme0: resetting controller nvme nvme0: 32/0/0 default/read/poll queues BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 80000020c8861067 P4D 80000020c8861067 PUD 250f8c8067 PMD 0 Oops: 0000 [#1] SMP PTI Workqueue: kblockd blk_mq_run_work_fn RIP: 0010:kyber_has_work+0x29/0x70 ... Call Trace: __blk_mq_do_dispatch_sched+0x83/0x2b0 __blk_mq_sched_dispatch_requests+0x12e/0x170 blk_mq_sched_dispatch_requests+0x30/0x60 __blk_mq_run_hw_queue+0x2b/0x50 process_one_work+0x1ef/0x380 worker_thread+0x2d/0x3e0 Signed-off-by: Keith Busch Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220927155652.3260724-1-kbusch@fb.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- block/blk.h | 3 +-- block/elevator.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 496085132899..b2b318df4f69 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4472,14 +4472,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); /* - * After elevator_switch_mq, the previous elevator_queue will be + * After elevator_switch, the previous elevator_queue will be * released by elevator_release. The reference of the io scheduler * module get by elevator_get will also be put. So we need to get * a reference of the io scheduler module here to prevent it to be * removed. 
*/ __module_get(qe->type->elevator_owner); - elevator_switch_mq(q, NULL); + elevator_switch(q, NULL); mutex_unlock(&q->sysfs_lock); return true; @@ -4511,7 +4511,7 @@ static void blk_mq_elv_switch_back(struct list_head *head, kfree(qe); mutex_lock(&q->sysfs_lock); - elevator_switch_mq(q, t); + elevator_switch(q, t); mutex_unlock(&q->sysfs_lock); } diff --git a/block/blk.h b/block/blk.h index 9f714c942d32..5350bf363035 100644 --- a/block/blk.h +++ b/block/blk.h @@ -270,8 +270,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); -int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e); +int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index c319765892bb..bd71f0fc4e4b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -int elevator_switch_mq(struct request_queue *q, +static int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e) { int ret; @@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q) * need for the new one. this way we have a chance of going back to the old * one, if the new one fails init for some reason. */ -static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { int err; -- cgit From 5765033cf77c54897848df683420bb62b6cc3d05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 08:54:25 +0200 Subject: blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep blkg_conf_prep just creates a new blkg structure; there is no real need to update the lookup hint, which should only be done on a successful lookup in the I/O path. Suggested-by: Tejun Heo Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220927065425.257876-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 94af5f3f3620..6a5c849ee061 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -263,14 +263,6 @@ err_free: return NULL; } -static void blkg_update_hint(struct blkcg *blkcg, struct blkcg_gq *blkg) -{ - lockdep_assert_held(&blkg->q->queue_lock); - - if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) - rcu_assign_pointer(blkcg->blkg_hint, blkg); -} - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
@@ -383,7 +375,9 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { - blkg_update_hint(blkcg, blkg); + if (blkcg != &blkcg_root && + blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } @@ -680,10 +674,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } blkg = blkg_lookup(blkcg, q); - if (blkg) { - blkg_update_hint(blkcg, blkg); + if (blkg) goto success; - } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -727,7 +719,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg = blkg_lookup(pos, q); if (blkg) { - blkg_update_hint(pos, blkg); blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); -- cgit From 8cafdb5ab94cda3ebb0975be16e2d564a05132ea Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 29 Sep 2022 09:47:44 +0200 Subject: block: adapt blk_mq_plug() to not plug for writes that require a zone lock The current implementation of blk_mq_plug() disables plugging for all operations that involve a transfer to the device, as we just check the last bit of the operation code via op_is_write(). Modify blk_mq_plug() to disable plugging only for REQ_OP_WRITE and REQ_OP_WRITE_ZEROES, as those are the operations that might require a zone lock. Suggested-by: Christoph Hellwig Suggested-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Pankaj Raghav Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220929074745.103073-2-p.raghav@samsung.com Signed-off-by: Jens Axboe --- block/blk-mq.h | 3 ++- block/blk-zoned.c | 9 +++------ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index 8ca453ac243d..0b2870839cdd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -312,7 +312,8 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) static inline struct blk_plug *blk_mq_plug( struct bio *bio) { /* Zoned block device write operation case: do not plug the BIO */ - if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) return NULL; /* diff --git a/block/blk-zoned.c b/block/blk-zoned.c index a264621d4905..db829401d8d0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -63,13 +63,10 @@ bool blk_req_needs_zone_write_lock(struct request *rq) if (!rq->q->disk->seq_zones_wlock) return false; - switch (req_op(rq)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: + if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) return blk_rq_zone_is_seq(rq); - default: - return false; - } + + return false; } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); -- cgit From 110fdb4486d3a5e67d55bc6866d6426e6d49ccfc Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 29 Sep 2022 16:41:41 +0200 Subject: block: add rationale for not using blk_mq_plug() when applicable There are two places in the block layer at the moment where the blk_mq_plug() helper could be used instead of directly accessing the plug through current. In both these cases, directly accessing the plug should not have any consequences for zoned devices.
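[note: after the previous patch, the helper these two call sites deliberately bypass reads roughly as follows. The hunk above cuts off mid-function, so the trailing comment and return statement are reconstructed here as a sketch:

static inline struct blk_plug *blk_mq_plug(struct bio *bio)
{
	/* Zoned block device write operation case: do not plug the BIO */
	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
		return NULL;

	/*
	 * For regular block devices or read operations, use the context
	 * plug, which may be NULL if blk_start_plug() was not executed.
	 */
	return current->plug;
}

]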
Make the intent explicit by adding comments instead of introducing unwanted checks with the blk_mq_plug() helper. [1] [1] https://lore.kernel.org/linux-block/f6e54907-1035-2b2c-6387-ed178be05ccb@kernel.dk/ Signed-off-by: Pankaj Raghav Suggested-by: Jens Axboe Link: https://lore.kernel.org/r/20220929144141.140077-1-p.raghav@samsung.com [axboe: fixup multi-line comment style] Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ block/blk-mq.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index dc1fa454ae30..4bfc0d504b2d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -850,6 +850,12 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; + /* + * As the requests that require a zone lock are not plugged in the + * first place, directly accessing the plug instead of using + * blk_mq_plug() should not have any consequences during flushing for + * zoned devices. + */ blk_flush_plug(current->plug, false); if (bio_queue_enter(bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index b2b318df4f69..9dd3ec42613f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1215,6 +1215,12 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); + + /* + * As plugging can be enabled for passthrough requests on a zoned + * device, directly accessing the plug instead of using blk_mq_plug() + * should not have any consequences. + */ if (current->plug) blk_add_rq_to_plug(current->plug, rq); else -- cgit
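[note: both zone-related patches above rely on bdev_op_is_zoned_write(), which is defined in include/linux/blkdev.h and therefore not visible in this 'block'-limited log. Its call sites imply roughly this shape — a sketch; the exact signature is an assumption:

static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
					  enum req_op op)
{
	/* Only writes to zoned devices may need a zone write lock */
	if (!bdev_is_zoned(bdev))
		return false;
	return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
}

Keying both blk_mq_plug() and blk_req_needs_zone_write_lock() off the same predicate keeps the plugging and zone-locking policies from drifting apart.]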