diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cf9247301e3c..95a19d1fbedf 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -565,26 +565,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 	}
 }
 
+#define BFQ_LIMIT_INLINE_DEPTH 16
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
+	struct bfq_entity **entities = inline_entities;
+	int depth, level;
+	int class_idx = bfqq->ioprio_class - 1;
+	struct bfq_sched_data *sched_data;
+	unsigned long wsum;
+	bool ret = false;
+
+	if (!entity->on_st_or_in_serv)
+		return false;
+
+	/* +1 for bfqq entity, root cgroup not included */
+	depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
+	if (depth > BFQ_LIMIT_INLINE_DEPTH) {
+		entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
+		if (!entities)
+			return false;
+	}
+
+	spin_lock_irq(&bfqd->lock);
+	sched_data = entity->sched_data;
+	/* Gather our ancestors as we need to traverse them in reverse order */
+	level = 0;
+	for_each_entity(entity) {
+		/*
+		 * If at some level entity is not even active, allow request
+		 * queueing so that BFQ knows there's work to do and activate
+		 * entities.
+		 */
+		if (!entity->on_st_or_in_serv)
+			goto out;
+		/* Uh, more parents than cgroup subsystem thinks? */
+		if (WARN_ON_ONCE(level >= depth))
+			break;
+		entities[level++] = entity;
+	}
+	WARN_ON_ONCE(level != depth);
+	for (level--; level >= 0; level--) {
+		entity = entities[level];
+		if (level > 0) {
+			wsum = bfq_entity_service_tree(entity)->wsum;
+		} else {
+			int i;
+			/*
+			 * For bfqq itself we take into account service trees
+			 * of all higher priority classes and multiply their
+			 * weights so that low prio queue from higher class
+			 * gets more requests than high prio queue from lower
+			 * class.
+			 */
+			wsum = 0;
+			for (i = 0; i <= class_idx; i++) {
+				wsum = wsum * IOPRIO_BE_NR +
+					sched_data->service_tree[i].wsum;
+			}
+		}
+		limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
+		if (entity->allocated >= limit) {
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				"too many requests: allocated %d limit %d level %d",
+				entity->allocated, limit, level);
+			ret = true;
+			break;
+		}
+	}
+out:
+	spin_unlock_irq(&bfqd->lock);
+	if (entities != inline_entities)
+		kfree(entities);
+	return ret;
+}
+#else
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+	return false;
+}
+#endif
+
 /*
  * Async I/O can easily starve sync I/O (both sync reads and sync
  * writes), by consuming all tags. Similarly, storms of sync writes,
  * such as those that sync(2) may trigger, can starve sync reads.
  * Limit depths of async I/O and sync writes so as to counter both
  * problems.
+ *
+ * Also if a bfq queue or its parent cgroup consume more tags than would be
+ * appropriate for their weight, we trim the available tag depth to 1. This
+ * avoids a situation where one cgroup can starve another cgroup from tags and
+ * thus block service differentiation among cgroups. Note that because the
+ * queue / cgroup already has many requests allocated and queued, this does not
+ * significantly affect service guarantees coming from the BFQ scheduling
+ * algorithm.
  */
 static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 {
 	struct bfq_data *bfqd = data->q->elevator->elevator_data;
+	struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q));
+	struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
+	int depth;
+	unsigned limit = data->q->nr_requests;
 
-	if (op_is_sync(op) && !op_is_write(op))
-		return;
+	/* Sync reads have full depth available */
+	if (op_is_sync(op) && !op_is_write(op)) {
+		depth = 0;
+	} else {
+		depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+		limit = (limit * depth) >> bfqd->full_depth_shift;
+	}
 
-	data->shallow_depth =
-		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+	/*
+	 * Does queue (or any parent entity) exceed number of requests that
+	 * should be available to it? Heavily limit depth so that it cannot
+	 * consume more available requests and thus starve other entities.
+	 */
+	if (bfqq && bfqq_request_over_limit(bfqq, limit))
+		depth = 1;
 
 	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
-			__func__, bfqd->wr_busy_queues, op_is_sync(op),
-			data->shallow_depth);
+			__func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
+	if (depth)
+		data->shallow_depth = depth;
 }
 
 static struct bfq_queue *
@@ -6853,10 +6961,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
  * See the comments on bfq_limit_depth for the purpose of
  * the depths set in the function. Return minimum shallow depth we'll use.
  */
-static unsigned int bfq_update_depths(struct bfq_data *bfqd,
-				      struct sbitmap_queue *bt)
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
 {
-	unsigned int i, j, min_shallow = UINT_MAX;
 	unsigned int depth = 1U << bt->sb.shift;
 
 	bfqd->full_depth_shift = bt->sb.shift;
@@ -6890,22 +6996,15 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
 	bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
 	/* no more than ~37% of tags for sync writes (~20% extra tags) */
 	bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
-
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < 2; j++)
-			min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
-
-	return min_shallow;
 }
 
 static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
-	unsigned int min_shallow;
 
-	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
-	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+	bfq_update_depths(bfqd, &tags->bitmap_tags);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
 }
 
 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
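
For reference, the per-level check that bfqq_request_over_limit() performs can be illustrated outside the kernel. Below is a minimal user-space C sketch (not kernel code): walking from the topmost active ancestor towards the queue, each entity's allowed share is scaled by limit * weight / wsum, where wsum is the total active weight at that level, and the entity is over limit once its allocated requests reach that share. The struct name, weights, and allocation counts are made-up example values, and the sketch ignores the priority-class weighting the real code applies at the queue level.

/*
 * User-space sketch of the per-level limit scaling used by
 * bfqq_request_over_limit(); all values below are hypothetical.
 */
#include <stdio.h>

struct level {
	unsigned int weight;	/* weight of the entity at this level */
	unsigned int wsum;	/* sum of weights active at this level */
	unsigned int allocated;	/* requests already allocated here */
};

/* Same rounding as the kernel's DIV_ROUND_CLOSEST() for unsigned operands */
static unsigned int div_round_closest(unsigned int x, unsigned int d)
{
	return (x + d / 2) / d;
}

/* Returns 1 if any level already holds at least its proportional share */
static int over_limit(const struct level *levels, int depth, unsigned int limit)
{
	for (int i = 0; i < depth; i++) {
		limit = div_round_closest(limit * levels[i].weight,
					  levels[i].wsum);
		if (levels[i].allocated >= limit)
			return 1;
	}
	return 0;
}

int main(void)
{
	/*
	 * Hypothetical hierarchy: a cgroup holding 1/3 of the active weight,
	 * containing a queue holding 1/2 of the weight inside that cgroup.
	 */
	struct level levels[] = {
		{ .weight = 100, .wsum = 300, .allocated = 20 },
		{ .weight = 100, .wsum = 200, .allocated = 50 },
	};
	unsigned int nr_requests = 256;

	/*
	 * 256 * 100/300 ~= 85 for the cgroup (20 < 85, fine), then
	 * 85 * 100/200 ~= 43 for the queue (50 >= 43, over limit -> 1).
	 */
	printf("over limit: %d\n", over_limit(levels, 2, nr_requests));
	return 0;
}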