mirror of
https://github.com/Fishwaldo/Star64_linux.git
synced 2025-06-22 22:51:37 +00:00
writeback: make backing_dev_info host cgroup-specific bdi_writebacks
For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: kbuild test robot <fengguang.wu@intel.com> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <axboe@fb.com>
This commit is contained in:
parent
89e9b9e07a
commit
52ebea749a
11 changed files with 698 additions and 11 deletions
|
@ -2,8 +2,11 @@
|
|||
#define __LINUX_BACKING_DEV_DEFS_H
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/radix-tree.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/percpu_counter.h>
|
||||
#include <linux/percpu-refcount.h>
|
||||
#include <linux/flex_proportions.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
@ -37,10 +40,43 @@ enum wb_stat_item {
|
|||
|
||||
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
|
||||
|
||||
/*
|
||||
* For cgroup writeback, multiple wb's may map to the same blkcg. Those
|
||||
* wb's can operate mostly independently but should share the congested
|
||||
* state. To facilitate such sharing, the congested state is tracked using
|
||||
* the following struct which is created on demand, indexed by blkcg ID on
|
||||
* its bdi, and refcounted.
|
||||
*/
|
||||
struct bdi_writeback_congested {
|
||||
unsigned long state; /* WB_[a]sync_congested flags */
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct backing_dev_info *bdi; /* the associated bdi */
|
||||
atomic_t refcnt; /* nr of attached wb's and blkg */
|
||||
int blkcg_id; /* ID of the associated blkcg */
|
||||
struct rb_node rb_node; /* on bdi->cgwb_congested_tree */
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Each wb (bdi_writeback) can perform writeback operations, is measured
|
||||
* and throttled, independently. Without cgroup writeback, each bdi
|
||||
* (backing_dev_info) is served by its embedded bdi->wb.
|
||||
*
|
||||
* On the default hierarchy, blkcg implicitly enables memcg. This allows
|
||||
* using memcg's page ownership for attributing writeback IOs, and every
|
||||
* memcg - blkcg combination can be served by its own wb by assigning a
|
||||
* dedicated wb to each memcg, which enables isolation across different
|
||||
* cgroups and propagation of IO back pressure down from the IO layer up to
|
||||
* the tasks which are generating the dirty pages to be written back.
|
||||
*
|
||||
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
|
||||
* refcounted with the number of inodes attached to it, and pins the memcg
|
||||
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
|
||||
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
|
||||
* is tested for blkcg after lookup and removed from index on mismatch so
|
||||
* that a new wb for the combination can be created.
|
||||
*/
|
||||
struct bdi_writeback {
|
||||
struct backing_dev_info *bdi; /* our parent bdi */
|
||||
|
||||
|
@ -78,6 +114,19 @@ struct bdi_writeback {
|
|||
spinlock_t work_lock; /* protects work_list & dwork scheduling */
|
||||
struct list_head work_list;
|
||||
struct delayed_work dwork; /* work item used for writeback */
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct percpu_ref refcnt; /* used only for !root wb's */
|
||||
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
|
||||
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
|
||||
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
|
||||
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
|
||||
|
||||
union {
|
||||
struct work_struct release_work;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
#endif
|
||||
};
|
||||
|
||||
struct backing_dev_info {
|
||||
|
@ -92,9 +141,13 @@ struct backing_dev_info {
|
|||
unsigned int min_ratio;
|
||||
unsigned int max_ratio, max_prop_frac;
|
||||
|
||||
struct bdi_writeback wb; /* default writeback info for this bdi */
|
||||
struct bdi_writeback_congested wb_congested;
|
||||
|
||||
struct bdi_writeback wb; /* the root writeback info for this bdi */
|
||||
struct bdi_writeback_congested wb_congested; /* its congested state */
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
|
||||
struct rb_root cgwb_congested_tree; /* their congested states */
|
||||
atomic_t usage_cnt; /* counts both cgwbs and cgwb_congested's */
|
||||
#endif
|
||||
struct device *dev;
|
||||
|
||||
struct timer_list laptop_mode_wb_timer;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue