[PATCH -mm 03/14] vmscan: shrink slab on memcg pressure

From: Vladimir Davydov
Date: Sun Sep 21 2014 - 11:18:31 EST


This patch makes direct reclaim path shrink slab not only on global
memory pressure, but also when we reach the user memory limit of a
memcg. To achieve that, it makes shrink_slab() walk over the memcg
hierarchy and run shrinkers marked as memcg-aware on the target memcg
and all its descendants. The memcg to scan is passed in a shrink_control
structure; memcg-unaware shrinkers are still called only on global
memory pressure with memcg=NULL. It is up to the shrinker how to
organize the objects it is responsible for to achieve per-memcg reclaim.

Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 22 +++++++++++
include/linux/shrinker.h | 10 ++++-
mm/memcontrol.c | 46 ++++++++++++++++++++++-
mm/vmscan.c | 87 ++++++++++++++++++++++++++++++++++----------
4 files changed, 143 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 19df5d857411..c4e64d0e318d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -68,6 +68,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);

+unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+ struct mem_cgroup *memcg);
+
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg);
bool task_in_mem_cgroup(struct task_struct *task,
@@ -251,6 +254,12 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
return &zone->lruvec;
}

+static inline unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+ struct mem_cgroup *)
+{
+ return 0;
+}
+
static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
return NULL;
@@ -421,6 +430,9 @@ static inline bool memcg_kmem_enabled(void)
return static_key_false(&memcg_kmem_enabled_key);
}

+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg);
+
/*
* In general, we'll do everything in our power to not incur in any overhead
* for non-memcg users for the kmem functions. Not even a function call, if we
@@ -549,6 +561,16 @@ static inline bool memcg_kmem_enabled(void)
return false;
}

+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return false;
+}
+
+static inline bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg)
+{
+ return false;
+}
+
static inline bool
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
{
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 68c097077ef0..ab79b174bfbe 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -20,8 +20,15 @@ struct shrink_control {

/* shrink from these nodes */
nodemask_t nodes_to_scan;
+
+ /* shrink from this memory cgroup hierarchy (if not NULL) */
+ struct mem_cgroup *target_mem_cgroup;
+
/* current node being shrunk (for NUMA aware shrinkers) */
int nid;
+
+ /* current memcg being shrunk (for memcg aware shrinkers) */
+ struct mem_cgroup *memcg;
};

#define SHRINK_STOP (~0UL)
@@ -63,7 +70,8 @@ struct shrinker {
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */

/* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_MEMCG_AWARE (1 << 1)

extern int register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9431024e490c..7361bd8b720a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -391,7 +391,7 @@ static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
@@ -1411,6 +1411,31 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}

+unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+ struct mem_cgroup *memcg)
+{
+ unsigned long nr = 0;
+ unsigned int lru_mask;
+ struct mem_cgroup *iter;
+
+ lru_mask = LRU_ALL_FILE;
+ if (get_nr_swap_pages() > 0)
+ lru_mask |= LRU_ALL_ANON;
+
+ iter = memcg;
+ do {
+ struct mem_cgroup_per_zone *mz;
+ enum lru_list lru;
+
+ mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+ for_each_lru(lru)
+ if (BIT(lru) & lru_mask)
+ nr += mz->lru_size[lru];
+ } while ((iter = mem_cgroup_iter(memcg, iter, NULL)) != NULL);
+
+ return nr;
+}
+
/*
* Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
@@ -2786,6 +2811,25 @@ static DEFINE_MUTEX(memcg_slab_mutex);

static DEFINE_MUTEX(activate_kmem_mutex);

+/*
+ * Returns true if the given cgroup or any of its descendants has kmem
+ * accounting enabled.
+ */
+bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ iter = memcg;
+ do {
+ if (memcg_kmem_is_active(iter)) {
+ mem_cgroup_iter_break(memcg, iter);
+ return true;
+ }
+ } while ((iter = mem_cgroup_iter(memcg, iter, NULL)) != NULL);
+
+ return false;
+}
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b672e2c6becc..041d0e41a5a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -340,6 +340,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
return freed;
}

+static unsigned long
+run_shrinker(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+ unsigned long freed = 0;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
+ return shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
+ }
+ return freed;
+}
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -381,20 +401,34 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}

list_for_each_entry(shrinker, &shrinker_list, list) {
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
- shrinkctl->nid = 0;
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ /*
+ * Call memcg-unaware shrinkers only on global pressure.
+ */
+ if (!(shrinker->flags & SHRINKER_MEMCG_AWARE)) {
+ if (!shrinkctl->target_mem_cgroup) {
+ shrinkctl->memcg = NULL;
+ freed += run_shrinker(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
+ }
continue;
}

- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (node_online(shrinkctl->nid))
- freed += shrink_slab_node(shrinkctl, shrinker,
+ /*
+ * For memcg-aware shrinkers iterate over the target memcg
+ * hierarchy and run the shrinker on each kmem-active memcg
+ * found in the hierarchy.
+ */
+ shrinkctl->memcg = shrinkctl->target_mem_cgroup;
+ do {
+ if (!shrinkctl->memcg ||
+ memcg_kmem_is_active(shrinkctl->memcg))
+ freed += run_shrinker(shrinkctl, shrinker,
nr_pages_scanned, lru_pages);
-
- }
+ } while ((shrinkctl->memcg =
+ mem_cgroup_iter(shrinkctl->target_mem_cgroup,
+ shrinkctl->memcg, NULL)) != NULL);
}
+
up_read(&shrinker_rwsem);
out:
cond_resched();
@@ -2381,6 +2415,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
gfp_t orig_mask;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
+ .target_mem_cgroup = sc->target_mem_cgroup,
};
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
bool reclaimable = false;
@@ -2400,17 +2435,22 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
+
+ if (global_reclaim(sc) &&
+ !cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += global_reclaim(sc) ?
+ zone_reclaimable_pages(zone) :
+ mem_cgroup_zone_reclaimable_pages(zone,
+ sc->target_mem_cgroup);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
if (global_reclaim(sc)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
@@ -2458,12 +2498,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
}

/*
- * Don't shrink slabs when reclaiming memory from over limit cgroups
- * but do shrink slab at least once when aborting reclaim for
- * compaction to avoid unevenly scanning file/anon LRU pages over slab
- * pages.
+ * Shrink slabs at least once when aborting reclaim for compaction
+ * to avoid unevenly scanning file/anon LRU pages over slab pages.
*/
- if (global_reclaim(sc)) {
+ if (global_reclaim(sc) ||
+ memcg_kmem_is_active_subtree(sc->target_mem_cgroup)) {
shrink_slab(&shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2765,6 +2804,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
+ struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2785,6 +2825,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,

zonelist = NODE_DATA(nid)->node_zonelists;

+ lockdep_set_current_reclaim_state(sc.gfp_mask);
+ reclaim_state.reclaimed_slab = 0;
+ current->reclaim_state = &reclaim_state;
+
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
@@ -2793,6 +2837,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,

trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);

+ current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return nr_reclaimed;
}
#endif
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/