summaryrefslogtreecommitdiff
path: root/mm/vmpressure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmpressure.c')
-rw-r--r--mm/vmpressure.c176
1 files changed, 158 insertions, 18 deletions
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 3fb15c25af87..f5383e43597a 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -22,6 +22,9 @@
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/module.h>
#include <linux/vmpressure.h>
/*
@@ -38,7 +41,7 @@
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
/*
* These thresholds are used when we account memory pressure through
@@ -49,6 +52,33 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
+static unsigned long vmpressure_scale_max = 100;
+module_param_named(vmpressure_scale_max, vmpressure_scale_max,
+ ulong, S_IRUGO | S_IWUSR);
+
+/* vmpressure values >= this will be scaled based on allocstalls */
+static unsigned long allocstall_threshold = 70;
+module_param_named(allocstall_threshold, allocstall_threshold,
+ ulong, S_IRUGO | S_IWUSR);
+
+static struct vmpressure global_vmpressure;
+BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
+
+int vmpressure_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&vmpressure_notifier, nb);
+}
+
+int vmpressure_notifier_unregister(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
+}
+
+void vmpressure_notify(unsigned long pressure)
+{
+ blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
+}
+
/*
* When there are too little pages left to scan, vmpressure() may miss the
* critical pressure as number of pages will be less than "window size".
@@ -75,6 +105,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
return container_of(work, struct vmpressure, work);
}
+#ifdef CONFIG_MEMCG
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
@@ -85,6 +116,12 @@ static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
return NULL;
return memcg_to_vmpressure(memcg);
}
+#else
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+ return NULL;
+}
+#endif
enum vmpressure_levels {
VMPRESSURE_LOW = 0,
@@ -108,7 +145,7 @@ static enum vmpressure_levels vmpressure_level(unsigned long pressure)
return VMPRESSURE_LOW;
}
-static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+static unsigned long vmpressure_calc_pressure(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
@@ -135,7 +172,20 @@ out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
- return vmpressure_level(pressure);
+ return pressure;
+}
+
+static unsigned long vmpressure_account_stall(unsigned long pressure,
+ unsigned long stall, unsigned long scanned)
+{
+ unsigned long scale;
+
+ if (pressure < allocstall_threshold)
+ return pressure;
+
+ scale = ((vmpressure_scale_max - pressure) * stall) / scanned;
+
+ return pressure + scale;
}
struct vmpressure_event {
@@ -149,9 +199,11 @@ static bool vmpressure_event(struct vmpressure *vmpr,
{
struct vmpressure_event *ev;
enum vmpressure_levels level;
+ unsigned long pressure;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
+ pressure = vmpressure_calc_pressure(scanned, reclaimed);
+ level = vmpressure_level(pressure);
mutex_lock(&vmpr->events_lock);
@@ -203,24 +255,13 @@ static void vmpressure_work_fn(struct work_struct *work)
} while ((vmpr = vmpressure_parent(vmpr)));
}
-/**
- * vmpressure() - Account memory pressure through scanned/reclaimed ratio
- * @gfp: reclaimer's gfp mask
- * @memcg: cgroup memory controller handle
- * @scanned: number of pages scanned
- * @reclaimed: number of pages reclaimed
- *
- * This function should be called from the vmscan reclaim path to account
- * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
- * pressure index is then further refined and averaged over time.
- *
- * This function does not return any value.
- */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ BUG_ON(!vmpr);
+
/*
* Here we only want to account pressure that userland is able to
* help us with. For example, suppose that DMA zone is under
@@ -257,6 +298,94 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
schedule_work(&vmpr->work);
}
+void calculate_vmpressure_win(void)
+{
+ long x;
+
+ x = global_page_state(NR_FILE_PAGES) -
+ global_page_state(NR_SHMEM) -
+ total_swapcache_pages() +
+ global_page_state(NR_FREE_PAGES);
+ if (x < 1)
+ x = 1;
+ /*
+ * For low (free + cached), vmpressure window should be
+ * small, and high for higher values of (free + cached).
+ * But it should not be linear as well. This ensures
+ * timely vmpressure notifications when system is under
+ * memory pressure, and optimal number of events when
+ * cached is high. The sqaure root function is empirically
+ * found to serve the purpose.
+ */
+ x = int_sqrt(x);
+ vmpressure_win = x;
+}
+
+void vmpressure_global(gfp_t gfp, unsigned long scanned,
+ unsigned long reclaimed)
+{
+ struct vmpressure *vmpr = &global_vmpressure;
+ unsigned long pressure;
+ unsigned long stall;
+
+ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+ return;
+
+ if (!scanned)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ if (!vmpr->scanned)
+ calculate_vmpressure_win();
+
+ vmpr->scanned += scanned;
+ vmpr->reclaimed += reclaimed;
+
+ if (!current_is_kswapd())
+ vmpr->stall += scanned;
+
+ stall = vmpr->stall;
+ scanned = vmpr->scanned;
+ reclaimed = vmpr->reclaimed;
+ spin_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ vmpr->scanned = 0;
+ vmpr->reclaimed = 0;
+ vmpr->stall = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ pressure = vmpressure_calc_pressure(scanned, reclaimed);
+ pressure = vmpressure_account_stall(pressure, stall, scanned);
+ vmpressure_notify(pressure);
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ if (!memcg)
+ vmpressure_global(gfp, scanned, reclaimed);
+
+ if (IS_ENABLED(CONFIG_MEMCG))
+ vmpressure_memcg(gfp, memcg, scanned, reclaimed);
+}
+
/**
* vmpressure_prio() - Account memory pressure through reclaimer priority level
* @gfp: reclaimer's gfp mask
@@ -308,6 +437,8 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
struct vmpressure_event *ev;
int level;
+ BUG_ON(!vmpr);
+
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
if (!strcmp(vmpressure_str_levels[level], args))
break;
@@ -347,6 +478,8 @@ void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
+ BUG_ON(!vmpr);
+
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (ev->efd != eventfd)
@@ -388,3 +521,10 @@ void vmpressure_cleanup(struct vmpressure *vmpr)
*/
flush_work(&vmpr->work);
}
+
+int vmpressure_global_init(void)
+{
+ vmpressure_init(&global_vmpressure);
+ return 0;
+}
+late_initcall(vmpressure_global_init);