From 36c26128b8983d9af58266c36f23209064e22108 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: mm/vmscan: reclaim only affects managed_zones As mentioned in commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") , reclaim only affects managed_zones. Let's adjust the code and comment accordingly. Link: https://lkml.kernel.org/r/20220327024101.10378-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: "Huang, Ying" Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1678802e03e7..cd9f222a3e78 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1031,7 +1031,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; reclaimable += zone_reclaimable_pages(zone); @@ -3912,7 +3912,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) } /* - * If a node has no populated zone within highest_zoneidx, it does not + * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ -- cgit From 8b3a899abe15e68b8b82ff96c2f4e8c9a37874a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: sc->reclaim_idx must be a valid zone index lruvec_lru_size() is only used in get_scan_count(), so the only possible zone_idx is sc->reclaim_idx. Since sc->reclaim_idx is ensured to be a valid zone idex, we can remove the extra check for zone iteration. Link: https://lkml.kernel.org/r/20220317234624.23358-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index cd9f222a3e78..f6570f334a05 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -587,7 +587,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use - * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -595,7 +595,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long size = 0; int zid; - for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; if (!managed_zone(zone)) -- cgit From 02e458d8d04ed952a0451c094e6498c6829c28e8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: remove obsolete comment in get_scan_count Since commit 1431d4d11abb ("mm: base LRU balancing on an explicit cost model"), the relative value of each set of LRU lists is based on cost model instead of rotated/scanned ratio. Cleanup the relevant comment. Link: https://lkml.kernel.org/r/20220409030245.61211-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f6570f334a05..0e2f253949fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2646,9 +2646,7 @@ enum scan_balance { /* * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined - * by looking at the fraction of the pages scanned we did rotate back - * onto the active list instead of evict. + * scanned. * * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan -- cgit From 5829f7dbae4158f9c946fac42760de848a8f1695 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: fix comment for current_may_throttle Since commit 6d6435811c19 ("remove bdi_congested() and wb_congested() and related functions"), there is no congested backing device check anymore. Correct the comment accordingly. [akpm@linux-foundation.org: tweak grammar] Link: https://lkml.kernel.org/r/20220414120202.30082-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e2f253949fe..c5bfbb74fde1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2297,10 +2297,9 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, } /* - * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. - * In that case we should only throttle if the backing device it is - * writing to is congested. In other cases it is safe to throttle. + * If a kernel thread (such as nfsd for loop-back mounts) services a backing + * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case + * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { -- cgit From b2cb6826b6df2bdf91ae4406fd2ef97da7a9cd35 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:04 -0700 Subject: mm/vmscan: fix comment for isolate_lru_pages Since commit 791b48b64232 ("mm: vmscan: scan until it finds eligible pages"), splicing any skipped pages to the tail of the LRU list won't put the system at risk of premature OOM but will waste lots of cpu cycles. Correct the comment accordingly. Link: https://lkml.kernel.org/r/20220416025231.8082-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index c5bfbb74fde1..6e255e2867d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2117,8 +2117,8 @@ move: * Splice any skipped pages to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX - * scanning would soon rescan the same pages to skip and put the - * system at risk of premature OOM. + * scanning would soon rescan the same pages to skip and waste lots + * of cpu cycles. */ if (!list_empty(&pages_skipped)) { int zid; -- cgit From 014bb1de4fc17d54907d54418126a9a9736f4aff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm: create new mm/swap.h header file Patch series "MM changes to improve swap-over-NFS support". Assorted improvements for swap-via-filesystem. This is a resend of these patches, rebased on current HEAD. The only substantial changes is that swap_dirty_folio has replaced swap_set_page_dirty. Currently swap-via-fs (SWP_FS_OPS) doesn't work for any filesystem. It has previously worked for NFS but that broke a few releases back. This series changes to use a new ->swap_rw rather than ->readpage and ->direct_IO. It also makes other improvements. There is a companion series already in linux-next which fixes various issues with NFS. Once both series land, a final patch is needed which changes NFS over to use ->swap_rw. This patch (of 10): Many functions declared in include/linux/swap.h are only used within mm/ Create a new "mm/swap.h" and move some of these declarations there. Remove the redundant 'extern' from the function declarations. [akpm@linux-foundation.org: mm/memory-failure.c needs mm/swap.h] Link: https://lkml.kernel.org/r/164859751830.29473.5309689752169286816.stgit@noble.brown Link: https://lkml.kernel.org/r/164859778120.29473.11725907882296224053.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Trond Myklebust Cc: Hugh Dickins Cc: Mel Gorman Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6e255e2867d2..b1b91d5b9f49 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,6 +59,7 @@ #include #include "internal.h" +#include "swap.h" #define CREATE_TRACE_POINTS #include -- cgit From d791ea676b66489ef6dabd04cd655f5c77426e40 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: reclaim mustn't enter FS for SWP_FS_OPS swap-space If swap-out is using filesystem operations (SWP_FS_OPS), then it is not safe to enter the FS for reclaim. So only down-grade the requirement for swap pages to __GFP_IO after checking that SWP_FS_OPS are not being used. This makes the calculation of "may_enter_fs" slightly more complex, so move it into a separate function. with that done, there is little value in maintaining the bool variable any more. So replace the may_enter_fs variable with a may_enter_fs() function. This removes any risk for the variable becoming out-of-date. Link: https://lkml.kernel.org/r/164859778124.29473.16176717935781721855.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b1b91d5b9f49..95ebcb5b3e12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1502,6 +1502,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return nr_succeeded; } +static bool may_enter_fs(struct page *page, gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_FS) + return true; + if (!PageSwapCache(page) || !(gfp_mask & __GFP_IO)) + return false; + /* + * We can "enter_fs" for swap-cache with only __GFP_IO + * providing this isn't SWP_FS_OPS. + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + return !data_race(page_swap_flags(page) & SWP_FS_OPS); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1528,7 +1544,7 @@ retry: struct page *page; struct folio *folio; enum page_references references = PAGEREF_RECLAIM; - bool dirty, writeback, may_enter_fs; + bool dirty, writeback; unsigned int nr_pages; cond_resched(); @@ -1553,9 +1569,6 @@ retry: if (!sc->may_unmap && page_mapped(page)) goto keep_locked; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1598,7 +1611,7 @@ retry: * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * - * Require may_enter_fs because we would wait on fs, which + * Require may_enter_fs() because we would wait on fs, which * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off @@ -1630,7 +1643,7 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || - !PageReclaim(page) || !may_enter_fs) { + !PageReclaim(page) || !may_enter_fs(page, sc->gfp_mask)) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then @@ -1720,8 +1733,6 @@ retry: goto activate_locked_split; } - may_enter_fs = true; - /* Adding to swap updated mapping */ mapping = page_mapping(page); } @@ -1792,7 +1803,7 @@ retry: if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs) + if (!may_enter_fs(page, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; -- cgit From 2282679fb20bf036a714ed49fadd0230c278a203 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: mm: submit multipage write for SWP_FS_OPS swap-space swap_writepage() is given one page at a time, but may be called repeatedly in succession. For block-device swapspace, the blk_plug functionality allows the multiple pages to be combined together at lower layers. That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS are single page reads. With this patch we pass a pointer-to-pointer via the wbc. swap_writepage can store state between calls - much like the pointer passed explicitly to swap_readpage. After calling swap_writepage() some number of times, the state will be passed to swap_write_unplug() which can submit the combined request. Link: https://lkml.kernel.org/r/164859778128.29473.5191868522654408537.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 95ebcb5b3e12..a9761b04564c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1156,7 +1156,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct folio *folio, struct address_space *mapping) +static pageout_t pageout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug) { /* * If the folio is dirty, only perform writeback if that write @@ -1201,6 +1202,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping) .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, + .swap_plug = plug, }; folio_set_reclaim(folio); @@ -1533,6 +1535,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; + struct swap_iocb *plug = NULL; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1814,7 +1817,7 @@ retry: * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping)) { + switch (pageout(folio, mapping, &plug)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -1968,6 +1971,8 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + if (plug) + swap_write_unplug(plug); return nr_reclaimed; } -- cgit From d8ff6fde8e88d801b62328883680c202510ed518 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm/vmscan: take min_slab_pages into account when try to call shrink_node Since commit 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()"), slab reclaim and lru page reclaim are done together in the shrink_node. So we should take min_slab_pages into account when try to call shrink_node. Link: https://lkml.kernel.org/r/20220425112118.20924-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Huang Ying Cc: Christoph Hellwig Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index a9761b04564c..5ac0a71dc0df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4713,7 +4713,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc.reclaim_state); - if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. -- cgit From 048f6e1a427ee9cddf62f9b3766372c69846fa4f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: not necessary to re-init the list for each iteration node_page_list is defined with LIST_HEAD and be cleaned until list_empty. So it is not necessary to re-init it again. [akpm@linux-foundation.org: remove unneeded braces] Link: https://lkml.kernel.org/r/20220426021743.21007-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ac0a71dc0df..726f5ce366da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2553,10 +2553,8 @@ unsigned long reclaim_pages(struct list_head *page_list) while (!list_empty(page_list)) { page = lru_to_page(page_list); - if (nid == NUMA_NO_NODE) { + if (nid == NUMA_NO_NODE) nid = page_to_nid(page); - INIT_LIST_HEAD(&node_page_list); - } if (nid == page_to_nid(page)) { ClearPageActive(page); -- cgit From 32a331a72f3eec30f65fd929aeb4dfc514eca28f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: add a comment about MADV_FREE pages check in folio_check_dirty_writeback Patch series "A few cleanup and fixup patches for vmscan This series contains a few patches to remove obsolete comment, introduce helper to remove duplicated code and so no. Also we take all base pages of THP into account in rare race condition. More details can be found in the respective changelogs. This patch (of 6): The MADV_FREE pages check in folio_check_dirty_writeback is a bit hard to follow. Add a comment to make the code clear. Link: https://lkml.kernel.org/r/20220425111232.23182-2-linmiaohe@huawei.com Suggested-by: Huang, Ying Signed-off-by: Miaohe Lin Reviewed-by: Christoph Hellwig Reviewed-by: Oscar Salvador Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 726f5ce366da..4ea23807d23f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1435,7 +1435,10 @@ static void folio_check_dirty_writeback(struct folio *folio, /* * Anonymous pages are not handled by flushers and must be written - * from reclaim context. Do not stall reclaim based on them + * from reclaim context. Do not stall reclaim based on them. + * MADV_FREE anonymous pages are put into inactive file list too. + * They could be mistakenly treated as file lru. So further anon + * test is needed. */ if (!folio_is_file_lru(folio) || (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { -- cgit From 1fe47c0beb2df2739b07b8e051425b6400abce5b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:22:59 -0700 Subject: mm/vmscan: introduce helper function reclaim_page_list() Introduce helper function reclaim_page_list() to eliminate the duplicated code of doing shrink_page_list() and putback_lru_page. Also we can separate node reclaim from node page list operation this way. No functional change intended. Link: https://lkml.kernel.org/r/20220425111232.23182-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4ea23807d23f..68d4004e0397 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2536,14 +2536,12 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -unsigned long reclaim_pages(struct list_head *page_list) +static unsigned int reclaim_page_list(struct list_head *page_list, + struct pglist_data *pgdat) { - int nid = NUMA_NO_NODE; - unsigned int nr_reclaimed = 0; - LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; - struct page *page; - unsigned int noreclaim_flag; + unsigned int nr_reclaimed; + struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, @@ -2552,6 +2550,24 @@ unsigned long reclaim_pages(struct list_head *page_list) .no_demotion = 1, }; + nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(page_list)) { + folio = lru_to_folio(page_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_reclaimed; +} + +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = NUMA_NO_NODE; + unsigned int nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct page *page; + unsigned int noreclaim_flag; + noreclaim_flag = memalloc_noreclaim_save(); while (!list_empty(page_list)) { @@ -2565,28 +2581,12 @@ unsigned long reclaim_pages(struct list_head *page_list) continue; } - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); nid = NUMA_NO_NODE; } - if (!list_empty(&node_page_list)) { - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - } + if (!list_empty(&node_page_list)) + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); -- cgit From 9aafcffc18785fcdd9295640eb2ed927960b31a1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: take all base pages of THP into account when race with speculative reference If the page has buffers, shrink_page_list will try to free the buffer mappings associated with the page and try to free the page as well. In the rare race with speculative reference, the page will be freed shortly by speculative reference. But nr_reclaimed is not incremented correctly when we come across the THP. We need to account all the base pages in this case. Link: https://lkml.kernel.org/r/20220425111232.23182-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 68d4004e0397..61688f4a9d1e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1884,7 +1884,7 @@ retry: * increment nr_reclaimed here (and * leave it off the LRU). */ - nr_reclaimed++; + nr_reclaimed += nr_pages; continue; } } -- cgit From 4355e4b265ccb55dd6625b82f0e2016f42f2956c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: remove obsolete comment in kswapd_run Since commit 6b700b5b3c59 ("mm/vmscan.c: remove cpu online notification for now"), cpu online notification is removed. So kswapd won't move to proper cpus if cpus are hot-added. Remove this obsolete comment. Link: https://lkml.kernel.org/r/20220425111232.23182-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 61688f4a9d1e..5c20efe0c82a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4567,7 +4567,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ void kswapd_run(int nid) { -- cgit From f19a27e399c4354b91d608dd77f33877f613224a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: use helper folio_is_file_lru() Use helper folio_is_file_lru() to check whether folio is file lru. Minor readability improvement. [linmiaohe@huawei.com: use folio_is_file_lru()] Link: https://lkml.kernel.org/r/20220428105802.21389-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220425111232.23182-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Christoph Hellwig Cc: Huang, Ying Cc: Joonsoo Kim Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c20efe0c82a..9ff3cc2a941a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1414,14 +1414,14 @@ static enum page_references folio_check_references(struct folio *folio, /* * Activate file-backed executable folios after first usage. */ - if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio)) + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ - if (referenced_folio && !folio_test_swapbacked(folio)) + if (referenced_folio && folio_is_file_lru(folio)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; -- cgit From 1ae65e2749b0a3d236fc17d0eca5ea5f6f2c0032 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: filter empty page_list at the beginning node_page_list would always be !empty on finishing the loop, except page_list is empty. Let's handle empty page_list before doing any real work including touching PF_MEMALLOC flag. Link: https://lkml.kernel.org/r/20220429014426.29223-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ff3cc2a941a..43883ff89c1a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2568,9 +2568,12 @@ unsigned long reclaim_pages(struct list_head *page_list) struct page *page; unsigned int noreclaim_flag; + if (list_empty(page_list)) + return nr_reclaimed; + noreclaim_flag = memalloc_noreclaim_save(); - while (!list_empty(page_list)) { + do { page = lru_to_page(page_list); if (nid == NUMA_NO_NODE) nid = page_to_nid(page); @@ -2583,10 +2586,9 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); nid = NUMA_NO_NODE; - } + } while (!list_empty(page_list)); - if (!list_empty(&node_page_list)) - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); -- cgit From ed657e5568c5fc877c517b654d65fbdaeb628539 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 12 May 2022 20:23:00 -0700 Subject: mm/vmscan: don't use NUMA_NO_NODE as indicator of page on different node Now we are sure there is at least one page on page_list, so it is safe to get the nid of it. This means it is not necessary to use NUMA_NO_NODE as an indicator for the beginning of iteration or a page on different node. Link: https://lkml.kernel.org/r/20220429014426.29223-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 43883ff89c1a..9cfce1449af4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2562,7 +2562,7 @@ static unsigned int reclaim_page_list(struct list_head *page_list, unsigned long reclaim_pages(struct list_head *page_list) { - int nid = NUMA_NO_NODE; + int nid; unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct page *page; @@ -2573,10 +2573,9 @@ unsigned long reclaim_pages(struct list_head *page_list) noreclaim_flag = memalloc_noreclaim_save(); + nid = page_to_nid(lru_to_page(page_list)); do { page = lru_to_page(page_list); - if (nid == NUMA_NO_NODE) - nid = page_to_nid(page); if (nid == page_to_nid(page)) { ClearPageActive(page); @@ -2585,7 +2584,7 @@ unsigned long reclaim_pages(struct list_head *page_list) } nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); - nid = NUMA_NO_NODE; + nid = page_to_nid(lru_to_page(page_list)); } while (!list_empty(page_list)); nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); -- cgit From 1bee2c1677bcb57dadd374cafb59be86ad1a1a82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: vmscan: use folio_mapped() in shrink_page_list() Remove some legacy function calls. Link: https://lkml.kernel.org/r/20220504182857.4013401-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9cfce1449af4..e25bca89c3c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,7 +1572,7 @@ retry: if (unlikely(!page_evictable(page))) goto activate_locked; - if (!sc->may_unmap && page_mapped(page)) + if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* @@ -1761,21 +1761,21 @@ retry: } /* - * The page is mapped into the page tables of one or more + * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page)) { + if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; - bool was_swapbacked = PageSwapBacked(page); + bool was_swapbacked = folio_test_swapbacked(folio); - if (PageTransHuge(page) && - thp_order(page) >= HPAGE_PMD_ORDER) + if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(folio, flags); - if (page_mapped(page)) { + if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; - if (!was_swapbacked && PageSwapBacked(page)) + if (!was_swapbacked && + folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } -- cgit From d33e4e1412c8b618f5f2f251ab9ddcfdf9f4adf3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert the writeback handling in shrink_page_list() to folios Slightly more efficient due to fewer calls to compound_head(). Link: https://lkml.kernel.org/r/20220504182857.4013401-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 78 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index e25bca89c3c1..2837e7e3677c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1598,40 +1598,42 @@ retry: stat->nr_congested += nr_pages; /* - * If a page at the tail of the LRU is under writeback, there + * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of pages - * under writeback and this page is both under writeback and - * PageReclaim then it indicates that pages are being queued - * for IO but are being recycled through the LRU before the - * IO can complete. Waiting on the page itself risks an - * indefinite stall if it is impossible to writeback the - * page due to IO error or disconnected storage so instead - * note that the LRU is being scanned too quickly and the - * caller can stall after page list has been processed. + * 1) If reclaim is encountering an excessive number of folios + * under writeback and this folio is both under + * writeback and has the reclaim flag set then it + * indicates that folios are being queued for I/O but + * are being recycled through the LRU before the I/O + * can complete. Waiting on the folio itself risks an + * indefinite stall if it is impossible to writeback + * the folio due to I/O error or disconnected storage + * so instead note that the LRU is being scanned too + * quickly and the caller can stall after the folio + * list has been processed. * - * 2) Global or new memcg reclaim encounters a page that is + * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the page for immediate + * not to fs). In this case mark the folio for immediate * reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which - * may not have submitted IO yet. And the loop driver might - * enter reclaim, and deadlock if it waits on a page for + * may not have submitted I/O yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) Legacy memcg encounters a page that is already marked - * PageReclaim. memcg does not have any dirty pages + * 3) Legacy memcg encounters a folio that already has the + * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many - * pages are in writeback and there is nothing else to + * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * - * In cases 1) and 2) we activate the pages to get them out of - * the way while we continue scanning for clean pages on the + * In cases 1) and 2) we activate the folios to get them out of + * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. @@ -1639,38 +1641,42 @@ retry: * memory pressure on the cache working set any longer than it * takes to write them to disk. */ - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { /* Case 1 above */ if (current_is_kswapd() && - PageReclaim(page) && + folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || - !PageReclaim(page) || !may_enter_fs(page, sc->gfp_mask)) { + !folio_test_reclaim(folio) || + !may_enter_fs(page, sc->gfp_mask)) { /* - * This is slightly racy - end_page_writeback() - * might have just cleared PageReclaim, then - * setting PageReclaim here end up interpreted - * as PageReadahead - but that does not matter - * enough to care. What we do want is for this - * page to have PageReclaim set next time memcg - * reclaim reaches the tests above, so it will - * then wait_on_page_writeback() to avoid OOM; - * and it's also appropriate in global reclaim. + * This is slightly racy - + * folio_end_writeback() might have just + * cleared the reclaim flag, then setting + * reclaim here ends up interpreted as + * the readahead flag - but that does + * not matter enough to care. What we + * do want is for this folio to have + * the reclaim flag set next time memcg + * reclaim reaches the tests above, so + * it will then folio_wait_writeback() + * to avoid OOM; and it's also appropriate + * in global reclaim. */ - SetPageReclaim(page); + folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { - unlock_page(page); - wait_on_page_writeback(page); - /* then go back and try same page again */ - list_add_tail(&page->lru, page_list); + folio_unlock(folio); + folio_wait_writeback(folio); + /* then go back and try same folio again */ + list_add_tail(&folio->lru, page_list); continue; } } -- cgit From 09c02e56327bdaf9cdbb2742a35fb8c6a6f9a6c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: swap: convert add_to_swap() to take a folio The only caller already has a folio available, so this saves a conversion. Also convert the return type to boolean. Link: https://lkml.kernel.org/r/20220504182857.4013401-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2837e7e3677c..f2fc5aa38cb8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1731,8 +1731,8 @@ retry: page_list)) goto activate_locked; } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) + if (!add_to_swap(folio)) { + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, @@ -1741,7 +1741,7 @@ retry: #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); #endif - if (!add_to_swap(page)) + if (!add_to_swap(folio)) goto activate_locked_split; } -- cgit From 49bd2bf9679f4a5b30236546fca61e66b989ce96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert dirty page handling to folios Mostly this just eliminates calls to compound_head(), but NR_VMSCAN_IMMEDIATE was being incremented by 1 instead of by nr_pages. Link: https://lkml.kernel.org/r/20220504182857.4013401-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f2fc5aa38cb8..773b88c8e7f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1787,28 +1787,31 @@ retry: } } - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { /* - * Only kswapd can writeback filesystem pages + * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid - * injecting inefficient single-page IO into + * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only - * write pages when we've encountered many - * dirty pages, and when we've already scanned - * the rest of the LRU for clean pages and see - * the same dirty pages again (PageReclaim). + * write folios when we've encountered many + * dirty folios, and when we've already scanned + * the rest of the LRU for clean folios and see + * the same dirty folios again (with the reclaim + * flag set). */ - if (page_is_file_lru(page) && - (!current_is_kswapd() || !PageReclaim(page) || + if (folio_is_file_lru(folio) && + (!current_is_kswapd() || + !folio_test_reclaim(folio) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. - * Similar in principal to deactivate_page() - * except we already have the page isolated + * Similar in principle to deactivate_page() + * except we already have the folio isolated * and know it's dirty */ - inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); + node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, + nr_pages); + folio_set_reclaim(folio); goto activate_locked; } @@ -1821,8 +1824,8 @@ retry: goto keep_locked; /* - * Page is dirty. Flush the TLB if a writable entry - * potentially exists to avoid CPU writes after IO + * Folio is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); @@ -1834,23 +1837,24 @@ retry: case PAGE_SUCCESS: stat->nr_pageout += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) goto keep; - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go - * ahead and try to reclaim the page. + * ahead and try to reclaim the folio. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || + folio_test_writeback(folio)) goto keep_locked; - mapping = page_mapping(page); + mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: - ; /* try to free the page below */ + ; /* try to free the folio below */ } } -- cgit From 0a36111c8c20b2edc7c83f084bdba2be9d42c1e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: vmscan: convert page buffer handling to use folios This mostly just removes calls to compound_head() although nr_reclaimed should be incremented by the number of pages, not just 1. Link: https://lkml.kernel.org/r/20220504182857.4013401-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 773b88c8e7f8..00b985caf40d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1859,38 +1859,40 @@ retry: } /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. + * If the folio has buffers, try to free the buffer + * mappings associated with this folio. If we succeed + * we try to free the folio as well. * - * We do this even if the page is PageDirty(). - * try_to_release_page() does not perform I/O, but it is - * possible for a page to have PageDirty set, but it is actually - * clean (all its buffers are clean). This happens if the - * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. - * try_to_release_page() will discover that cleanness and will - * drop the buffers and mark the page clean - it can be freed. + * We do this even if the folio is dirty. + * filemap_release_folio() does not perform I/O, but it + * is possible for a folio to have the dirty flag set, + * but it is actually clean (all its buffers are clean). + * This happens if the buffers were written out directly, + * with submit_bh(). ext3 will do this, as well as + * the blockdev mapping. filemap_release_folio() will + * discover that cleanness and will drop the buffers + * and mark the folio clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. These are - * the pages which were not successfully invalidated in - * truncate_cleanup_page(). We try to drop those buffers here - * and if that worked, and the page is no longer mapped into - * process address space (page_count == 1) it can be freed. - * Otherwise, leave the page on the LRU so it is swappable. + * Rarely, folios can have buffers and no ->mapping. + * These are the folios which were not successfully + * invalidated in truncate_cleanup_folio(). We try to + * drop those buffers here and if that worked, and the + * folio is no longer mapped into process address space + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. */ - if (page_has_private(page)) { - if (!try_to_release_page(page, sc->gfp_mask)) + if (folio_has_private(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) { - unlock_page(page); - if (put_page_testzero(page)) + if (!mapping && folio_ref_count(folio) == 1) { + folio_unlock(folio); + if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free - * this page shortly, so we may + * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ -- cgit From 64daa5d818ae3430f0785206b0af13ef528cb9ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: convert lazy freeing to folios Remove a hidden call to compound_head(), and account nr_pages instead of a single page. This matches the code in lru_lazyfree_fn() that accounts nr_pages to PGLAZYFREE. Link: https://lkml.kernel.org/r/20220504182857.4013401-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 00b985caf40d..972557e2fb92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1902,20 +1902,20 @@ retry: } } - if (PageAnon(page) && !PageSwapBacked(page)) { + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ - if (!page_ref_freeze(page, 1)) + if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* - * The page has only one reference left, which is + * The folio has only one reference left, which is * from the isolation. After the caller puts the - * page back on lru and drops the reference, the - * page will be freed anyway. It doesn't matter - * which lru it goes. So we don't bother checking - * PageDirty here. + * folio back on the lru and drops the reference, the + * folio will be freed anyway. It doesn't matter + * which lru it goes on. So we don't bother checking + * the dirty flag here. */ - count_vm_event(PGLAZYFREED); - count_memcg_page_event(page, PGLAZYFREED); + count_vm_events(PGLAZYFREED, nr_pages); + count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; -- cgit From 5441d4902f969293b5bfe057e26038db1a8b342e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: move initialisation of mapping down Now that we don't interrogate the BDI for congestion, we can delay looking up the folio's mapping until we've got further through the function, reducing register pressure and saving a call to folio_mapping for folios we're adding to the swap cache. Link: https://lkml.kernel.org/r/20220504182857.4013401-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 972557e2fb92..725a291e9dad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1588,12 +1588,11 @@ retry: stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if the underlying BDI is or if + * Treat this page as congested if * pages are cycling through the LRU so quickly that the * pages marked for immediate reclaim are making it to the * end of the LRU a second time. */ - mapping = page_mapping(page); if (writeback && PageReclaim(page)) stat->nr_congested += nr_pages; @@ -1744,9 +1743,6 @@ retry: if (!add_to_swap(folio)) goto activate_locked_split; } - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); } } else if (PageSwapBacked(page) && PageTransHuge(page)) { /* Split shmem THP */ @@ -1787,6 +1783,7 @@ retry: } } + mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios -- cgit From 246b648038096c6024a812aac354d27e8da987a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: convert the activate_locked portion of shrink_page_list to folios This accounts the number of pages activated correctly for large folios. Link: https://lkml.kernel.org/r/20220504182857.4013401-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 725a291e9dad..0a527cebdff2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1946,15 +1946,16 @@ activate_locked_split: } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || - PageMlocked(page))) - try_to_free_swap(page); - VM_BUG_ON_PAGE(PageActive(page), page); - if (!PageMlocked(page)) { - int type = page_is_file_lru(page); - SetPageActive(page); + if (folio_test_swapcache(folio) && + (mem_cgroup_swap_full(&folio->page) || + folio_test_mlocked(folio))) + try_to_free_swap(&folio->page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + if (!folio_test_mlocked(folio)) { + int type = folio_is_file_lru(folio); + folio_set_active(folio); stat->nr_activate[type] += nr_pages; - count_memcg_page_event(page, PGACTIVATE); + count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: unlock_page(page); -- cgit From c28a0e9695b724fbaa58b1f5bbf0a03c5a79d721 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: remove remaining uses of page in shrink_page_list These are all straightforward conversions to the folio API. Link: https://lkml.kernel.org/r/20220504182857.4013401-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 122 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 60 insertions(+), 62 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0a527cebdff2..24dbe04520cb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1507,11 +1507,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return nr_succeeded; } -static bool may_enter_fs(struct page *page, gfp_t gfp_mask) +static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { if (gfp_mask & __GFP_FS) return true; - if (!PageSwapCache(page) || !(gfp_mask & __GFP_IO)) + if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) return false; /* * We can "enter_fs" for swap-cache with only __GFP_IO @@ -1520,7 +1520,7 @@ static bool may_enter_fs(struct page *page, gfp_t gfp_mask) * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ - return !data_race(page_swap_flags(page) & SWP_FS_OPS); + return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS); } /* @@ -1547,7 +1547,6 @@ static unsigned int shrink_page_list(struct list_head *page_list, retry: while (!list_empty(page_list)) { struct address_space *mapping; - struct page *page; struct folio *folio; enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; @@ -1557,19 +1556,18 @@ retry: folio = lru_to_folio(page_list); list_del(&folio->lru); - page = &folio->page; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); - /* Account the number of base pages even though THP */ + /* Account the number of base pages */ sc->nr_scanned += nr_pages; - if (unlikely(!page_evictable(page))) + if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) @@ -1578,7 +1576,7 @@ retry: /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing - * pages if the tail of the LRU is all dirty unqueued pages. + * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) @@ -1588,21 +1586,21 @@ retry: stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if - * pages are cycling through the LRU so quickly that the - * pages marked for immediate reclaim are making it to the - * end of the LRU a second time. + * Treat this folio as congested if folios are cycling + * through the LRU so quickly that the folios marked + * for immediate reclaim are making it to the end of + * the LRU a second time. */ - if (writeback && PageReclaim(page)) + if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of folios - * under writeback and this folio is both under - * writeback and has the reclaim flag set then it + * 1) If reclaim is encountering an excessive number + * of folios under writeback and this folio has both + * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an @@ -1651,19 +1649,19 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(page, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask)) { /* * This is slightly racy - - * folio_end_writeback() might have just - * cleared the reclaim flag, then setting - * reclaim here ends up interpreted as - * the readahead flag - but that does - * not matter enough to care. What we - * do want is for this folio to have - * the reclaim flag set next time memcg - * reclaim reaches the tests above, so - * it will then folio_wait_writeback() - * to avoid OOM; and it's also appropriate + * folio_end_writeback() might have + * just cleared the reclaim flag, then + * setting the reclaim flag here ends up + * interpreted as the readahead flag - but + * that does not matter enough to care. + * What we do want is for this folio to + * have the reclaim flag set next time + * memcg reclaim reaches the tests above, + * so it will then wait for writeback to + * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); @@ -1691,37 +1689,37 @@ retry: goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: - ; /* try to reclaim the page below */ + ; /* try to reclaim the folio below */ } /* - * Before reclaiming the page, try to relocate + * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && - (thp_migration_supported() || !PageTransHuge(page))) { - list_add(&page->lru, &demote_pages); - unlock_page(page); + (thp_migration_supported() || !folio_test_large(folio))) { + list_add(&folio->lru, &demote_pages); + folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. - * Lazyfree page could be freed directly + * Lazyfree folio could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page)) { - if (!PageSwapCache(page)) { + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { + if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split pages without a PMD map right + * Split folios without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ @@ -1744,20 +1742,19 @@ retry: goto activate_locked_split; } } - } else if (PageSwapBacked(page) && PageTransHuge(page)) { - /* Split shmem THP */ + } else if (folio_test_swapbacked(folio) && + folio_test_large(folio)) { + /* Split shmem folio */ if (split_folio_to_list(folio, page_list)) goto keep_locked; } /* - * THP may get split above, need minus tail pages and update - * nr_pages to avoid accounting tail pages twice. - * - * The tail pages that are added into swap cache successfully - * reach here. + * If the folio was split above, the tail pages will make + * their own pass through this function and be accounted + * then. */ - if ((nr_pages > 1) && !PageTransHuge(page)) { + if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } @@ -1815,7 +1812,7 @@ retry: if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs(page, sc->gfp_mask)) + if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; @@ -1917,11 +1914,11 @@ retry: sc->target_mem_cgroup)) goto keep_locked; - unlock_page(page); + folio_unlock(folio); free_it: /* - * THP may get swapped out in a whole, need account - * all base pages. + * Folio may get swapped out as a whole, need to account + * all pages in it. */ nr_reclaimed += nr_pages; @@ -1929,10 +1926,10 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) - destroy_compound_page(page); + if (unlikely(folio_test_large(folio))) + destroy_compound_page(&folio->page); else - list_add(&page->lru, &free_pages); + list_add(&folio->lru, &free_pages); continue; activate_locked_split: @@ -1958,18 +1955,19 @@ activate_locked: count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: - unlock_page(page); + folio_unlock(folio); keep: - list_add(&page->lru, &ret_pages); - VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); + list_add(&folio->lru, &ret_pages); + VM_BUG_ON_FOLIO(folio_test_lru(folio) || + folio_test_unevictable(folio), folio); } /* 'page_list' is always empty here */ - /* Migrate pages selected for demotion */ + /* Migrate folios selected for demotion */ nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Pages that could not be demoted are still in @demote_pages */ + /* Folios that could not be demoted are still in @demote_pages */ if (!list_empty(&demote_pages)) { - /* Pages which failed to demoted go back on @page_list for retry: */ + /* Folios which weren't demoted go back on @page_list for retry: */ list_splice_init(&demote_pages, page_list); do_demote_pass = false; goto retry; -- cgit From 6d4675e601357834dadd2ba1d803f6484596015c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm: don't be stuck to rmap lock on reclaim path The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended under memory pressure if processes keep working on their vmas(e.g., fork, mmap, munmap). It makes reclaim path stuck. In our real workload traces, we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it makes other processes entering direct reclaim, which were also stuck on the lock. This patch makes lru aging path try_lock mode like shink_page_list so the reclaim context will keep working with next lru pages without being stuck. if it found the rmap lock contended, it rotates the page back to head of lru in both active/inactive lrus to make them consistent behavior, which is basic starting point rather than adding more heristic. Since this patch introduces a new "contended" field as out-param along with try_lock in-param in rmap_walk_control, it's not immutable any longer if the try_lock is set so remove const keywords on rmap related functions. Since rmap walking is already expensive operation, I doubt the const would help sizable benefit( And we didn't have it until 5.17). In a heavy app workload in Android, trace shows following statistics. It almost removes rmap lock contention from reclaim path. Martin Liu reported: Before: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function 1632 0 1631 151.542173 31672 209 page_lock_anon_vma_read 601 0 601 145.544681 28817 198 rmap_walk_file After: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function NaN NaN NaN NaN NaN 0.0 NaN 0 0 0 0.127645 1 12 rmap_walk_file [minchan@kernel.org: add comment, per Matthew] Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: John Dias Cc: Tim Murray Cc: Matthew Wilcox Cc: Vladimir Davydov Cc: Martin Liu Cc: Minchan Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 24dbe04520cb..887edcd93a40 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1391,6 +1391,10 @@ static enum page_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -2499,8 +2503,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So -- cgit From 3f1509c57b1ba5646de0fb8d81bd7107aec22257 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 18 May 2022 15:09:11 -0400 Subject: Revert "mm/vmscan: never demote for memcg reclaim" This reverts commit 3a235693d3930e1276c8d9cc0ca5807ef292cf0a. Its premise was that cgroup reclaim cares about freeing memory inside the cgroup, and demotion just moves them around within the cgroup limit. Hence, pages from toptier nodes should be reclaimed directly. However, with NUMA balancing now doing tier promotions, demotion is part of the page aging process. Global reclaim demotes the coldest toptier pages to secondary memory, where their life continues and from which they have a chance to get promoted back. Essentially, tiered memory systems have an LRU order that spans multiple nodes. When cgroup reclaims pages coming off the toptier directly, there can be colder pages on lower tier nodes that were demoted by global reclaim. This is an aging inversion, not unlike if cgroups were to reclaim directly from the active lists while there are inactive pages. Proactive reclaim is another factor. The goal of that it is to offload colder pages from expensive RAM to cheaper storage. When lower tier memory is available as an intermediate layer, we want offloading to take advantage of it instead of bypassing to storage. Revert the patch so that cgroups respect the LRU order spanning the memory hierarchy. Of note is a specific undercommit scenario, where all cgroup limits in the system add up to <= available toptier memory. In that case, shuffling pages out to lower tiers first to reclaim them from there is inefficient. This is something could be optimized/short-circuited later on (although care must be taken not to accidentally recreate the aging inversion). Let's ensure correctness first. Link: https://lkml.kernel.org/r/20220518190911.82400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Dave Hansen Reviewed-by: Yang Shi Acked-by: Roman Gushchin Reviewed-by: "Huang, Ying" Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Tim Chen Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 887edcd93a40..76bca20679d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -528,13 +528,8 @@ static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) return false; - if (sc) { - if (sc->no_demotion) - return false; - /* It is pointless to do demotion in memcg reclaim */ - if (cgroup_reclaim(sc)) - return false; - } + if (sc && sc->no_demotion) + return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; -- cgit