Diffstat (limited to 'mm')
44 files changed, 656 insertions, 754 deletions
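The headline change in this series is the switch from address_space_operations-based movable pages to the new movable_operations. A minimal sketch of how a driver would register them after the series — not part of the patch; the demo_* names are hypothetical, and the callback signatures are inferred from the balloon_mops and secretmem conversions below, since struct movable_operations itself is defined outside mm/:

#include <linux/migrate.h>

/* Accept or refuse isolation of a driver-owned movable page. */
static bool demo_isolate(struct page *page, isolate_mode_t mode)
{
	return true;
}

/* Move the contents of src to dst; note there is no mapping argument any more. */
static int demo_migrate(struct page *dst, struct page *src,
			enum migrate_mode mode)
{
	return MIGRATEPAGE_SUCCESS;
}

/* Called when an isolated page is handed back without being migrated. */
static void demo_putback(struct page *page)
{
}

static const struct movable_operations demo_mops = {
	.isolate_page	= demo_isolate,
	.migrate_page	= demo_migrate,
	.putback_page	= demo_putback,
};

static void demo_mark_movable(struct page *page)
{
	/* The page must be locked; __SetPageMovable() asserts this. */
	__SetPageMovable(page, &demo_mops);
}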
diff --git a/mm/Kconfig b/mm/Kconfig index f73f5b272144..e59cf5fe5ce9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,14 +639,6 @@ config BOUNCE memory available to the CPU. Enabled by default when HIGHMEM is selected, but you may say n to override this. -config VIRT_TO_BUS - bool - help - An architecture should select this if it implements the - deprecated interface virt_to_bus(). All new architectures - should probably not select this. - - config MMU_NOTIFIER bool select SRCU diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4b8eab4b3f45..22c96fed70b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page) spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } - /* move_to_new_page() counterpart for a ballooned page */ -static int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, struct page *page, +static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); @@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping, return balloon->migratepage(balloon, newpage, page, mode); } -const struct address_space_operations balloon_aops = { - .migratepage = balloon_page_migrate, +const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_aops); +EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index d024d18e0b5c..640fa76228dd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list) } #ifdef CONFIG_COMPACTION - -int PageMovable(struct page *page) +bool PageMovable(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops; VM_BUG_ON_PAGE(!PageLocked(page), page); if (!__PageMovable(page)) - return 0; + return false; - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) - return 1; + mops = page_movable_ops(page); + if (mops) + return true; - return 0; + return false; } EXPORT_SYMBOL(PageMovable); -void __SetPageMovable(struct page *page, struct address_space *mapping) +void __SetPageMovable(struct page *page, const struct movable_operations *mops) { VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); + VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); } EXPORT_SYMBOL(__SetPageMovable); @@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page) { VM_BUG_ON_PAGE(!PageMovable(page), page); /* - * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE - * flag so that VM can catch up released page by driver after isolation. - * With it, VM migration doesn't try to put it back. + * This page still has the type of a movable page, but it's + * actually not movable any more. 
*/ - page->mapping = (void *)((unsigned long)page->mapping & - PAGE_MAPPING_MOVABLE); + page->mapping = (void *)PAGE_MAPPING_MOVABLE; } EXPORT_SYMBOL(__ClearPageMovable); @@ -1035,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Only pages without mappings or that have a - * ->migratepage callback are possible to migrate + * ->migrate_folio callback are possible to migrate * without blocking. However, we can be racing with * truncation so it's necessary to lock the page * to stabilise the mapping as truncation holds @@ -1046,7 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail_put; mapping = page_mapping(page); - migrate_dirty = !mapping || mapping->a_ops->migratepage; + migrate_dirty = !mapping || + mapping->a_ops->migrate_folio; unlock_page(page); if (!migrate_dirty) goto isolate_fail_put; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 59e1653799f8..3c7b9d6dca95 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -336,8 +336,7 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); - huge_ptep_set_access_flags(vma, addr, pte, entry, - vma->vm_flags & VM_WRITE); + set_huge_pte_at(mm, addr, pte, entry); } #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/filemap.c b/mm/filemap.c index cd59f055e29d..15800334147b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -923,26 +923,6 @@ error: } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); -/** - * add_to_page_cache_locked - add a locked page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add a page to the pagecache. It must be locked. - * This function does not add the page to the LRU. The caller must do that. - * - * Return: %0 on success, negative error code otherwise. - */ -int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) -{ - return __filemap_add_folio(mapping, page_folio(page), offset, - gfp_mask, NULL); -} -EXPORT_SYMBOL(add_to_page_cache_locked); - int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { @@ -1982,6 +1962,10 @@ no_page: gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; + if (fgp_flags & FGP_NOWAIT) { + gfp &= ~GFP_KERNEL; + gfp |= GFP_NOWAIT | __GFP_NOWARN; + } folio = filemap_alloc_folio(gfp, 0); if (!folio) @@ -2141,65 +2125,46 @@ put: return folio_batch_count(fbatch); } -static inline -bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) -{ - if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - return false; - if (index >= max) - return false; - return index < folio->index + folio_nr_pages(folio) - 1; -} - /** - * find_get_pages_range - gang pagecache lookup + * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed + * @fbatch: The batch to fill. * - * find_get_pages_range() will search for and return a group of up to @nr_pages - * pages in the mapping starting at index @start and up to index @end - * (inclusive). The pages are placed at @pages. find_get_pages_range() takes - * a reference against the returned pages. 
+ * Search for and return a batch of folios in the mapping starting at + * index @start and up to index @end (inclusive). The folios are returned + * in @fbatch with an elevated reference count. * - * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. - * We also update @start to index the next page for the traversal. + * The first folio may start before @start; if it does, it will contain + * @start. The final folio may extend beyond @end; if it does, it will + * contain @end. The folios have ascending indices. There may be gaps + * between the folios if there are indices which have no folio in the + * page cache. If folios are added to or removed from the page cache + * while this is running, they may or may not be found by this call. * - * Return: the number of pages which were found. If this number is - * smaller than @nr_pages, the end of specified range has been - * reached. + * Return: The number of folios which were found. + * We also update @start to index the next folio for the traversal. */ -unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, - pgoff_t end, unsigned int nr_pages, - struct page **pages) +unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; - unsigned ret = 0; - - if (unlikely(!nr_pages)) - return 0; rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { /* Skip over shadow, swap and DAX entries */ if (xa_is_value(folio)) continue; + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); -again: - pages[ret] = folio_file_page(folio, xas.xa_index); - if (++ret == nr_pages) { - *start = xas.xa_index + 1; + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; goto out; } - if (folio_more_pages(folio, xas.xa_index, end)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; - } } /* @@ -2215,7 +2180,18 @@ again: out: rcu_read_unlock(); - return ret; + return folio_batch_count(fbatch); +} +EXPORT_SYMBOL(filemap_get_folios); + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; } /** @@ -2403,7 +2379,7 @@ retry: rcu_read_unlock(); } -static int filemap_read_folio(struct file *file, struct address_space *mapping, +static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { int error; @@ -2415,7 +2391,7 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping, */ folio_clear_error(folio); /* Start the actual read. The read will unlock the page. 
*/ - error = mapping->a_ops->read_folio(file, folio); + error = filler(file, folio); if (error) return error; @@ -2424,7 +2400,8 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping, return error; if (folio_test_uptodate(folio)) return 0; - shrink_readahead_size_eio(&file->f_ra); + if (file) + shrink_readahead_size_eio(&file->f_ra); return -EIO; } @@ -2497,7 +2474,8 @@ static int filemap_update_page(struct kiocb *iocb, if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_folio(iocb->ki_filp, mapping, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); goto unlock_mapping; unlock: folio_unlock(folio); @@ -2540,7 +2518,7 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping, folio); + error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (error) goto error; @@ -3224,7 +3202,7 @@ page_not_uptodate: * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = filemap_read_folio(file, mapping, folio); + error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); @@ -3514,20 +3492,7 @@ repeat: return ERR_PTR(err); } -filler: - err = filler(file, folio); - if (err < 0) { - folio_put(folio); - return ERR_PTR(err); - } - - folio_wait_locked(folio); - if (!folio_test_uptodate(folio)) { - folio_put(folio); - return ERR_PTR(-EIO); - } - - goto out; + goto filler; } if (folio_test_uptodate(folio)) goto out; @@ -3550,14 +3515,14 @@ filler: goto out; } - /* - * A previous I/O error may have been due to temporary - * failures. - * Clear page error before actual read, PG_error will be - * set again if read page fails. - */ - folio_clear_error(folio); - goto filler; +filler: + err = filemap_read_folio(file, filler, folio); + if (err) { + folio_put(folio); + if (err == AOP_TRUNCATED_PAGE) + goto repeat; + return ERR_PTR(err); + } out: folio_mark_accessed(folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 20bc15b57d93..458618c7302c 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -51,28 +51,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -#ifdef CONFIG_MIGRATION -int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, int extra_count) -{ - return folio_migrate_mapping(mapping, page_folio(newpage), - page_folio(page), extra_count); -} -EXPORT_SYMBOL(migrate_page_move_mapping); - -void migrate_page_states(struct page *newpage, struct page *page) -{ - folio_migrate_flags(page_folio(newpage), page_folio(page)); -} -EXPORT_SYMBOL(migrate_page_states); - -void migrate_page_copy(struct page *newpage, struct page *page) -{ - folio_migrate_copy(page_folio(newpage), page_folio(page)); -} -EXPORT_SYMBOL(migrate_page_copy); -#endif - bool set_page_writeback(struct page *page) { return folio_start_writeback(page_folio(page)); @@ -87,7 +87,8 @@ retry: * belongs to this folio. 
*/ if (unlikely(page_folio(page) != folio)) { - folio_put_refs(folio, refs); + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); goto retry; } @@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - folio_put_refs(folio, refs); + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); } /** diff --git a/mm/highmem.c b/mm/highmem.c index e92a7ceb30e8..c707d7202d5f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -561,7 +561,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(__kmap_local_page_prot); -void kunmap_local_indexed(void *vaddr) +void kunmap_local_indexed(const void *vaddr) { unsigned long addr = (unsigned long) vaddr & PAGE_MASK; pte_t *kmap_pte; @@ -212,14 +212,6 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool hmm_is_device_private_entry(struct hmm_range *range, - swp_entry_t entry) -{ - return is_device_private_entry(entry) && - pfn_swap_entry_to_page(entry)->pgmap->owner == - range->dev_private_owner; -} - static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { @@ -252,10 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); /* - * Never fault in device private pages, but just report - * the PFN even if not present. + * Don't fault in device private pages owned by the caller, + * just report the PFN. */ - if (hmm_is_device_private_entry(range, entry)) { + if (is_device_private_entry(entry) && + pfn_swap_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; @@ -273,6 +267,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!non_swap_entry(entry)) goto fault; + if (is_device_private_entry(entry)) + goto fault; + if (is_device_exclusive_entry(entry)) goto fault; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 814020689d3e..8a7c1b344abe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,6 +18,7 @@ #include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/swapops.h> +#include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/khugepaged.h> #include <linux/freezer.h> @@ -2485,11 +2486,15 @@ static void __split_huge_page(struct page *page, struct list_head *list, __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { - ClearPageDirty(head + i); - __delete_from_page_cache(head + i, NULL); + struct folio *tail = page_folio(head + i); + if (shmem_mapping(head->mapping)) shmem_uncharge(head->mapping->host, 1); - put_page(head + i); + else if (folio_test_clear_dirty(tail)) + folio_account_cleaned(tail, + inode_to_wb(folio->mapping->host)); + __filemap_remove_folio(tail, NULL); + folio_put(tail); } else if (!PageAnon(page)) { __xa_store(&head->mapping->i_pages, head[i].index, head + i, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb763f5d30b9..f044962ad9df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4787,8 +4787,13 @@ again: * sharing with another vma. 
*/ ; - } else if (unlikely(is_hugetlb_entry_migration(entry) || - is_hugetlb_entry_hwpoisoned(entry))) { + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + bool uffd_wp = huge_pte_uffd_wp(entry); + + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(dst, addr, dst_pte, entry); + } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = huge_pte_uffd_wp(entry); @@ -5417,19 +5422,25 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); - int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + int err; - if (err) + __folio_set_locked(folio); + err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); + + if (unlikely(err)) { + __folio_clear_locked(folio); return err; + } ClearHPageRestoreReserve(page); /* - * set page dirty so that it will not be removed from cache/file + * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ - set_page_dirty(page); + folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -5950,6 +5961,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) { + put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; diff --git a/mm/internal.h b/mm/internal.h index caebaeb2e5c9..785409805ed7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -862,6 +862,8 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +extern bool mirrored_kernelcore; + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* diff --git a/mm/ioremap.c b/mm/ioremap.c index 5fe598ecd9b7..8652426282cc 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -11,29 +11,35 @@ #include <linux/io.h> #include <linux/export.h> -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { unsigned long offset, vaddr; phys_addr_t last_addr; struct vm_struct *area; /* Disallow wrap-around or zero size */ - last_addr = addr + size - 1; - if (!size || last_addr < addr) + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) return NULL; /* Page-align mappings */ - offset = addr & (~PAGE_MASK); - addr -= offset; + offset = phys_addr & (~PAGE_MASK); + phys_addr -= offset; size = PAGE_ALIGN(size + offset); + if (!ioremap_allowed(phys_addr, size, prot)) + return NULL; + area = get_vm_area_caller(size, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; vaddr = (unsigned long)area->addr; + area->phys_addr = phys_addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) { + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, + __pgprot(prot))) { free_vm_area(area); return NULL; } @@ -44,6 +50,12 @@ EXPORT_SYMBOL(ioremap_prot); void iounmap(volatile void __iomem *addr) { - vunmap((void *)((unsigned long)addr & PAGE_MASK)); + void *vaddr = (void *)((unsigned long)addr & PAGE_MASK); + + if (!iounmap_allowed(vaddr)) + return; + + if (is_vmalloc_addr(vaddr)) + vunmap(vaddr); } EXPORT_SYMBOL(iounmap); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 
707c3a527fcb..69f583855c8b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -108,9 +108,10 @@ void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) return; tag = kasan_random_tag(); + kasan_unpoison(set_tag(page_address(page), tag), + PAGE_SIZE << order, init); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } void __kasan_poison_pages(struct page *page, unsigned int order, bool init) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index a4f07de21771..0e3648b603a6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -295,9 +295,22 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); - shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); - shadow_end = ALIGN(shadow_end, PAGE_SIZE); + + /* + * User Mode Linux maps enough shadow memory for all of virtual memory + * at boot, so doesn't need to allocate more on vmalloc, just clear it. + * + * The remaining CONFIG_UML checks in this file exist for the same + * reason. + */ + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_VMALLOC_INVALID, shadow_end - shadow_start); + return 0; + } + + shadow_start = PAGE_ALIGN_DOWN(shadow_start); + shadow_end = PAGE_ALIGN(shadow_end); ret = apply_to_page_range(&init_mm, shadow_start, shadow_end - shadow_start, @@ -466,6 +479,10 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, if (shadow_end > shadow_start) { size = shadow_end - shadow_start; + if (IS_ENABLED(CONFIG_UML)) { + __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start); + return; + } apply_to_existing_page_range(&init_mm, (unsigned long)shadow_start, size, kasan_depopulate_vmalloc_pte, @@ -531,6 +548,11 @@ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) if (WARN_ON(!PAGE_ALIGNED(shadow_start))) return -EINVAL; + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_SHADOW_INIT, shadow_size); + return 0; + } + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, GFP_KERNEL, @@ -554,6 +576,9 @@ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) void kasan_free_module_shadow(const struct vm_struct *vm) { + if (IS_ENABLED(CONFIG_UML)) + return; + if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d39ffb058354..c252081b11df 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -603,14 +603,6 @@ static unsigned long kfence_init_pool(void) addr += 2 * PAGE_SIZE; } - /* - * The pool is live and will never be deallocated from this point on. - * Remove the pool object from the kmemleak object tree, as it would - * otherwise overlap with allocations returned by kfence_alloc(), which - * are registered with kmemleak through the slab post-alloc hook. - */ - kmemleak_free(__kfence_pool); - return 0; } @@ -623,8 +615,16 @@ static bool __init kfence_init_pool_early(void) addr = kfence_init_pool(); - if (!addr) + if (!addr) { + /* + * The pool is live and will never be deallocated from this point on. + * Ignore the pool object from the kmemleak phys object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. 
+ */ + kmemleak_ignore_phys(__pa(__kfence_pool)); return true; + } /* * Only release unprotected pages, and do not try to go back and change @@ -712,7 +712,7 @@ again: * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache - * in migrate_page_move_mapping(), it might still be our page, + * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { diff --git a/mm/memblock.c b/mm/memblock.c index b7ebf4b7e9d9..c0894c137954 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -331,7 +331,7 @@ again: NUMA_NO_NODE, flags); if (!ret && (flags & MEMBLOCK_MIRROR)) { - pr_warn("Could not allocate %pap bytes of mirrored memory\n", + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); flags &= ~MEMBLOCK_MIRROR; goto again; @@ -928,6 +928,9 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { + if (!mirrored_kernelcore) + return 0; + system_has_some_mirror = true; return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); @@ -1388,7 +1391,7 @@ again: if (flags & MEMBLOCK_MIRROR) { flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", &size); goto again; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cffc7cd24d8d..9a7a228ad04a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2050,7 +2050,7 @@ try_again: /* * Now take care of user space mappings. - * Abort on fail: __delete_from_page_cache() assumes unmapped page. + * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ if (!hwpoison_user_mappings(p, pfn, flags, p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); diff --git a/mm/memory.c b/mm/memory.c index 684c4c7cd2ff..4ba73f5aa8bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3051,7 +3051,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page)); + VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); /* * Clear the pages cpupid information as the existing @@ -4377,9 +4377,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - /* See comment in handle_pte_fault() */ + /* + * See comment in handle_pte_fault() for how this scenario happens, we + * need to return NOPAGE so that we drop this page. 
+ */ if (pmd_devmap_trans_unstable(vmf->pmd)) - return 0; + return VM_FAULT_NOPAGE; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); @@ -4806,6 +4809,19 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) goto split; if (vmf->vma->vm_ops->huge_fault) { vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); @@ -4816,19 +4832,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vmf->vma, vmf->pud, vmf->address); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - return VM_FAULT_FALLBACK; -} - -static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) - return VM_FAULT_FALLBACK; - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } diff --git a/mm/memremap.c b/mm/memremap.c index f0955785150f..58b20c3c300b 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -509,7 +509,7 @@ void free_zone_device_page(struct page *page) } #ifdef CONFIG_FS_DAX -bool __put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page_refs(struct page *page, int refs) { if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; @@ -519,9 +519,9 @@ bool __put_devmap_managed_page(struct page *page) * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. 
*/ - if (page_ref_dec_return(page) == 1) + if (page_ref_sub_return(page, refs) == 1) wake_up_var(&page->_refcount); return true; } -EXPORT_SYMBOL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page_refs); #endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index 1649270bc1a7..6a1597c92261 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -59,7 +59,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { - struct address_space *mapping; + const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), @@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (!PageMovable(page) || PageIsolated(page)) goto out_no_isolated; - mapping = page_mapping(page); - VM_BUG_ON_PAGE(!mapping, page); + mops = page_movable_ops(page); + VM_BUG_ON_PAGE(!mops, page); - if (!mapping->a_ops->isolate_page(page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ @@ -120,10 +120,9 @@ out: static void putback_movable_page(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops = page_movable_ops(page); - mapping = page_mapping(page); - mapping->a_ops->putback_page(page); + mops->putback_page(page); ClearPageIsolated(page); } @@ -352,13 +351,18 @@ unlock: } #endif -static int expected_page_refs(struct address_space *mapping, struct page *page) +static int folio_expected_refs(struct address_space *mapping, + struct folio *folio) { - int expected_count = 1; + int refs = 1; + if (!mapping) + return refs; - if (mapping) - expected_count += compound_nr(page) + page_has_private(page); - return expected_count; + refs += folio_nr_pages(folio); + if (folio_test_private(folio)) + refs++; + + return refs; } /* @@ -375,7 +379,7 @@ int folio_migrate_mapping(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(mapping, &folio->page) + extra_count; + int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); if (!mapping) { @@ -485,26 +489,26 @@ EXPORT_SYMBOL(folio_migrate_mapping); * of folio_migrate_mapping(). */ int migrate_huge_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct folio *dst, struct folio *src) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(src)); int expected_count; xas_lock_irq(&xas); - expected_count = 2 + page_has_private(page); - if (!page_ref_freeze(page, expected_count)) { + expected_count = 2 + folio_has_private(src); + if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } - newpage->index = page->index; - newpage->mapping = page->mapping; + dst->index = src->index; + dst->mapping = src->mapping; - get_page(newpage); + folio_get(dst); - xas_store(&xas, newpage); + xas_store(&xas, dst); - page_ref_unfreeze(page, expected_count - 1); + folio_ref_unfreeze(src, expected_count - 1); xas_unlock_irq(&xas); @@ -604,34 +608,37 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ -/* - * Common logic to directly migrate a single LRU page suitable for - * pages that do not use PagePrivate/PagePrivate2. +/** + * migrate_folio() - Simple folio migration. + * @mapping: The address_space containing the folio. 
+ * @dst: The folio to migrate the data to. + * @src: The folio containing the current data. + * @mode: How to migrate the page. + * + * Common logic to directly migrate a single LRU folio suitable for + * folios that do not use PagePrivate/PagePrivate2. * - * Pages are locked upon entry and exit. + * Folios are locked upon entry and exit. */ -int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +int migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { - struct folio *newfolio = page_folio(newpage); - struct folio *folio = page_folio(page); int rc; - BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */ + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - rc = folio_migrate_mapping(mapping, newfolio, folio, 0); + rc = folio_migrate_mapping(mapping, dst, src, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(newfolio, folio); + folio_migrate_copy(dst, src); else - folio_migrate_flags(newfolio, folio); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } -EXPORT_SYMBOL(migrate_page); +EXPORT_SYMBOL(migrate_folio); #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ @@ -672,23 +679,23 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, return true; } -static int __buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, +static int __buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode, bool check_refs) { struct buffer_head *bh, *head; int rc; int expected_count; - if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page, mode); + head = folio_buffers(src); + if (!head) + return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(mapping, page); - if (page_count(page) != expected_count) + expected_count = folio_expected_refs(mapping, src); + if (folio_ref_count(src) != expected_count) return -EAGAIN; - head = page_buffers(page); if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN; @@ -719,23 +726,22 @@ recheck_buffers: } } - rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = folio_migrate_mapping(mapping, dst, src, 0); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - attach_page_private(newpage, detach_page_private(page)); + folio_attach_private(dst, folio_detach_private(src)); bh = head; do { - set_bh_page(bh, newpage, bh_offset(bh)); + set_bh_page(bh, &dst->page, bh_offset(bh)); bh = bh->b_this_page; - } while (bh != head); if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); rc = MIGRATEPAGE_SUCCESS; unlock_buffers: @@ -745,43 +751,79 @@ unlock_buffers: do { unlock_buffer(bh); bh = bh->b_this_page; - } while (bh != head); return rc; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. For example attached buffer heads are accessed only under page lock. +/** + * buffer_migrate_folio() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. 
+ * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * This function can only be used if the underlying filesystem guarantees + * that no other references to @src exist. For example attached buffer + * heads are accessed only under the folio lock. If your filesystem cannot + * provide this guarantee, buffer_migrate_folio_norefs() may be more + * appropriate. + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, false); + return __buffer_migrate_folio(mapping, dst, src, mode, false); } -EXPORT_SYMBOL(buffer_migrate_page); +EXPORT_SYMBOL(buffer_migrate_folio); -/* - * Same as above except that this variant is more careful and checks that there - * are also no buffer head references. This function is the right one for - * mappings where buffer heads are directly looked up and referenced (such as - * block device mappings). +/** + * buffer_migrate_folio_norefs() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * Like buffer_migrate_folio() except that this variant is more careful + * and checks that there are also no buffer head references. This function + * is the right one for mappings where buffer heads are directly looked + * up and referenced (such as block device mappings). + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page_norefs(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio_norefs(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, true); + return __buffer_migrate_folio(mapping, dst, src, mode, true); } #endif +int filemap_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + int ret; + + ret = folio_migrate_mapping(mapping, dst, src, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (folio_get_private(src)) + folio_attach_private(dst, folio_detach_private(src)); + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} +EXPORT_SYMBOL_GPL(filemap_migrate_folio); + /* - * Writeback a page to clean the dirty state + * Writeback a folio to clean the dirty state */ -static int writeout(struct address_space *mapping, struct page *page) +static int writeout(struct address_space *mapping, struct folio *folio) { - struct folio *folio = page_folio(page); struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, @@ -795,25 +837,25 @@ static int writeout(struct address_space *mapping, struct page *page) /* No write method for the address space */ return -EINVAL; - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) /* Someone else already triggered a write */ return -EAGAIN; /* - * A dirty page may imply that the underlying filesystem has - * the page on some queue. So the page must be clean for - * migration. 
Writeout may mean we loose the lock and the - * page state is no longer what we checked for earlier. + * A dirty folio may imply that the underlying filesystem has + * the folio on some queue. So the folio must be clean for + * migration. Writeout may mean we lose the lock and the + * folio state is no longer what we checked for earlier. * At this point we know that the migration attempt cannot * be successful. */ remove_migration_ptes(folio, folio, false); - rc = mapping->a_ops->writepage(page, &wbc); + rc = mapping->a_ops->writepage(&folio->page, &wbc); if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ - lock_page(page); + folio_lock(folio); return (rc < 0) ? -EIO : -EAGAIN; } @@ -821,11 +863,11 @@ static int writeout(struct address_space *mapping, struct page *page) /* * Default handling if a filesystem does not provide a migration function. */ -static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +static int fallback_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - if (PageDirty(page)) { - /* Only writeback pages in full synchronous migration */ + if (folio_test_dirty(src)) { + /* Only writeback folios in full synchronous migration */ switch (mode) { case MIGRATE_SYNC: case MIGRATE_SYNC_NO_COPY: @@ -833,18 +875,18 @@ static int fallback_migrate_page(struct address_space *mapping, default: return -EBUSY; } - return writeout(mapping, page); + return writeout(mapping, src); } /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_test_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } /* @@ -861,32 +903,32 @@ static int fallback_migrate_page(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { - struct address_space *mapping; int rc = -EAGAIN; bool is_lru = !__PageMovable(&src->page); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - mapping = folio_mapping(src); - if (likely(is_lru)) { + struct address_space *mapping = folio_mapping(src); + if (!mapping) - rc = migrate_page(mapping, &dst->page, &src->page, mode); - else if (mapping->a_ops->migratepage) + rc = migrate_folio(mapping, dst, src, mode); + else if (mapping->a_ops->migrate_folio) /* - * Most pages have a mapping and most filesystems - * provide a migratepage callback. Anonymous pages + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own - * migratepage callback. This is the most common path + * migrate_folio callback. This is the most common path * for page migration. */ - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); else - rc = fallback_migrate_page(mapping, &dst->page, - &src->page, mode); + rc = fallback_migrate_folio(mapping, dst, src, mode); } else { + const struct movable_operations *mops; + /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. 
@@ -898,8 +940,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); + mops = page_movable_ops(&src->page); + rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7feeb447e3b9..27fb37d65476 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -728,7 +728,8 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 55c2776ae699..d0d466a5c804 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct bdi_writeback *wb, - unsigned long pages_dirtied) +static int balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; @@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; + int ret = 0; for (;;) { unsigned long now = jiffies; @@ -1628,6 +1629,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb, } /* + * In laptop mode, we wait until hitting the higher threshold + * before starting background writeout, and then write out all + * the way down to the lower threshold. So slow writers cause + * minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + !writeback_in_progress(wb)) + wb_start_background_writeback(wb); + + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the wb limits are ramping up in case of !strictlimit. @@ -1657,6 +1671,7 @@ free_running: break; } + /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); @@ -1715,8 +1730,8 @@ free_running: sdtc = mdtc; } - if (dirty_exceeded && !wb->dirty_exceeded) - wb->dirty_exceeded = 1; + if (dirty_exceeded != wb->dirty_exceeded) + wb->dirty_exceeded = dirty_exceeded; if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) @@ -1789,6 +1804,10 @@ pause: period, pause, start_time); + if (flags & BDP_ASYNC) { + ret = -EAGAIN; + break; + } __set_current_state(TASK_KILLABLE); wb->dirty_sleep = now; io_schedule_timeout(pause); @@ -1820,26 +1839,7 @@ pause: if (fatal_signal_pending(current)) break; } - - if (!dirty_exceeded && wb->dirty_exceeded) - wb->dirty_exceeded = 0; - - if (writeback_in_progress(wb)) - return; - - /* - * In laptop mode, we wait until hitting the higher threshold before - * starting background writeout, and then write out all the way down - * to the lower threshold. 
So slow writers cause minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (laptop_mode) - return; - - if (nr_reclaimable > gdtc->bg_thresh) - wb_start_background_writeback(wb); + return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited - balance dirty memory state - * @mapping: address_space which was dirtied + * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. + * @mapping: address_space which was dirtied. + * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * Once we're over the dirty memory limit we decrease the ratelimiting - * by a lot, to prevent individual processes from overshooting the limit - * by (ratelimit_pages) each. + * See balance_dirty_pages_ratelimited() for details. + * + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to + * indicate that memory is out of balance and the caller must wait + * for I/O to complete. Otherwise, it will return 0 to indicate + * that either memory was already in balance, or it was able to sleep + * until the amount of dirty memory returned to balance. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, + unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; + int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) - return; + return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); @@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(wb, current->nr_dirtied); + ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); + return ret; +} + +/** + * balance_dirty_pages_ratelimited - balance dirty memory state. + * @mapping: address_space which was dirtied. + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * Once we're over the dirty memory limit we decrease the ratelimiting + * by a lot, to prevent individual processes from overshooting the limit + * by (ratelimit_pages) each. 
+ */ +void balance_dirty_pages_ratelimited(struct address_space *mapping) +{ + balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4a56b0defcd1..e5486d47406e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -434,7 +434,7 @@ static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; -static bool mirrored_kernelcore __meminitdata; +bool mirrored_kernelcore __initdata_memblock; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -2436,7 +2436,7 @@ static inline bool check_new_pcp(struct page *page, unsigned int order) } #endif /* CONFIG_DEBUG_VM */ -static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +static inline bool should_skip_kasan_unpoison(gfp_t flags) { /* Don't skip if a software KASAN mode is enabled. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC) || @@ -2448,12 +2448,10 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) return true; /* - * With hardware tag-based KASAN enabled, skip if either: - * - * 1. Memory tags have already been cleared via tag_clear_highpage(). - * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + * With hardware tag-based KASAN enabled, skip if this has been + * requested via __GFP_SKIP_KASAN_UNPOISON. */ - return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); + return flags & __GFP_SKIP_KASAN_UNPOISON; } static inline bool should_skip_init(gfp_t flags) @@ -2472,6 +2470,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + int i; set_page_private(page, 0); set_page_refcounted(page); @@ -2497,8 +2496,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * should be initialized as well). */ if (init_tags) { - int i; - /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2506,13 +2503,17 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* Note that memory is already initialized by the loop above. */ init = false; } - if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + if (!should_skip_kasan_unpoison(gfp_flags)) { /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); /* Note that memory is already initialized by KASAN. */ if (kasan_has_integrated_init()) init = false; + } else { + /* Ensure page_address() dereferencing does not fault. */ + for (i = 0; i != 1 << order; ++i) + page_kasan_tag_reset(page + i); } /* If memory is still not initialized, do it now. */ if (init) @@ -4048,11 +4049,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * need to be calculated. */ if (!order) { - long fast_free; + long usable_free; + long reserved; + + usable_free = free_pages; + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); - fast_free = free_pages; - fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); - if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) + /* reserved may over estimate high-atomic reserves. 
*/ + usable_free -= min(usable_free, reserved); + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) return true; } diff --git a/mm/rmap.c b/mm/rmap.c index fb6b3b47f3e4..edc06c52bc82 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1896,8 +1896,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + if (folio_is_zone_device(folio)) { + /* + * Our PTE is a non-present device exclusive entry and + * calculating the subpage as for the common case would + * result in an invalid pointer. + * + * Since only PAGE_SIZE pages can currently be + * migrated, just set it to page. This will need to be + * changed when hugepage migrations to device private + * memory are supported. + */ + VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); + subpage = &folio->page; + } else { + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); + } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1989,15 +2004,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* * No need to invalidate here it will synchronize on * against the special swap migration pte. - * - * The assignment to subpage above was computed from a - * swap PTE which results in an invalid pointer. - * Since only PAGE_SIZE pages can currently be - * migrated, just set it to page. This will need to be - * changed when hugepage migrations to device private - * memory are supported. */ - subpage = &folio->page; } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { diff --git a/mm/secretmem.c b/mm/secretmem.c index 206ed6b40c1d..e3e9590c6fb3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -55,22 +55,28 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; + vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); + filemap_invalidate_lock_shared(mapping); + retry: page = find_lock_page(mapping, offset); if (!page) { page = alloc_page(gfp | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } err = set_direct_map_invalid_noflush(page); if (err) { put_page(page); - return vmf_error(err); + ret = vmf_error(err); + goto out; } __SetPageUptodate(page); @@ -86,7 +92,8 @@ retry: if (err == -EEXIST) goto retry; - return vmf_error(err); + ret = vmf_error(err); + goto out; } addr = (unsigned long)page_address(page); @@ -94,7 +101,11 @@ retry: } vmf->page = page; - return VM_FAULT_LOCKED; + ret = VM_FAULT_LOCKED; + +out: + filemap_invalidate_unlock_shared(mapping); + return ret; } static const struct vm_operations_struct secretmem_vm_ops = { @@ -133,14 +144,8 @@ static const struct file_operations secretmem_fops = { .mmap = secretmem_mmap, }; -static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) -{ - return false; -} - -static int secretmem_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int secretmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } @@ -154,20 +159,27 @@ static void secretmem_free_folio(struct folio *folio) const struct address_space_operations secretmem_aops = 
{ .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, - .migratepage = secretmem_migratepage, - .isolate_page = secretmem_isolate_page, + .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); + struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; + int ret; + + filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) - return -EINVAL; + ret = -EINVAL; + else + ret = simple_setattr(mnt_userns, dentry, iattr); + + filemap_invalidate_unlock(mapping); - return simple_setattr(mnt_userns, dentry, iattr); + return ret; } static const struct inode_operations secretmem_iops = { @@ -180,11 +192,20 @@ static struct file *secretmem_file_create(unsigned long flags) { struct file *file = ERR_PTR(-ENOMEM); struct inode *inode; + const char *anon_name = "[secretmem]"; + const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); + int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); + err = security_inode_init_security_anon(inode, &qname, NULL); + if (err) { + file = ERR_PTR(err); + goto err_free_inode; + } + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) diff --git a/mm/shmem.c b/mm/shmem.c index 06871a913b49..e975fcd9d2e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -393,7 +393,7 @@ void shmem_uncharge(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; - /* nrpages adjustment done by __delete_from_page_cache() or caller */ + /* nrpages adjustment done by __filemap_remove_folio() or caller */ spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; @@ -694,7 +694,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Like add_to_page_cache_locked, but error if expected item has gone. + * Like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, @@ -868,18 +868,17 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) */ void shmem_unlock_mapping(struct address_space *mapping) { - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. 
*/ - while (!mapping_unevictable(mapping)) { - if (!pagevec_lookup(&pvec, mapping, &index)) - break; - check_move_unevictable_pages(&pvec); - pagevec_release(&pvec); + while (!mapping_unevictable(mapping) && + filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { + check_move_unevictable_folios(&fbatch); + folio_batch_release(&fbatch); cond_resched(); } } @@ -3450,7 +3449,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->blocks > S64_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; @@ -3572,10 +3571,7 @@ static int shmem_reconfigure(struct fs_context *fc) raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (ctx->blocks > S64_MAX) { - err = "Number of blocks too large"; - goto out; - } + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; @@ -3860,7 +3856,7 @@ const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif .error_remove_page = shmem_error_remove_page, }; diff --git a/mm/slab.c b/mm/slab.c index 47151fb2b2d2..10e96137b44f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3223,7 +3223,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ } /* ___cache_alloc_node can fall back to other nodes */ ptr = ____cache_alloc_node(cachep, flags, nodeid); - out: +out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); init = slab_want_init_on_alloc(flags, cachep); @@ -3252,7 +3252,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) if (!objp) objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - out: +out: return objp; } #else @@ -3398,9 +3398,10 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, { bool init; + memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); + if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); - memcg_slab_free_hook(cachep, &objp, 1); __kfence_free(objp); return; } @@ -3433,7 +3434,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - memcg_slab_free_hook(cachep, &objp, 1); /* * Skip calling cache_free_alien() when the platform is not numa. 
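The shmem_unlock_mapping() conversion above shows the folio_batch lookup idiom that replaces pagevec_lookup() throughout this series. A minimal sketch of the pattern for any pagecache walk, assuming only that the caller has an address_space; the function name and the unbounded ~0UL end index are illustrative:

static void walk_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* each folio is returned once and carries a reference */
		}
		folio_batch_release(&fbatch);	/* drop those references */
		cond_resched();
	}
}

Unlike the removed find_get_pages_range(), a large folio fills a single batch slot rather than one slot per subpage, so batch counts no longer scale with folio size.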
@@ -3470,7 +3470,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, cachep->object_size, cachep->size, flags); return ret; @@ -3543,7 +3543,7 @@ error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); slab_post_alloc_hook(s, objcg, flags, i, p, false); - __kmem_cache_free_bulk(s, i, p); + kmem_cache_free_bulk(s, i, p); return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3557,7 +3557,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, + trace_kmalloc(_RET_IP_, ret, cachep, size, cachep->size, flags); return ret; } @@ -3582,7 +3582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, + trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, cachep->object_size, cachep->size, flags, nodeid); @@ -3601,7 +3601,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(_RET_IP_, ret, cachep, size, cachep->size, flags, nodeid); return ret; @@ -3684,7 +3684,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, ret = slab_alloc(cachep, NULL, flags, size, caller); ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, + trace_kmalloc(caller, ret, cachep, size, cachep->size, flags); return ret; diff --git a/mm/slab.h b/mm/slab.h index db9fb5c8dae7..4ec82bec15ec 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -380,15 +380,6 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); -/* - * Generic implementation of bulk operations - * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the object listed - * may be allocated or freed using these operations. - */ -void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); - static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
@@ -547,36 +538,22 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_put(objcg); } -static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct kmem_cache *s; struct obj_cgroup **objcgs; - struct obj_cgroup *objcg; - struct slab *slab; - unsigned int off; int i; if (!memcg_kmem_enabled()) return; - for (i = 0; i < objects; i++) { - if (unlikely(!p[i])) - continue; - - slab = virt_to_slab(p[i]); - /* we could be given a kmalloc_large() object, skip those */ - if (!slab) - continue; - - objcgs = slab_objcgs(slab); - if (!objcgs) - continue; + objcgs = slab_objcgs(slab); + if (!objcgs) + return; - if (!s_orig) - s = slab->slab_cache; - else - s = s_orig; + for (i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; off = obj_to_index(s, slab, p[i]); objcg = objcgs[off]; @@ -628,7 +605,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, { } -static inline void memcg_slab_free_hook(struct kmem_cache *s, +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 77c3adf40e50..17996649cfe3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -26,13 +26,12 @@ #include <linux/memcontrol.h> #include <linux/stackdepot.h> -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> - #include "internal.h" - #include "slab.h" +#define CREATE_TRACE_POINTS +#include <trace/events/kmem.h> + enum slab_state slab_state; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); @@ -105,33 +104,6 @@ static inline int kmem_cache_sanity_check(const char *name, unsigned int size) } #endif -void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) -{ - size_t i; - - for (i = 0; i < nr; i++) { - if (s) - kmem_cache_free(s, p[i]); - else - kfree(p[i]); - } -} - -int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, - void **p) -{ - size_t i; - - for (i = 0; i < nr; i++) { - void *x = p[i] = kmem_cache_alloc(s, flags); - if (!x) { - __kmem_cache_free_bulk(s, i, p); - return 0; - } - } - return i; -} - /* * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. 
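With the generic __kmem_cache_alloc_bulk()/__kmem_cache_free_bulk() fallbacks deleted above, SLAB, SLUB and SLOB each provide kmem_cache_alloc_bulk() and kmem_cache_free_bulk() directly. A hedged caller sketch (the cache and array names are illustrative, not from this patch); note the all-or-nothing contract, where a return of 0 means nothing was allocated:

	void *objs[16];
	int nr;

	nr = kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
	if (!nr)
		return -ENOMEM;
	/* ... use objs[0] through objs[nr - 1] ... */
	kmem_cache_free_bulk(my_cache, nr, objs);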
@@ -959,7 +931,7 @@ EXPORT_SYMBOL(kmalloc_order); void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); return ret; } EXPORT_SYMBOL(kmalloc_order_trace); diff --git a/mm/slob.c b/mm/slob.c index f47811f09aca..2bd4f476c340 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,7 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, + trace_kmalloc_node(caller, ret, NULL, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,7 +516,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, + trace_kmalloc_node(caller, ret, NULL, size, PAGE_SIZE << order, gfp, node); } @@ -616,12 +616,12 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, + trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, + trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } @@ -692,16 +692,33 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { - __kmem_cache_free_bulk(s, size, p); + size_t i; + + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } EXPORT_SYMBOL(kmem_cache_free_bulk); -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + + if (!x) { + kmem_cache_free_bulk(s, i, p); + return 0; + } + } + return i; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); diff --git a/mm/slub.c b/mm/slub.c index b1281b8654bd..862dbd9af4f5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3257,7 +3257,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, s->size, gfpflags); return ret; @@ -3280,7 +3280,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru); void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } @@ -3292,7 +3292,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, + trace_kmem_cache_alloc_node(_RET_IP_, ret, s, s->object_size, s->size, 
gfpflags, node); return ret; @@ -3306,7 +3306,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, gfpflags, node); ret = kasan_kmalloc(s, ret, size, gfpflags); @@ -3464,9 +3464,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long tid; - /* memcg_slab_free_hook() is already called for bulk free. */ - if (!tail) - memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. @@ -3526,9 +3523,10 @@ redo: } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int cnt, + void *head, void *tail, void **p, int cnt, unsigned long addr) { + memcg_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. @@ -3550,7 +3548,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; trace_kmem_cache_free(_RET_IP_, x, s->name); - slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_); + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -3591,88 +3589,67 @@ static inline int build_detached_freelist(struct kmem_cache *s, size_t size, void **p, struct detached_freelist *df) { - size_t first_skipped_index = 0; int lookahead = 3; void *object; struct folio *folio; - struct slab *slab; - - /* Always re-init detached_freelist */ - df->slab = NULL; - - do { - object = p[--size]; - /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ - } while (!object && size); - - if (!object) - return 0; + size_t same; + object = p[--size]; folio = virt_to_folio(object); if (!s) { /* Handle kalloc'ed objects */ if (unlikely(!folio_test_slab(folio))) { free_large_kmalloc(folio, object); - p[size] = NULL; /* mark object processed */ + df->slab = NULL; return size; } /* Derive kmem_cache from object */ - slab = folio_slab(folio); - df->s = slab->slab_cache; + df->slab = folio_slab(folio); + df->s = df->slab->slab_cache; } else { - slab = folio_slab(folio); + df->slab = folio_slab(folio); df->s = cache_from_obj(s, object); /* Support for memcg */ } - if (is_kfence_address(object)) { - slab_free_hook(df->s, object, false); - __kfence_free(object); - p[size] = NULL; /* mark object processed */ - return size; - } - /* Start new detached freelist */ - df->slab = slab; - set_freepointer(df->s, object, NULL); df->tail = object; df->freelist = object; - p[size] = NULL; /* mark object processed */ df->cnt = 1; + if (is_kfence_address(object)) + return size; + + set_freepointer(df->s, object, NULL); + + same = size; while (size) { object = p[--size]; - if (!object) - continue; /* Skip processed objects */ - /* df->slab is always set at this point */ if (df->slab == virt_to_slab(object)) { /* Opportunity build freelist */ set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; - p[size] = NULL; /* mark object processed */ - + same--; + if (size != same) + swap(p[size], p[same]); continue; } /* Limit look ahead search */ if (!--lookahead) break; - - if (!first_skipped_index) - first_skipped_index = size + 1; } - return first_skipped_index; + return same; } /* Note that interrupts must be enabled when calling this function. 
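 *
 * The array may mix objects from different slabs, and @s may be NULL for
 * kfree()-style freeing: build_detached_freelist() above peels off one
 * same-slab group per pass, swapping matched pointers toward the tail of
 * @p instead of NULLing them out. A hedged caller sketch with
 * illustrative names:
 *
 *	void *objs[2] = { obj_a, obj_b };
 *
 *	kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);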
*/ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { - if (WARN_ON(!size)) + if (!size) return; - memcg_slab_free_hook(s, p, size); do { struct detached_freelist df; @@ -3680,7 +3657,8 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; - slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, + _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); @@ -3760,7 +3738,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, error: slub_put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); - __kmem_cache_free_bulk(s, i, p); + kmem_cache_free_bulk(s, i, p); return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -4441,7 +4419,7 @@ void *__kmalloc(size_t size, gfp_t flags) ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); ret = kasan_kmalloc(s, ret, size, flags); @@ -4475,7 +4453,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(_RET_IP_, ret, NULL, size, PAGE_SIZE << get_order(size), flags, node); @@ -4489,7 +4467,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); - trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); ret = kasan_kmalloc(s, ret, size, flags); @@ -4581,7 +4559,7 @@ void kfree(const void *x) return; } slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_); + slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -4890,6 +4868,9 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, s = find_mergeable(size, align, flags, name, ctor); if (s) { + if (sysfs_slab_alias(s, name)) + return NULL; + s->refcount++; /* @@ -4898,11 +4879,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, */ s->object_size = max(s->object_size, size); s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); - - if (sysfs_slab_alias(s, name)) { - s->refcount--; - s = NULL; - } } return s; @@ -4948,7 +4924,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, NULL, gfpflags, caller, size); /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, size, s->size, gfpflags); + trace_kmalloc(caller, ret, s, size, s->size, gfpflags); return ret; } @@ -4964,7 +4940,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, gfpflags, node); - trace_kmalloc_node(caller, ret, + trace_kmalloc_node(caller, ret, NULL, size, PAGE_SIZE << get_order(size), gfpflags, node); @@ -4979,7 +4955,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); /* Honor the call site pointer we received. 
*/ - trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); return ret; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 98529356b9cd..5f0ed4717ed0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -78,6 +78,14 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as independent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); @@ -528,7 +536,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - pr_warn("[%lx-%lx] potential offnode page_structs\n", + pr_warn_once("[%lx-%lx] potential offnode page_structs\n", start, end - 1); } diff --git a/mm/swap.c b/mm/swap.c index 1f563d857768..9cee7f6a3809 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1055,35 +1055,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } -/** - * pagevec_lookup_range - gang pagecache lookup - * @pvec: Where the resulting pages are placed - * @mapping: The address_space to search - * @start: The starting page index - * @end: The final page index - * - * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE - * pages in the mapping starting from index @start and upto index @end - * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a - * reference against the pages in @pvec. - * - * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. We - * also update @start to index the next page for the traversal. - * - * pagevec_lookup_range() returns the number of pages which were found. If this - * number is smaller than PAGEVEC_SIZE, the end of specified range has been - * reached. 
- */ -unsigned pagevec_lookup_range(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *start, pgoff_t end) -{ - pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, - pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range); - unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 2a65a89b5b4d..10b94d64cc25 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -307,7 +307,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) entry.val = 0; if (folio_test_large(folio)) { - if (IS_ENABLED(CONFIG_THP_SWAP)) + if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) get_swap_pages(1, &entry, folio_nr_pages(folio)); goto out; } diff --git a/mm/swap_state.c b/mm/swap_state.c index cc9c061c2579..e166051566f4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif }; @@ -82,7 +82,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ int add_to_swap_cache(struct page *page, swp_entry_t entry, diff --git a/mm/truncate.c b/mm/truncate.c index ab50d0d59a2a..0b0708bf935f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -443,7 +443,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of - * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * deletion (inside __filemap_remove_folio()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ diff --git a/mm/usercopy.c b/mm/usercopy.c index 4e1da708699b..c1ee15a98633 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -161,7 +161,7 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n, static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { - uintptr_t addr = (uintptr_t)ptr; + unsigned long addr = (unsigned long)ptr; unsigned long offset; struct folio *folio; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4f4892a5f767..07d3befc80e4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -246,7 +246,10 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_READ); + ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find page. 
*/ + if (ret == -ENOENT) + ret = -EFAULT; if (ret) goto out; if (!page) { diff --git a/mm/util.c b/mm/util.c index 5df8f2db7ca9..c9439c66d8cf 100644 --- a/mm/util.c +++ b/mm/util.c @@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio_swap_entry(folio)); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); + return mapping; } EXPORT_SYMBOL(folio_mapping); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9e7d8db42918..b2b1431352dc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4852,45 +4852,57 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) } #endif +void check_move_unevictable_pages(struct pagevec *pvec) +{ + struct folio_batch fbatch; + unsigned i; + + folio_batch_init(&fbatch); + for (i = 0; i < pvec->nr; i++) { + struct page *page = pvec->pages[i]; + + if (PageTransTail(page)) + continue; + folio_batch_add(&fbatch, page_folio(page)); + } + check_move_unevictable_folios(&fbatch); +} +EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + /** - * check_move_unevictable_pages - check pages for evictability and move to - * appropriate zone lru list - * @pvec: pagevec with lru pages to check + * check_move_unevictable_folios - Move evictable folios to appropriate zone + * lru list + * @fbatch: Batch of lru folios to check. * - * Checks pages for evictability, if an evictable page is in the unevictable + * Checks folios for evictability, if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function - * should be only used for lru pages. + * should be only used for lru folios. 
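 *
 * Every folio in @fbatch must carry a reference; a large folio occupies
 * one slot and is accounted with folio_nr_pages(). A hedged caller
 * sketch (mapping and index are assumed to exist in the caller):
 *
 *	folio_batch_init(&fbatch);
 *	while (filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
 *		check_move_unevictable_folios(&fbatch);
 *		folio_batch_release(&fbatch);
 *	}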
*/ -void check_move_unevictable_pages(struct pagevec *pvec) +void check_move_unevictable_folios(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; - for (i = 0; i < pvec->nr; i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - int nr_pages; - - if (PageTransTail(page)) - continue; + for (i = 0; i < fbatch->nr; i++) { + struct folio *folio = fbatch->folios[i]; + int nr_pages = folio_nr_pages(folio); - nr_pages = thp_nr_pages(page); pgscanned += nr_pages; - /* block memcg migration during page moving between lru */ - if (!TestClearPageLRU(page)) + /* block memcg migration while the folio moves between lrus */ + if (!folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (page_evictable(page) && PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + if (folio_evictable(folio) && folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); pgrescued += nr_pages; } - SetPageLRU(page); + folio_set_lru(folio); } if (lruvec) { @@ -4901,4 +4913,4 @@ void check_move_unevictable_pages(struct pagevec *pvec) count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } -EXPORT_SYMBOL_GPL(check_move_unevictable_pages); +EXPORT_SYMBOL_GPL(check_move_unevictable_folios); diff --git a/mm/z3fold.c b/mm/z3fold.c index f41f8b0d9e9a..cf71da10d04e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,15 +34,11 @@ #include <linux/node.h> #include <linux/compaction.h> #include <linux/percpu.h> -#include <linux/mount.h> -#include <linux/pseudo_fs.h> -#include <linux/fs.h> #include <linux/preempt.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> -#include <linux/magic.h> #include <linux/kmemleak.h> /* @@ -149,7 +145,6 @@ struct z3fold_header { * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release - * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -169,7 +164,6 @@ struct z3fold_pool { struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; - struct inode *inode; }; /* @@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) } } -static int z3fold_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, Z3FOLD_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type z3fold_fs = { - .name = "z3fold", - .init_fs_context = z3fold_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *z3fold_mnt; -static int __init z3fold_mount(void) -{ - int ret = 0; - - z3fold_mnt = kern_mount(&z3fold_fs); - if (IS_ERR(z3fold_mnt)) - ret = PTR_ERR(z3fold_mnt); - - return ret; -} - -static void z3fold_unmount(void) -{ - kern_unmount(z3fold_mnt); -} - -static const struct address_space_operations z3fold_aops; -static int z3fold_register_migration(struct z3fold_pool *pool) -{ - pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &z3fold_aops; - return 0; -} - -static void z3fold_unregister_migration(struct z3fold_pool *pool) -{ - if (pool->inode) - iput(pool->inode); -} - /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) @@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; - if (z3fold_register_migration(pool)) - goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; -out_rwq: - destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: @@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); - z3fold_unregister_migration(pool); free_percpu(pool->unbuddied); kfree(pool); } +static const struct movable_operations z3fold_mops; + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -1117,11 +1060,11 @@ retry: } if (can_sleep) { lock_page(page); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } z3fold_page_lock(zhdr); @@ -1554,12 +1497,11 @@ out: return false; } -static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int z3fold_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - struct address_space *new_mapping; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); @@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa * so we only have to reinitialize it. 
*/ INIT_LIST_HEAD(&new_zhdr->buddy); - new_mapping = page_mapping(page); __ClearPageMovable(page); get_page(newpage); @@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa spin_lock(&pool->lock); list_add(&newpage->lru, &pool->lru); spin_unlock(&pool->lock); - __SetPageMovable(newpage, new_mapping); + __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); @@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page) z3fold_page_unlock(zhdr); } -static const struct address_space_operations z3fold_aops = { +static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, - .migratepage = z3fold_page_migrate, + .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, }; @@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - int ret; - /* * Make sure the z3fold header is not larger than the page size and * there has remaining spaces for its buddy. */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); - ret = z3fold_mount(); - if (ret) - return ret; - zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1764,7 +1699,6 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { - z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9e13fd7ee635..34f784a1604b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -41,7 +41,6 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/magic.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> @@ -59,8 +58,6 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> -#include <linux/mount.h> -#include <linux/pseudo_fs.h> #include <linux/migrate.h> #include <linux/wait.h> #include <linux/pagemap.h> @@ -177,10 +174,6 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif -#ifdef CONFIG_COMPACTION -static struct vfsmount *zsmalloc_mnt; -#endif - /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where @@ -252,7 +245,6 @@ struct zs_pool { struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION - struct inode *inode; struct work_struct free_work; #endif /* protect page/zspage migration */ @@ -271,6 +263,7 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; #endif @@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -static int zs_register_migration(struct zs_pool *pool); -static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); @@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static int zsmalloc_mount(void) { return 0; } -static void zsmalloc_unmount(void) {} -static int zs_register_migration(struct zs_pool *pool) { return 0; } -static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct 
zspage *zspage) {} @@ -1086,6 +1073,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); + zspage->pool = pool; return zspage; } @@ -1757,33 +1745,6 @@ static void lock_zspage(struct zspage *zspage) migrate_read_unlock(zspage); } -static int zs_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type zsmalloc_fs = { - .name = "zsmalloc", - .init_fs_context = zs_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static int zsmalloc_mount(void) -{ - int ret = 0; - - zsmalloc_mnt = kern_mount(&zsmalloc_fs); - if (IS_ERR(zsmalloc_mnt)) - ret = PTR_ERR(zsmalloc_mnt); - - return ret; -} - -static void zsmalloc_unmount(void) -{ - kern_unmount(zsmalloc_mnt); -} - static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); @@ -1826,6 +1787,8 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static const struct movable_operations zsmalloc_mops; + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1846,7 +1809,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; - __SetPageMovable(newpage, page_mapping(oldpage)); + __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1868,8 +1831,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -static int zs_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int zs_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; @@ -1892,14 +1855,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - pool = mapping->private_data; + /* The page is locked, so this pointer must remain valid */ + zspage = get_zspage(page); + pool = zspage->pool; /* * The pool migrate_lock protects the race between zpage migration * and zs_free. */ write_lock(&pool->migrate_lock); - zspage = get_zspage(page); class = zspage_class(pool, zspage); /* @@ -1967,31 +1931,12 @@ static void zs_page_putback(struct page *page) migrate_write_unlock(zspage); } -static const struct address_space_operations zsmalloc_aops = { +static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, - .migratepage = zs_page_migrate, + .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; -static int zs_register_migration(struct zs_pool *pool) -{ - pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &zsmalloc_aops; - return 0; -} - -static void zs_unregister_migration(struct zs_pool *pool) -{ - flush_work(&pool->free_work); - iput(pool->inode); -} - /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. 
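zsmalloc now reaches its pool through the zspage->pool back-pointer set in alloc_zspage(), so no address_space or pseudo filesystem is needed. The registration pattern for any driver mirrors what z3fold and zsmalloc do above; a minimal hypothetical sketch (all my_* names are placeholders, not part of this patch):

static bool my_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* take driver-private locks; refuse if the page is busy */
	return true;
}

static int my_migrate_page(struct page *newpage, struct page *page,
			   enum migrate_mode mode)
{
	/* copy payload and private state from page to newpage */
	return MIGRATEPAGE_SUCCESS;
}

static void my_putback_page(struct page *page)
{
	/* isolation was aborted; requeue the page on driver lists */
}

static const struct movable_operations my_mops = {
	.isolate_page	= my_isolate_page,
	.migrate_page	= my_migrate_page,
	.putback_page	= my_putback_page,
};

Each page is then tagged while locked, exactly as z3fold_alloc() and SetZsPageMovable() do:

	lock_page(page);
	__SetPageMovable(page, &my_mops);
	unlock_page(page);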
@@ -2035,6 +1980,11 @@ static void kick_deferred_free(struct zs_pool *pool) schedule_work(&pool->free_work); } +static void zs_flush_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); +} + static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); @@ -2046,10 +1996,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } +#else +static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* @@ -2328,9 +2280,6 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); - if (zs_register_migration(pool)) - goto err; - /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If @@ -2352,7 +2301,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); - zs_unregister_migration(pool); + zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { @@ -2384,14 +2333,10 @@ static int __init zs_init(void) { int ret; - ret = zsmalloc_mount(); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) - goto hp_setup_fail; + goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); @@ -2401,8 +2346,6 @@ static int __init zs_init(void) return 0; -hp_setup_fail: - zsmalloc_unmount(); out: return ret; } @@ -2412,7 +2355,6 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - zsmalloc_unmount(); cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); |
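Finally, the swap_slots.c hunk earlier gates THP swap on arch_thp_swp_supported(). The generic fallback returns true; an architecture opts out by supplying its own definition. A hedged sketch of such an override (the MTE check is the arm64 motivation, used here only as an example):

/* in an architecture's asm/pgtable.h */
#define arch_thp_swp_supported arch_thp_swp_supported
static inline bool arch_thp_swp_supported(void)
{
	/* e.g. decline THP swapout when memory tags could not be preserved */
	return !system_supports_mte();
}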