diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/Makefile | 8 | ||||
-rw-r--r-- | mm/bootmem.c | 180 | ||||
-rw-r--r-- | mm/compaction.c | 11 | ||||
-rw-r--r-- | mm/huge_memory.c | 83 | ||||
-rw-r--r-- | mm/internal.h | 5 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 13 | ||||
-rw-r--r-- | mm/memblock.c | 10 | ||||
-rw-r--r-- | mm/memcontrol.c | 266 | ||||
-rw-r--r-- | mm/memory-failure.c | 126 | ||||
-rw-r--r-- | mm/memory.c | 103 | ||||
-rw-r--r-- | mm/mempolicy.c | 18 | ||||
-rw-r--r-- | mm/migrate.c | 15 | ||||
-rw-r--r-- | mm/mlock.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nobootmem.c | 435 | ||||
-rw-r--r-- | mm/page_alloc.c | 98 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 1 | ||||
-rw-r--r-- | mm/rmap.c | 54 | ||||
-rw-r--r-- | mm/shmem.c | 15 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 13 | ||||
-rw-r--r-- | mm/vmscan.c | 40 |
24 files changed, 1003 insertions, 512 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3ad483bdf505..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. diff --git a/mm/Makefile b/mm/Makefile index 2b1b575ae712..42a8326c3e3d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ @@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793c..07aeb89e396e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,6 +23,13 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; @@ -35,7 +42,6 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif -#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -146,7 +152,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -#endif + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -171,53 +177,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } -#ifdef CONFIG_NO_BOOTMEM -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int i; - unsigned long start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); - - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); - - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); -} - -unsigned long __init free_all_memory_core_early(int nodeid) -{ - int i; - u64 start, end; - unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); - } - - return count; -} -#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -278,7 +237,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -289,12 +247,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); -#ifdef CONFIG_NO_BOOTMEM - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -#else return free_all_bootmem_core(pgdat->bdata); -#endif } /** @@ -304,16 +257,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { -#ifdef CONFIG_NO_BOOTMEM - /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed - * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related - */ - return free_all_memory_core_early(MAX_NUMNODES); -#else unsigned long total_pages = 0; bootmem_data_t *bdata; @@ -321,10 +264,8 @@ unsigned long __init free_all_bootmem(void) total_pages += free_all_bootmem_core(bdata); return total_pages; -#endif } -#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -419,7 +360,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } -#endif /** * free_bootmem_node - mark a page range as usable @@ -434,10 +374,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); -#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -446,7 +382,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -#endif } /** @@ -460,10 +395,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); -#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -472,7 +403,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); -#endif } /** @@ -489,17 +419,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -#endif } /** @@ -515,20 +440,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); -#endif } -#ifndef CONFIG_NO_BOOTMEM int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { @@ -685,33 +604,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } -#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -#ifdef CONFIG_NO_BOOTMEM - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -#else bootmem_data_t *bdata; void *region; @@ -737,7 +635,6 @@ restart: } return NULL; -#endif } /** @@ -758,10 +655,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem_nopanic(size, align, goal, limit); } @@ -798,14 +691,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem(size, align, goal, limit); } -#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -822,7 +710,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } -#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -842,24 +729,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); -#endif - - return ptr; + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -880,13 +753,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - new_goal, -1ULL); -#else ptr = alloc_bootmem_core(pgdat->bdata, size, align, new_goal, 0); -#endif if (ptr) return ptr; } @@ -907,16 +775,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { -#ifdef CONFIG_NO_BOOTMEM - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -926,7 +784,6 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); -#endif } #endif @@ -938,16 +795,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); -#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); -#endif if (ptr) return ptr; @@ -995,21 +847,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#endif - return ptr; } diff --git a/mm/compaction.c b/mm/compaction.c index 6d592a021072..8be430b812de 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -406,6 +406,10 @@ static int compact_finished(struct zone *zone, if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ if (cc->order == -1) return COMPACT_CONTINUE; @@ -454,6 +458,13 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_SKIPPED; /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + + /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 004c9c2aac78..113e35c47502 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page) /* after clearing PageTail the gup refcount can be released */ smp_mb(); - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | @@ -1203,6 +1208,8 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); + mem_cgroup_split_huge_fixup(page, page_tail); + lru_add_page_tail(zone, page, page_tail); } @@ -1738,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1754,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm, #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + return; + } #else VM_BUG_ON(*hpage); /* @@ -1766,18 +1778,19 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); return; } -#endif if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { up_read(&mm->mmap_sem); put_page(new_page); return; } +#endif /* after allocating the hugepage upgrade to mmap_sem write mode */ up_read(&mm->mmap_sem); @@ -1804,6 +1817,8 @@ static void collapse_huge_page(struct mm_struct *mm, /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ if (!vma->anon_vma || vma->vm_ops || vma->vm_file) goto out; + if (is_vma_temporary_stack(vma)) + goto out; VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); pgd = pgd_offset(mm, address); @@ -1837,15 +1852,14 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); spin_unlock(ptl); - pte_unmap(pte); if (unlikely(!isolated)) { + pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); - mem_cgroup_uncharge_page(new_page); goto out; } @@ -1856,6 +1870,7 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_unlock(vma->anon_vma); __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); VM_BUG_ON(page_count(pgtable) != 1); @@ -1890,6 +1905,7 @@ out_up_write: return; out: + mem_cgroup_uncharge_page(new_page); #ifdef CONFIG_NUMA put_page(new_page); #endif @@ -1909,6 +1925,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1939,6 +1956,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1955,7 +1979,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } @@ -2024,32 +2048,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { - khugepaged_scan.address = vma->vm_end; - progress++; - continue; - } + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) { - progress++; - continue; - } + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; - if (khugepaged_scan.address > hend) { - khugepaged_scan.address = hend + HPAGE_PMD_SIZE; - progress++; - continue; - } - BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { int ret; @@ -2078,7 +2097,7 @@ breakouterloop: breakouterloop_mmap_sem: spin_lock(&khugepaged_mm_lock); - BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2233,9 +2252,9 @@ static int khugepaged(void *none) for (;;) { mutex_unlock(&khugepaged_mutex); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) diff --git a/mm/internal.h b/mm/internal.h index 69488205723d..3438dd43a062 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking); - #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..84225f3b7190 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } diff --git a/mm/memblock.c b/mm/memblock.c index 400dc62697d7..4618fda975a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; @@ -683,13 +681,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { - int idx = memblock_search(&memblock.reserved, base); + int idx = memblock_search(&memblock.memory, base); if (idx == -1) return 0; - return memblock.reserved.regions[idx].base <= base && - (memblock.reserved.regions[idx].base + - memblock.reserved.regions[idx].size) >= (base + size); + return memblock.memory.regions[idx].base <= base && + (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= (base + size); } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ab841031436..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -600,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) + bool file, int nr_pages) { - int val = (charge) ? 1 : -1; - preempt_disable(); - if (PageCgroupCache(pc)) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + if (file) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - if (charge) + /* pagein of a big page is an event. So, ignore page size */ + if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); - __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); preempt_enable(); } @@ -815,7 +816,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) return; VM_BUG_ON(list_empty(&pc->lru)); @@ -836,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -857,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) += 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) return; @@ -1030,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return NULL; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; @@ -1119,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1615,7 +1626,7 @@ void mem_cgroup_update_page_stat(struct page *page, if (unlikely(!mem || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem))) { + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ move_lock_page_cgroup(pc, &flags); need_unlock = true; @@ -1840,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* @@ -2084,14 +2107,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ -static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { + int nr_pages = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2115,43 +2151,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, pc, true); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); + unlock_page_cgroup(pc); + /* + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + memcg_check_events(mem, pc->page); } -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) -{ - int i; - int count = page_size >> PAGE_SHIFT; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); + if (mem_cgroup_disabled()) return; - } - /* - * we don't need page_cgroup_lock about tail pages, becase they are not - * accessed by any other context at this point. + * We have no races with charge/uncharge but will have races with + * page state accounting. */ - for (i = 0; i < count; i++) - ____mem_cgroup_commit_charge(mem, pc + i, ctype); + move_lock_page_cgroup(head_pc, &flags); - unlock_page_cgroup(pc); - /* - * "charge_statistics" updated event counter. Then, check it. - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. - */ - memcg_check_events(mem, pc->page); + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + if (PageCgroupAcctLRU(head_pc)) { + enum lru_list lru; + struct mem_cgroup_per_zone *mz; + + /* + * LRU flags cannot be copied because we need to add tail + *.page to LRU by generic call and our hook will be called. + * We hold lru_lock, then, reduce counter directly. + */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc); + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + } + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); } +#endif /** * __mem_cgroup_move_account - move account of the page @@ -2171,8 +2221,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, */ static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, + int charge_size) { + int nr_pages = charge_size >> PAGE_SHIFT; + VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); VM_BUG_ON(!page_is_cgroup_locked(pc)); @@ -2186,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, pc, false); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from, PAGE_SIZE); + mem_cgroup_cancel_charge(from, charge_size); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, pc, true); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -2208,15 +2261,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * __mem_cgroup_move_account() */ static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, + bool uncharge, int charge_size) { int ret = -EINVAL; unsigned long flags; + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) + return -EBUSY; lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { move_lock_page_cgroup(pc, &flags); - __mem_cgroup_move_account(pc, from, to, uncharge); + __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); move_unlock_page_cgroup(pc, &flags); ret = 0; } @@ -2241,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; + int page_size = PAGE_SIZE; + unsigned long flags; int ret; /* Is ROOT ? */ @@ -2253,15 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; + parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, - PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent, true); + if (page_size > PAGE_SIZE) + flags = compound_lock_irqsave(page); + + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, PAGE_SIZE); + mem_cgroup_cancel_charge(parent, page_size); + + if (page_size > PAGE_SIZE) + compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); put: @@ -2280,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2295,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; @@ -2546,7 +2625,6 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int i; int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; @@ -2596,8 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - for (i = 0; i < count; i++) - mem_cgroup_charge_statistics(mem, pc + i, false); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); ClearPageCgroupUsed(pc); /* @@ -4844,7 +4921,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false)) { + mc.from, mc.to, false, PAGE_SIZE)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4983,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -4993,7 +5070,8 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 548fbd70f026..99ccb4472623 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. */ if (access) { int nr; @@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (!PageHuge(page) && unlikely(split_huge_page(page))) - return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. + */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. + */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - ret = try_to_unmap(hpage, ttu); + if (hpage != ppage) + lock_page_nosync(ppage); + + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* @@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); @@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1437,35 +1487,3 @@ done: /* keep elevated page count for bad page */ return ret; } - -/* - * The caller must hold current->mm->mmap_sem in read mode. - */ -int is_hwpoison_address(unsigned long addr) -{ - pgd_t *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t pte, *ptep; - swp_entry_t entry; - - pgdp = pgd_offset(current->mm, addr); - if (!pgd_present(*pgdp)) - return 0; - pudp = pud_offset(pgdp, addr); - pud = *pudp; - if (!pud_present(pud) || pud_large(pud)) - return 0; - pmdp = pmd_offset(pudp, addr); - pmd = *pmdp; - if (!pmd_present(pmd) || pmd_large(pmd)) - return 0; - ptep = pte_offset_map(pmdp, addr); - pte = *ptep; - pte_unmap(ptep); - if (!is_swap_pte(pte)) - return 0; - entry = pte_to_swp_entry(pte); - return is_hwpoison_entry(entry); -} -EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index 31250faff390..e48945ab362b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1410,6 +1410,55 @@ no_page_table: return page; } +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, @@ -1527,9 +1576,16 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| - VM_FAULT_SIGBUS)) + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } @@ -1578,6 +1634,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (nr_pages); return i; } +EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory @@ -2115,10 +2172,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page + * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault * must check under lock before unmapping the pte and proceeding * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). + * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) @@ -2219,7 +2276,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2345,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2316,7 +2371,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * do_no_page is protected similarly. + * __do_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -2367,16 +2422,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2489,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); @@ -2650,6 +2705,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2666,6 +2722,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); @@ -3053,12 +3110,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..78062ab641ff 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -993,7 +993,7 @@ int do_migrate_pages(struct mm_struct *mm, * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the + * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy @@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } diff --git a/mm/migrate.c b/mm/migrate.c index 46fe8cc13d67..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -772,6 +772,7 @@ uncharge: unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -785,8 +786,6 @@ unlock: putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. @@ -888,7 +887,7 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. */ @@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; @@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 13e81ee8be9d..c3924c7f00be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 000000000000..e2bdb07079ce --- /dev/null +++ b/mm/nobootmem.c @@ -0,0 +1,435 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> + +#include <asm/bug.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +#ifdef CONFIG_CRASH_DUMP +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; +#endif + +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + +} + +#ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549fd..7945247b1e53 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,7 +286,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ return; } @@ -317,7 +317,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } @@ -2034,6 +2036,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; @@ -3687,13 +3699,45 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; @@ -3736,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -4797,15 +4813,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, @@ -5364,10 +5371,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0369f5b3ba1b..eb663fb533e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -6,6 +6,7 @@ * Copyright (C) 2010 Linus Torvalds */ +#include <linux/pagemap.h> #include <asm/tlb.h> #include <asm-generic/pgtable.h> diff --git a/mm/rmap.c b/mm/rmap.c index f21f4a1d6a1c..941bf82e8961 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int referenced = 0; - /* - * Don't want to elevate referenced for mlocked page that gets this far, - * in order that it progresses to try_to_unmap and is moved to the - * unevictable list. - */ - if (vma->vm_flags & VM_LOCKED) { - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; - } - - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - if (unlikely(PageTransHuge(page))) { pmd_t *pmd; spin_lock(&mm->page_table_lock); + /* + * rmap might return false positives; we must filter + * these out using page_check_address_pmd(). + */ pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG); - if (pmd && !pmd_trans_splitting(*pmd) && - pmdp_clear_flush_young_notify(vma, address, pmd)) + if (!pmd) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (vma->vm_flags & VM_LOCKED) { + spin_unlock(&mm->page_table_lock); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + /* go ahead even if the pmd is pmd_trans_splitting() */ + if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; spin_unlock(&mm->page_table_lock); } else { pte_t *pte; spinlock_t *ptl; + /* + * rmap might return false positives; we must filter + * these out using page_check_address(). + */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; + if (vma->vm_flags & VM_LOCKED) { + pte_unmap_unlock(pte, ptl); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + (*mapcount)--; if (referenced) diff --git a/mm/shmem.c b/mm/shmem.c index 5ee67c990602..048a95a5244d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -779,7 +779,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * If truncating down to a partial page, then * if that page is already allocated, hold it * in memory until the truncation is over, so - * truncate_partial_page cannnot miss it were + * truncate_partial_page cannot miss it were * it assigned to swap. */ if (newsize & (PAGE_CACHE_SIZE-1)) { @@ -1843,8 +1843,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, + &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1983,8 +1984,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -2144,8 +2145,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, { struct inode *inode = dentry->d_inode; - if (*len < 3) + if (*len < 3) { + *len = 3; return 255; + } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { diff --git a/mm/truncate.c b/mm/truncate.c index 3c2d5ddfa0d4..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } @@ -549,13 +551,12 @@ EXPORT_SYMBOL(truncate_pagecache); * @inode: inode * @newsize: new file size * - * truncate_setsize updastes i_size update and performs pagecache - * truncation (if necessary) for a file size updates. It will be - * typically be called from the filesystem's setattr function when - * ATTR_SIZE is passed in. + * truncate_setsize updates i_size and performs pagecache truncation (if + * necessary) to @newsize. It will be typically be called from the filesystem's + * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and after all filesystem - * specific block truncation has been performed. + * Must be called with inode_mutex held and before all filesystem specific + * block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 47a50962ce81..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,7 +41,6 @@ #include <linux/memcontrol.h> #include <linux/delayacct.h> #include <linux/sysctl.h> -#include <linux/compaction.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -1842,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the @@ -1883,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - unsigned long nr_scanned = sc->nr_scanned; restart: nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -2084,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } |