Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  136
1 file changed, 96 insertions(+), 40 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 793fe0f9841c..c264f7cd3e47 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
         struct mmu_gather_batch *batch;
 
         VM_BUG_ON(!tlb->end);
-
-        if (!tlb->page_size)
-                tlb->page_size = page_size;
-        else {
-                if (page_size != tlb->page_size)
-                        return true;
-        }
+        VM_WARN_ON(tlb->page_size != page_size);
 
         batch = tlb->active;
+        /*
+         * Add the page and check if we are full. If so
+         * force a flush.
+         */
+        batch->pages[batch->nr++] = page;
         if (batch->nr == batch->max) {
                 if (!tlb_next_batch(tlb))
                         return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
         }
         VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
-        batch->pages[batch->nr++] = page;
         return false;
 }
 
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
         end -= PMD_SIZE;
         if (addr > end - 1)
                 return;
-
+        /*
+         * We add page table cache pages with PAGE_SIZE,
+         * (see pte_free_tlb()), flush the tlb if we need
+         */
+        tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
         pgd = pgd_offset(tlb->mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
         pte_t *start_pte;
         pte_t *pte;
         swp_entry_t entry;
-        struct page *pending_page = NULL;
 
+        tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 again:
         init_rss_vec(rss);
         start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
                                 print_bad_pte(vma, addr, ptent, page);
                         if (unlikely(__tlb_remove_page(tlb, page))) {
                                 force_flush = 1;
-                                pending_page = page;
                                 addr += PAGE_SIZE;
                                 break;
                         }
@@ -1213,11 +1214,6 @@ again:
         if (force_flush) {
                 force_flush = 0;
                 tlb_flush_mmu_free(tlb);
-                if (pending_page) {
-                        /* remove the page with new size */
-                        __tlb_remove_pte_page(tlb, pending_page);
-                        pending_page = NULL;
-                }
                 if (addr != end)
                         goto again;
         }
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                         if (next - addr != HPAGE_PMD_SIZE) {
                                 VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
                                     !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
-                                split_huge_pmd(vma, pmd, addr);
+                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                         } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                 goto next;
                         /* fall through */
@@ -1637,8 +1633,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
-        if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
-                return -EINVAL;
+
+        track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
         ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
@@ -1649,11 +1645,15 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                         pfn_t pfn)
 {
+        pgprot_t pgprot = vma->vm_page_prot;
+
         BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
+
+        track_pfn_insert(vma, &pgprot, pfn);
+
         /*
          * If we don't have pte special, then we have to use the pfn_valid()
          * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1670,9 +1670,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                  * result in pfn_t_has_page() == false.
                  */
                 page = pfn_to_page(pfn_t_to_pfn(pfn));
-                return insert_page(vma, addr, page, vma->vm_page_prot);
+                return insert_page(vma, addr, page, pgprot);
         }
-        return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+        return insert_pfn(vma, addr, pfn, pgprot);
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
@@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
         return true;
 }
 
+static void deposit_prealloc_pte(struct fault_env *fe)
+{
+        struct vm_area_struct *vma = fe->vma;
+
+        pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+        /*
+         * We are going to consume the prealloc table,
+         * count that as nr_ptes.
+         */
+        atomic_long_inc(&vma->vm_mm->nr_ptes);
+        fe->prealloc_pte = 0;
+}
+
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 {
         struct vm_area_struct *vma = fe->vma;
@@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
         ret = VM_FAULT_FALLBACK;
         page = compound_head(page);
 
+        /*
+         * Archs like ppc64 need additonal space to store information
+         * related to pte entry. Use the preallocated table for that.
+         */
+        if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
+                fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+                if (!fe->prealloc_pte)
+                        return VM_FAULT_OOM;
+                smp_wmb(); /* See comment in __pte_alloc() */
+        }
+
         fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
         if (unlikely(!pmd_none(*fe->pmd)))
                 goto out;
@@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 
         add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
         page_add_file_rmap(page, true);
+        /*
+         * deposit and withdraw with pmd lock held
+         */
+        if (arch_needs_pgtable_deposit())
+                deposit_prealloc_pte(fe);
 
         set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
 
@@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
         ret = 0;
         count_vm_event(THP_FILE_MAPPED);
 out:
+        /*
+         * If we are going to fallback to pte mapping, do a
+         * withdraw with pmd lock held.
+         */
+        if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+                fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+                                                               fe->pmd);
         spin_unlock(fe->ptl);
         return ret;
 }
@@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
                 ret = do_set_pmd(fe, page);
                 if (ret != VM_FAULT_FALLBACK)
-                        return ret;
+                        goto fault_handled;
         }
 
         if (!fe->pte) {
                 ret = pte_alloc_one_map(fe);
                 if (ret)
-                        return ret;
+                        goto fault_handled;
         }
 
         /* Re-check under ptl */
-        if (unlikely(!pte_none(*fe->pte)))
-                return VM_FAULT_NOPAGE;
+        if (unlikely(!pte_none(*fe->pte))) {
+                ret = VM_FAULT_NOPAGE;
+                goto fault_handled;
+        }
 
         flush_icache_page(vma, page);
         entry = mk_pte(page, vma->vm_page_prot);
@@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
         /* no need to invalidate: a not-present page won't be cached */
         update_mmu_cache(vma, fe->address, fe->pte);
+        ret = 0;
 
-        return 0;
+fault_handled:
+        /* preallocated pagetable is unused: free it */
+        if (fe->prealloc_pte) {
+                pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+                fe->prealloc_pte = 0;
+        }
+        return ret;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 
         fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
 
-        /* preallocated pagetable is unused: free it */
-        if (fe->prealloc_pte) {
-                pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-                fe->prealloc_pte = 0;
-        }
         /* Huge page is mapped? Page fault is solved */
         if (pmd_trans_huge(*fe->pmd)) {
                 ret = VM_FAULT_NOPAGE;
@@ -3395,7 +3435,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
          * pte_dirty has unpredictable behaviour between PTE scan updates,
          * background writeback, dirty balancing and application behaviour.
          */
-        if (!(vma->vm_flags & VM_WRITE))
+        if (!pte_write(pte))
                 flags |= TNF_NO_GROUP;
 
         /*
@@ -3450,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
 
         /* COW handled on pte level: split pmd */
         VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
-        split_huge_pmd(fe->vma, fe->pmd, fe->address);
+        __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
 
         return VM_FAULT_FALLBACK;
 }
@@ -3658,6 +3698,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
         }
 
+        /*
+         * This mm has been already reaped by the oom reaper and so the
+         * refault cannot be trusted in general. Anonymous refaults would
+         * lose data and give a zero page instead e.g. This is especially
+         * problem for use_mm() because regular tasks will just die and
+         * the corrupted data will not be visible anywhere while kthread
+         * will outlive the oom victim and potentially propagate the date
+         * further.
+         */
+        if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+                                && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+                ret = VM_FAULT_SIGBUS;
+
         return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -3852,10 +3905,11 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
  * given task for page fault accounting.
  */
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
-                unsigned long addr, void *buf, int len, int write)
+                unsigned long addr, void *buf, int len, unsigned int gup_flags)
 {
         struct vm_area_struct *vma;
         void *old_buf = buf;
+        int write = gup_flags & FOLL_WRITE;
 
         down_read(&mm->mmap_sem);
         /* ignore errors, just check how much was successfully transferred */
@@ -3865,7 +3919,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                 struct page *page = NULL;
 
                 ret = get_user_pages_remote(tsk, mm, addr, 1,
-                                write, 1, &page, &vma);
+                                gup_flags, &page, &vma);
                 if (ret <= 0) {
 #ifndef CONFIG_HAVE_IOREMAP_PROT
                         break;
@@ -3917,14 +3971,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
  * @addr:      start address to access
  * @buf:       source or destination buffer
  * @len:       number of bytes to transfer
- * @write:     whether the access is a write
+ * @gup_flags: flags modifying lookup behaviour
  *
  * The caller must hold a reference on @mm.
  */
 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
-                void *buf, int len, int write)
+                void *buf, int len, unsigned int gup_flags)
 {
-        return __access_remote_vm(NULL, mm, addr, buf, len, write);
+        return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
 }
 
 /*
@@ -3933,7 +3987,7 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr,
  * Do not walk the page table directly, use get_user_pages
  */
 int access_process_vm(struct task_struct *tsk, unsigned long addr,
-                void *buf, int len, int write)
+                void *buf, int len, unsigned int gup_flags)
 {
         struct mm_struct *mm;
         int ret;
@@ -3942,11 +3996,13 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
         if (!mm)
                 return 0;
 
-        ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+        ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+
         mmput(mm);
 
         return ret;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);
 
 /*
  * Print the name of a VMA.
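
Usage note (a sketch, not part of the diff above): the last few hunks switch __access_remote_vm(), access_remote_vm() and access_process_vm() from an "int write" argument to "unsigned int gup_flags", and export access_process_vm() to modules. A caller that previously passed write = 1 would now pass FOLL_* flags directly. The helper below is hypothetical; it assumes the caller wants write access plus the forced-access behaviour that the old code obtained by passing force = 1 to get_user_pages_remote().

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Hypothetical helper, not taken from this patch: copy len bytes from
 * buf into tsk's address space using the gup_flags-based API.
 */
static int write_to_remote_task(struct task_struct *tsk, unsigned long addr,
                                void *buf, int len)
{
        /*
         * FOLL_WRITE requests a writable mapping; FOLL_FORCE is assumed
         * here in order to keep the old unconditional "force" semantics.
         */
        return access_process_vm(tsk, addr, buf, len,
                                 FOLL_WRITE | FOLL_FORCE);
}

As before, the return value is the number of bytes actually transferred; only the flag plumbing changes with this series.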