mirror of
https://github.com/Divested-Mobile/DivestOS-Build.git
synced 2025-01-01 19:06:25 -05:00
149 lines
5.7 KiB
Diff
149 lines
5.7 KiB
Diff
|
From 1c8544a93151329be95f702f6f4029f860b77ee7 Mon Sep 17 00:00:00 2001
|
||
|
From: Michal Hocko <mhocko@suse.com>
|
||
|
Date: Sun, 16 Oct 2016 11:55:00 +0200
|
||
|
Subject: mm, gup: close FOLL MAP_PRIVATE race
|
||
|
|
||
|
commit 19be0eaffa3ac7d8eb6784ad9bdbc7d67ed8e619 upstream.
|
||
|
|
||
|
faultin_page drops FOLL_WRITE after the page fault handler did the CoW
|
||
|
and then we retry follow_page_mask to get our CoWed page. This is racy,
|
||
|
however because the page might have been unmapped by that time and so
|
||
|
we would have to do a page fault again, this time without CoW. This
|
||
|
would cause the page cache corruption for FOLL_FORCE on MAP_PRIVATE
|
||
|
read only mappings with obvious consequences.
|
||
|
|
||
|
This is an ancient bug that was actually already fixed once by Linus
|
||
|
eleven years ago in commit 4ceb5db9757a ("Fix get_user_pages() race
|
||
|
for write access") but that was then undone due to problems on s390
|
||
|
by commit f33ea7f404e5 ("fix get_user_pages bug") because s390 didn't
|
||
|
have proper dirty pte tracking until abf09bed3cce ("s390/mm: implement
|
||
|
software dirty bits"). This wasn't a problem at the time as pointed out
|
||
|
by Hugh Dickins because madvise relied on mmap_sem for write up until
|
||
|
0a27a14a6292 ("mm: madvise avoid exclusive mmap_sem") but since then we
|
||
|
can race with madvise which can unmap the fresh COWed page or with KSM
|
||
|
and corrupt the content of the shared page.
|
||
|
|
||
|
This patch is based on the Linus' approach to not clear FOLL_WRITE after
|
||
|
the CoW page fault (aka VM_FAULT_WRITE) but instead introduces FOLL_COW
|
||
|
to note this fact. The flag is then rechecked during follow_pfn_pte to
|
||
|
enforce the page fault again if we do not see the CoWed page. Linus was
|
||
|
suggesting to check pte_dirty again as s390 is OK now. But that would
|
||
|
make backporting to some old kernels harder. So instead let's just make
|
||
|
sure that vm_normal_page sees a pure anonymous page.
|
||
|
|
||
|
This would guarantee we are seeing a real CoW page. Introduce
|
||
|
can_follow_write_pte which checks both pte_write and falls back to
|
||
|
PageAnon on forced write faults which passed CoW already. Thanks to Hugh
|
||
|
to point out that a special care has to be taken for KSM pages because
|
||
|
our COWed page might have been merged with a KSM one and keep its
|
||
|
PageAnon flag.
|
||
|
|
||
|
Fixes: 0a27a14a6292 ("mm: madvise avoid exclusive mmap_sem")
|
||
|
Reported-by: Phil "not Paul" Oester <kernel@linuxace.com>
|
||
|
Disclosed-by: Andy Lutomirski <luto@kernel.org>
|
||
|
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||
|
Signed-off-by: Michal Hocko <mhocko@suse.com>
|
||
|
[bwh: Backported to 3.2:
|
||
|
- Adjust filename, context, indentation
|
||
|
- The 'no_page' exit path in follow_page() is different, so open-code the
|
||
|
cleanup
|
||
|
- Delete a now-unused label]
|
||
|
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
||
|
Signed-off-by: Zefan Li <lizefan@huawei.com>
|
||
|
---
|
||
|
include/linux/mm.h | 1 +
|
||
|
mm/memory.c | 39 ++++++++++++++++++++++++++++-----------
|
||
|
2 files changed, 29 insertions(+), 11 deletions(-)
|
||
|
|
||
|
diff --git a/include/linux/mm.h b/include/linux/mm.h
|
||
|
index ceebf63..ef706ab 100644
|
||
|
--- a/include/linux/mm.h
|
||
|
+++ b/include/linux/mm.h
|
||
|
@@ -1525,6 +1525,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
||
|
#define FOLL_MLOCK 0x40 /* mark page as mlocked */
|
||
|
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
|
||
|
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
|
||
|
+#define FOLL_COW 0x4000 /* internal GUP flag */
|
||
|
|
||
|
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
|
||
|
void *data);
|
||
|
diff --git a/mm/memory.c b/mm/memory.c
|
||
|
index 4774579..9701911 100644
|
||
|
--- a/mm/memory.c
|
||
|
+++ b/mm/memory.c
|
||
|
@@ -1447,6 +1447,24 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
|
||
|
}
|
||
|
EXPORT_SYMBOL_GPL(zap_vma_ptes);
|
||
|
|
||
|
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
|
||
|
+ unsigned int flags)
|
||
|
+{
|
||
|
+ if (pte_write(pte))
|
||
|
+ return true;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Make sure that we are really following CoWed page. We do not really
|
||
|
+ * have to care about exclusiveness of the page because we only want
|
||
|
+ * to ensure that once COWed page hasn't disappeared in the meantime
|
||
|
+ * or it hasn't been merged to a KSM page.
|
||
|
+ */
|
||
|
+ if ((flags & FOLL_FORCE) && (flags & FOLL_COW))
|
||
|
+ return page && PageAnon(page) && !PageKsm(page);
|
||
|
+
|
||
|
+ return false;
|
||
|
+}
|
||
|
+
|
||
|
/**
|
||
|
* follow_page - look up a page descriptor from a user-virtual address
|
||
|
* @vma: vm_area_struct mapping @address
|
||
|
@@ -1529,10 +1547,13 @@ split_fallthrough:
|
||
|
pte = *ptep;
|
||
|
if (!pte_present(pte))
|
||
|
goto no_page;
|
||
|
- if ((flags & FOLL_WRITE) && !pte_write(pte))
|
||
|
- goto unlock;
|
||
|
|
||
|
page = vm_normal_page(vma, address, pte);
|
||
|
+ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, flags)) {
|
||
|
+ pte_unmap_unlock(ptep, ptl);
|
||
|
+ return NULL;
|
||
|
+ }
|
||
|
+
|
||
|
if (unlikely(!page)) {
|
||
|
if ((flags & FOLL_DUMP) ||
|
||
|
!is_zero_pfn(pte_pfn(pte)))
|
||
|
@@ -1575,7 +1596,7 @@ split_fallthrough:
|
||
|
unlock_page(page);
|
||
|
}
|
||
|
}
|
||
|
-unlock:
|
||
|
+
|
||
|
pte_unmap_unlock(ptep, ptl);
|
||
|
out:
|
||
|
return page;
|
||
|
@@ -1809,17 +1830,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||
|
* The VM_FAULT_WRITE bit tells us that
|
||
|
* do_wp_page has broken COW when necessary,
|
||
|
* even if maybe_mkwrite decided not to set
|
||
|
- * pte_write. We can thus safely do subsequent
|
||
|
- * page lookups as if they were reads. But only
|
||
|
- * do so when looping for pte_write is futile:
|
||
|
- * in some cases userspace may also be wanting
|
||
|
- * to write to the gotten user page, which a
|
||
|
- * read fault here might prevent (a readonly
|
||
|
- * page might get reCOWed by userspace write).
|
||
|
+ * pte_write. We cannot simply drop FOLL_WRITE
|
||
|
+ * here because the COWed page might be gone by
|
||
|
+ * the time we do the subsequent page lookups.
|
||
|
*/
|
||
|
if ((ret & VM_FAULT_WRITE) &&
|
||
|
!(vma->vm_flags & VM_WRITE))
|
||
|
- foll_flags &= ~FOLL_WRITE;
|
||
|
+ foll_flags |= FOLL_COW;
|
||
|
|
||
|
cond_resched();
|
||
|
}
|
||
|
--
|
||
|
cgit v1.1
|
||
|
|