@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static inline void
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte,
+		struct vm_area_struct *vma, struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc,
+		pte_t pte, struct page *page)
+{
+	struct page *new_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		return 1;
+
+	/*
+	 * The trick starts.
+	 *
+	 * What we want to do is to check whether this page may
+	 * have been pinned by the parent process.  If so,
+	 * instead of wrprotect the pte on both sides, we copy
+	 * the page immediately so that we'll always guarantee
+	 * the pinned page won't be randomly replaced in the
+	 * future.
+	 *
+	 * To achieve this, we do the following:
+	 *
+	 * 1. Write-protect the pte if it's writable.  This is
+	 *    to protect concurrent write fast-gup with
+	 *    FOLL_PIN, so that we'll fail the fast-gup with
+	 *    the write bit removed.
+	 *
+	 * 2. Check page_maybe_dma_pinned() to see whether this
+	 *    page may have been pinned.
+	 *
+	 * The order of these steps is important to serialize
+	 * against the fast-gup code (gup_pte_range()) on the
+	 * pte check and try_grab_compound_head(), so that
+	 * we'll make sure either we'll capture that fast-gup
+	 * so we'll copy the pinned page here, or we'll fail
+	 * that fast-gup.
+	 *
+	 * NOTE! Even if we don't end up copying the page,
+	 * we won't undo this wrprotect(), because the normal
+	 * reference copy will need it anyway.
+	 */
+	if (pte_write(pte))
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+
+	/*
+	 * These are the "normally we can just copy by reference"
+	 * checks.
+	 */
+	if (likely(!atomic_read(&src_mm->has_pinned)))
+		return 1;
+	if (likely(!page_maybe_dma_pinned(page)))
+		return 1;
+
+	/*
+	 * Uhhuh. It looks like the page might be a pinned page,
+	 * and we actually need to copy it. Now we can set the
+	 * source pte back to being writable.
+	 */
+	if (pte_write(pte))
+		set_pte_at(src_mm, addr, src_pte, pte);
+
+	new_page = *prealloc;
+	if (!new_page)
+		return -EAGAIN;
+
+	/*
+	 * We have a prealloc page, all good!  Take it
+	 * over and copy the page & arm it.
+	 */
+	*prealloc = NULL;
+	copy_user_highpage(new_page, page, addr, vma);
+	__SetPageUptodate(new_page);
+	page_add_new_anon_rmap(new_page, new, addr, false);
+	lru_cache_add_inactive_or_unevictable(new_page, new);
+	rss[mm_counter(new_page)]++;
+
+	/* All done, just insert the new page copy in the child */
+	pte = mk_pte(new_page, new->vm_page_prot);
+	pte = maybe_mkwrite(pte_mkdirty(pte), new);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		int retval;
+
+		retval = copy_present_page(dst_mm, src_mm,
+					   dst_pte, src_pte,
+					   vma, new,
+					   addr, rss, prealloc,
+					   pte, page);
+		if (retval <= 0)
+			return retval;
+
+		get_page(page);
+		page_dup_rmap(page, false);
+		rss[mm_counter(page)]++;
+	}
+
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (!(vm_flags & VM_UFFD_WP))
 		pte = pte_clear_uffd_wp(pte);
 
-	page = vm_normal_page(vma, addr, pte);
-	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+		   unsigned long addr)
+{
+	struct page *new_page;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+	if (!new_page)
+		return NULL;
+
+	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return NULL;
 	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
 
-	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return new_page;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
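For reference, the fast-gup side of the handshake that the big comment in
copy_present_page() describes looks roughly like this (abridged from
gup_pte_range() in mm/gup.c of the same kernel; not part of this patch):

	pte_t pte = READ_ONCE(*ptep);
	...
	head = try_grab_compound_head(pte_page(pte), 1, flags);
	if (!head)
		goto pte_unmap;

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* The pte changed under us (e.g. fork() just
		 * wrprotected it): drop the pin and back off. */
		put_compound_head(head, 1, flags);
		goto pte_unmap;
	}

Either the FOLL_PIN grab completes before fork()'s wrprotect, in which case
page_maybe_dma_pinned() sees the elevated pin count and the page gets copied,
or the pte recheck fails and fast-gup falls back to the slow path, which
serializes on the page table lock.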
@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
+	struct page *prealloc = NULL;
 
 again:
+	progress = 0;
 	init_rss_vec(rss);
 
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-	if (!dst_pte)
-		return -ENOMEM;
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	src_pte = pte_offset_map(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			progress += 8;
 			continue;
 		}
-		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
-				 vma, addr, rss);
+		/* copy_present_pte() will clear `*prealloc' if consumed */
+		ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+				       vma, new, addr, rss, &prealloc);
+		/*
+		 * If we need a pre-allocated page for this pte, drop the
+		 * locks, allocate, and try again.
+		 */
+		if (unlikely(ret == -EAGAIN))
+			break;
+		if (unlikely(prealloc)) {
+			/*
+			 * pre-alloc page cannot be reused by next time so as
+			 * to strictly follow mempolicy (e.g., alloc_page_vma()
+			 * will allocate page according to address).  This
+			 * could only happen if one pinned pte changed.
+			 */
+			put_page(prealloc);
+			prealloc = NULL;
+		}
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
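Together with the error handling in the next hunk, this break-and-retry dance
distills to the following control flow (illustrative pseudocode only; locking,
rss accounting and the swap-entry path are elided):

	again:
		map and lock the src/dst page tables;
		do {
			ret = copy_present_pte(..., &prealloc);
			if (ret == -EAGAIN)
				break;	/* can't sleep under the ptl */
		} while (++addr != end);
		unlock and unmap;

		if (ret == -EAGAIN) {
			prealloc = page_copy_prealloc(src_mm, vma, addr);
			if (!prealloc)
				return -ENOMEM;	/* sleepable allocation failed */
			ret = 0;
		}
		if (addr != end)
			goto again;	/* resumes at the pte that bailed */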
@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	cond_resched();
 
 	if (entry.val) {
-		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		entry.val = 0;
+	} else if (ret) {
+		WARN_ON_ONCE(ret != -EAGAIN);
+		prealloc = page_copy_prealloc(src_mm, vma, addr);
+		if (!prealloc)
 			return -ENOMEM;
-		progress = 0;
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
 	}
 	if (addr != end)
 		goto again;
-	return 0;
+out:
+	if (unlikely(prealloc))
+		put_page(prealloc);
+	return ret;
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
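The case all this machinery exists for can be exercised from userspace. Below
is a minimal sketch, assuming liburing is available (build with -luring); the
buffer size is arbitrary and error handling is elided. Registering a fixed
buffer long-term pins its pages with FOLL_PIN, which also sets
mm->has_pinned, so a subsequent fork() takes the early-copy path above:

	#include <liburing.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		struct io_uring ring;
		size_t len = 2 * 1024 * 1024;
		char *buf = aligned_alloc(4096, len);
		struct iovec iov = { .iov_base = buf, .iov_len = len };

		memset(buf, 0, len);		/* fault the pages in */
		io_uring_queue_init(8, &ring, 0);
		/* Long-term pins the buffer: from here on,
		 * page_maybe_dma_pinned() can report these pages. */
		io_uring_register_buffers(&ring, &iov, 1);

		if (fork() == 0) {
			/* Child: with this patch the pinned ptes were copied
			 * at fork() time, so the child writes to its own
			 * pages instead of COW-sharing the pinned ones. */
			memset(buf, 1, len);
			_exit(0);
		}
		wait(NULL);
		io_uring_queue_exit(&ring);
		return 0;
	}

Previously the pinned pages stayed COW-shared after fork(), and a later write
from the parent would replace the parent's mapping with a fresh copy,
disconnecting the registered buffer from the pages the kernel still does DMA
to.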