| // SPDX-License-Identifier: GPL-2.0 |
| #define CREATE_TRACE_POINTS |
| #include <trace/events/mmap_lock.h> |
| |
| #include <linux/mm.h> |
| #include <linux/cgroup.h> |
| #include <linux/memcontrol.h> |
| #include <linux/mmap_lock.h> |
| #include <linux/mutex.h> |
| #include <linux/percpu.h> |
| #include <linux/rcupdate.h> |
| #include <linux/smp.h> |
| #include <linux/trace_events.h> |
| #include <linux/local_lock.h> |
| |
/* Make the three mmap_lock tracepoints available outside this file. */
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
| |
| #ifdef CONFIG_TRACING |
| /* |
| * Trace calls must be in a separate file, as otherwise there's a circular |
| * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. |
| */ |
| |
| void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) |
| { |
| trace_mmap_lock_start_locking(mm, write); |
| } |
| EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); |
| |
| void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, |
| bool success) |
| { |
| trace_mmap_lock_acquire_returned(mm, write, success); |
| } |
| EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); |
| |
| void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) |
| { |
| trace_mmap_lock_released(mm, write); |
| } |
| EXPORT_SYMBOL(__mmap_lock_do_trace_released); |
| #endif /* CONFIG_TRACING */ |
| |
| #ifdef CONFIG_MMU |
| #ifdef CONFIG_PER_VMA_LOCK |
| |
| /* State shared across __vma_[start, end]_exclude_readers. */ |
| struct vma_exclude_readers_state { |
| /* Input parameters. */ |
| struct vm_area_struct *vma; |
| int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */ |
| bool detaching; |
| |
| /* Output parameters. */ |
| bool detached; |
| bool exclusive; /* Are we exclusively locked? */ |
| }; |
| |
| /* |
| * Now that all readers have been evicted, mark the VMA as being out of the |
| * 'exclude readers' state. |
| */ |
| static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves) |
| { |
| struct vm_area_struct *vma = ves->vma; |
| |
| VM_WARN_ON_ONCE(ves->detached); |
| |
| ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, |
| &vma->vm_refcnt); |
| __vma_lockdep_release_exclusive(vma); |
| } |
| |
| static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves) |
| { |
| const unsigned int tgt = ves->detaching ? 0 : 1; |
| |
| return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG; |
| } |
| |
| /* |
| * Mark the VMA as being in a state of excluding readers, check to see if any |
| * VMA read locks are indeed held, and if so wait for them to be released. |
| * |
| * Note that this function pairs with vma_refcount_put() which will wake up this |
| * thread when it detects that the last reader has released its lock. |
| * |
| * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases |
| * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal |
| * signal is permitted to kill it. |
| * |
| * The function sets the ves->exclusive parameter to true if readers were |
| * excluded, or false if the VMA was detached or an error arose on wait. |
| * |
| * If the function indicates an exclusive lock was acquired via ves->exclusive |
| * the caller is required to invoke __vma_end_exclude_readers() once the |
| * exclusive state is no longer required. |
| * |
| * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the |
| * function may also return -EINTR to indicate a fatal signal was received while |
| * waiting. Otherwise, the function returns 0. |
| */ |
| static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) |
| { |
| struct vm_area_struct *vma = ves->vma; |
| unsigned int tgt_refcnt = get_target_refcnt(ves); |
| int err = 0; |
| |
| mmap_assert_write_locked(vma->vm_mm); |
| |
| /* |
| * If vma is detached then only vma_mark_attached() can raise the |
| * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). |
| * |
| * See the comment describing the vm_area_struct->vm_refcnt field for |
| * details of possible refcnt values. |
| */ |
| if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { |
| ves->detached = true; |
| return 0; |
| } |
| |
| __vma_lockdep_acquire_exclusive(vma); |
| err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, |
| refcount_read(&vma->vm_refcnt) == tgt_refcnt, |
| ves->state); |
| if (err) { |
| __vma_end_exclude_readers(ves); |
| return err; |
| } |
| |
| __vma_lockdep_stat_mark_acquired(vma); |
| ves->exclusive = true; |
| return 0; |
| } |
| |
| int __vma_start_write(struct vm_area_struct *vma, int state) |
| { |
| const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma); |
| struct vma_exclude_readers_state ves = { |
| .vma = vma, |
| .state = state, |
| }; |
| int err; |
| |
| err = __vma_start_exclude_readers(&ves); |
| if (err) { |
| WARN_ON_ONCE(ves.detached); |
| return err; |
| } |
| |
| /* |
| * We should use WRITE_ONCE() here because we can have concurrent reads |
| * from the early lockless pessimistic check in vma_start_read(). |
| * We don't really care about the correctness of that early check, but |
| * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. |
| */ |
| WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); |
| |
| if (ves.exclusive) { |
| __vma_end_exclude_readers(&ves); |
| /* VMA should remain attached. */ |
| WARN_ON_ONCE(ves.detached); |
| } |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(__vma_start_write); |
| |
| void __vma_exclude_readers_for_detach(struct vm_area_struct *vma) |
| { |
| struct vma_exclude_readers_state ves = { |
| .vma = vma, |
| .state = TASK_UNINTERRUPTIBLE, |
| .detaching = true, |
| }; |
| int err; |
| |
| /* |
| * Wait until the VMA is detached with no readers. Since we hold the VMA |
| * write lock, the only read locks that might be present are those from |
| * threads trying to acquire the read lock and incrementing the |
| * reference count before realising the write lock is held and |
| * decrementing it. |
| */ |
| err = __vma_start_exclude_readers(&ves); |
| if (!err && ves.exclusive) { |
| /* |
| * Once this is complete, no readers can increment the |
| * reference count, and the VMA is marked detached. |
| */ |
| __vma_end_exclude_readers(&ves); |
| } |
| /* If an error arose but we were detached anyway, we don't care. */ |
| WARN_ON_ONCE(!ves.detached); |
| } |
| |
| /* |
| * Try to read-lock a vma. The function is allowed to occasionally yield false |
| * locked result to avoid performance overhead, in which case we fall back to |
| * using mmap_lock. The function should never yield false unlocked result. |
| * False locked result is possible if mm_lock_seq overflows or if vma gets |
| * reused and attached to a different mm before we lock it. |
| * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got |
| * detached. |
| * |
| * IMPORTANT: RCU lock must be held upon entering the function, but upon error |
| * IT IS RELEASED. The caller must handle this correctly. |
| */ |
| static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, |
| struct vm_area_struct *vma) |
| { |
| struct mm_struct *other_mm; |
| int oldcnt; |
| |
| RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); |
| /* |
| * Check before locking. A race might cause false locked result. |
| * We can use READ_ONCE() for the mm_lock_seq here, and don't need |
| * ACQUIRE semantics, because this is just a lockless check whose result |
| * we don't rely on for anything - the mm_lock_seq read against which we |
| * need ordering is below. |
| */ |
| if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { |
| vma = NULL; |
| goto err; |
| } |
| |
| /* |
| * If VM_REFCNT_EXCLUDE_READERS_FLAG is set, |
| * __refcount_inc_not_zero_limited_acquire() will fail because |
| * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG. |
| * |
| * Acquire fence is required here to avoid reordering against later |
| * vm_lock_seq check and checks inside lock_vma_under_rcu(). |
| */ |
| if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, |
| VM_REFCNT_LIMIT))) { |
| /* return EAGAIN if vma got detached from under us */ |
| vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); |
| goto err; |
| } |
| |
| __vma_lockdep_acquire_read(vma); |
| |
| if (unlikely(vma->vm_mm != mm)) |
| goto err_unstable; |
| |
| /* |
| * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. |
| * False unlocked result is impossible because we modify and check |
| * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq |
| * modification invalidates all existing locks. |
| * |
| * We must use ACQUIRE semantics for the mm_lock_seq so that if we are |
| * racing with vma_end_write_all(), we only start reading from the VMA |
| * after it has been unlocked. |
| * This pairs with RELEASE semantics in vma_end_write_all(). |
| */ |
| if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { |
| vma_refcount_put(vma); |
| vma = NULL; |
| goto err; |
| } |
| |
| return vma; |
| err: |
| rcu_read_unlock(); |
| |
| return vma; |
| err_unstable: |
| /* |
| * If vma got attached to another mm from under us, that mm is not |
| * stable and can be freed in the narrow window after vma->vm_refcnt |
| * is dropped and before rcuwait_wake_up(mm) is called. Grab it before |
| * releasing vma->vm_refcnt. |
| */ |
| other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ |
| |
| /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ |
| rcu_read_unlock(); |
| mmgrab(other_mm); |
| vma_refcount_put(vma); |
| mmdrop(other_mm); |
| |
| return NULL; |
| } |
| |
| /* |
| * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be |
| * stable and not isolated. If the VMA is not found or is being modified the |
| * function returns NULL. |
| */ |
| struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, |
| unsigned long address) |
| { |
| MA_STATE(mas, &mm->mm_mt, address, address); |
| struct vm_area_struct *vma; |
| |
| retry: |
| rcu_read_lock(); |
| vma = mas_walk(&mas); |
| if (!vma) { |
| rcu_read_unlock(); |
| goto inval; |
| } |
| |
| vma = vma_start_read(mm, vma); |
| if (IS_ERR_OR_NULL(vma)) { |
| /* Check if the VMA got isolated after we found it */ |
| if (PTR_ERR(vma) == -EAGAIN) { |
| count_vm_vma_lock_event(VMA_LOCK_MISS); |
| /* The area was replaced with another one */ |
| mas_set(&mas, address); |
| goto retry; |
| } |
| |
| /* Failed to lock the VMA */ |
| goto inval; |
| } |
| /* |
| * At this point, we have a stable reference to a VMA: The VMA is |
| * locked and we know it hasn't already been isolated. |
| * From here on, we can access the VMA without worrying about which |
| * fields are accessible for RCU readers. |
| */ |
| rcu_read_unlock(); |
| |
| /* Check if the vma we locked is the right one. */ |
| if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { |
| vma_end_read(vma); |
| goto inval; |
| } |
| |
| return vma; |
| |
| inval: |
| count_vm_vma_lock_event(VMA_LOCK_ABORT); |
| return NULL; |
| } |
| |
| static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm, |
| struct vma_iterator *vmi, |
| unsigned long from_addr) |
| { |
| struct vm_area_struct *vma; |
| int ret; |
| |
| ret = mmap_read_lock_killable(mm); |
| if (ret) |
| return ERR_PTR(ret); |
| |
| /* Lookup the vma at the last position again under mmap_read_lock */ |
| vma_iter_set(vmi, from_addr); |
| vma = vma_next(vmi); |
| if (vma) { |
| /* Very unlikely vma->vm_refcnt overflow case */ |
| if (unlikely(!vma_start_read_locked(vma))) |
| vma = ERR_PTR(-EAGAIN); |
| } |
| |
| mmap_read_unlock(mm); |
| |
| return vma; |
| } |
| |
| struct vm_area_struct *lock_next_vma(struct mm_struct *mm, |
| struct vma_iterator *vmi, |
| unsigned long from_addr) |
| { |
| struct vm_area_struct *vma; |
| unsigned int mm_wr_seq; |
| bool mmap_unlocked; |
| |
| RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); |
| retry: |
| /* Start mmap_lock speculation in case we need to verify the vma later */ |
| mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); |
| vma = vma_next(vmi); |
| if (!vma) |
| return NULL; |
| |
| vma = vma_start_read(mm, vma); |
| if (IS_ERR_OR_NULL(vma)) { |
| /* |
| * Retry immediately if the vma gets detached from under us. |
| * Infinite loop should not happen because the vma we find will |
| * have to be constantly knocked out from under us. |
| */ |
| if (PTR_ERR(vma) == -EAGAIN) { |
| /* reset to search from the last address */ |
| rcu_read_lock(); |
| vma_iter_set(vmi, from_addr); |
| goto retry; |
| } |
| |
| goto fallback; |
| } |
| |
| /* Verify the vma is not behind the last search position. */ |
| if (unlikely(from_addr >= vma->vm_end)) |
| goto fallback_unlock; |
| |
| /* |
| * vma can be ahead of the last search position but we need to verify |
| * it was not shrunk after we found it and another vma has not been |
| * installed ahead of it. Otherwise we might observe a gap that should |
| * not be there. |
| */ |
| if (from_addr < vma->vm_start) { |
| /* Verify only if the address space might have changed since vma lookup. */ |
| if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { |
| vma_iter_set(vmi, from_addr); |
| if (vma != vma_next(vmi)) |
| goto fallback_unlock; |
| } |
| } |
| |
| return vma; |
| |
| fallback_unlock: |
| rcu_read_unlock(); |
| vma_end_read(vma); |
| fallback: |
| vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); |
| rcu_read_lock(); |
| /* Reinitialize the iterator after re-entering rcu read section */ |
| vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end); |
| |
| return vma; |
| } |
| #endif /* CONFIG_PER_VMA_LOCK */ |
| |
| #ifdef CONFIG_LOCK_MM_AND_FIND_VMA |
| #include <linux/extable.h> |
| |
| static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) |
| { |
| if (likely(mmap_read_trylock(mm))) |
| return true; |
| |
| if (regs && !user_mode(regs)) { |
| unsigned long ip = exception_ip(regs); |
| if (!search_exception_tables(ip)) |
| return false; |
| } |
| |
| return !mmap_read_lock_killable(mm); |
| } |
| |
| static inline bool mmap_upgrade_trylock(struct mm_struct *mm) |
| { |
| /* |
| * We don't have this operation yet. |
| * |
| * It should be easy enough to do: it's basically a |
| * atomic_long_try_cmpxchg_acquire() |
| * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but |
| * it also needs the proper lockdep magic etc. |
| */ |
| return false; |
| } |
| |
| static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) |
| { |
| mmap_read_unlock(mm); |
| if (regs && !user_mode(regs)) { |
| unsigned long ip = exception_ip(regs); |
| if (!search_exception_tables(ip)) |
| return false; |
| } |
| return !mmap_write_lock_killable(mm); |
| } |
| |
| /* |
| * Helper for page fault handling. |
| * |
| * This is kind of equivalent to "mmap_read_lock()" followed |
| * by "find_extend_vma()", except it's a lot more careful about |
| * the locking (and will drop the lock on failure). |
| * |
| * For example, if we have a kernel bug that causes a page |
| * fault, we don't want to just use mmap_read_lock() to get |
| * the mm lock, because that would deadlock if the bug were |
| * to happen while we're holding the mm lock for writing. |
| * |
| * So this checks the exception tables on kernel faults in |
| * order to only do this all for instructions that are actually |
| * expected to fault. |
| * |
| * We can also actually take the mm lock for writing if we |
| * need to extend the vma, which helps the VM layer a lot. |
| */ |
| struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, |
| unsigned long addr, struct pt_regs *regs) |
| { |
| struct vm_area_struct *vma; |
| |
| if (!get_mmap_lock_carefully(mm, regs)) |
| return NULL; |
| |
| vma = find_vma(mm, addr); |
| if (likely(vma && (vma->vm_start <= addr))) |
| return vma; |
| |
| /* |
| * Well, dang. We might still be successful, but only |
| * if we can extend a vma to do so. |
| */ |
| if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { |
| mmap_read_unlock(mm); |
| return NULL; |
| } |
| |
| /* |
| * We can try to upgrade the mmap lock atomically, |
| * in which case we can continue to use the vma |
| * we already looked up. |
| * |
| * Otherwise we'll have to drop the mmap lock and |
| * re-take it, and also look up the vma again, |
| * re-checking it. |
| */ |
| if (!mmap_upgrade_trylock(mm)) { |
| if (!upgrade_mmap_lock_carefully(mm, regs)) |
| return NULL; |
| |
| vma = find_vma(mm, addr); |
| if (!vma) |
| goto fail; |
| if (vma->vm_start <= addr) |
| goto success; |
| if (!(vma->vm_flags & VM_GROWSDOWN)) |
| goto fail; |
| } |
| |
| if (expand_stack_locked(vma, addr)) |
| goto fail; |
| |
| success: |
| mmap_write_downgrade(mm); |
| return vma; |
| |
| fail: |
| mmap_write_unlock(mm); |
| return NULL; |
| } |
| #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ |
| |
| #else /* CONFIG_MMU */ |
| |
| /* |
| * At least xtensa ends up having protection faults even with no |
| * MMU.. No stack expansion, at least. |
| */ |
| struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, |
| unsigned long addr, struct pt_regs *regs) |
| { |
| struct vm_area_struct *vma; |
| |
| mmap_read_lock(mm); |
| vma = vma_lookup(mm, addr); |
| if (!vma) |
| mmap_read_unlock(mm); |
| return vma; |
| } |
| |
| #endif /* CONFIG_MMU */ |