| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * linux/mm/page_alloc.c |
| * |
| * Manages the free lists; the system allocates free pages here. |
| * Note that kmalloc() lives in slab.c |
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * Swap reorganised 29.12.95, Stephen Tweedie |
| * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
| * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
| * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
| * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
| * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
| * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
| */ |
| |
| #include <linux/stddef.h> |
| #include <linux/mm.h> |
| #include <linux/highmem.h> |
| #include <linux/interrupt.h> |
| #include <linux/jiffies.h> |
| #include <linux/compiler.h> |
| #include <linux/kernel.h> |
| #include <linux/kasan.h> |
| #include <linux/kmsan.h> |
| #include <linux/module.h> |
| #include <linux/suspend.h> |
| #include <linux/ratelimit.h> |
| #include <linux/oom.h> |
| #include <linux/topology.h> |
| #include <linux/sysctl.h> |
| #include <linux/cpu.h> |
| #include <linux/cpuset.h> |
| #include <linux/pagevec.h> |
| #include <linux/memory_hotplug.h> |
| #include <linux/nodemask.h> |
| #include <linux/vmstat.h> |
| #include <linux/fault-inject.h> |
| #include <linux/compaction.h> |
| #include <trace/events/kmem.h> |
| #include <trace/events/oom.h> |
| #include <linux/prefetch.h> |
| #include <linux/mm_inline.h> |
| #include <linux/mmu_notifier.h> |
| #include <linux/migrate.h> |
| #include <linux/sched/mm.h> |
| #include <linux/page_owner.h> |
| #include <linux/page_table_check.h> |
| #include <linux/memcontrol.h> |
| #include <linux/ftrace.h> |
| #include <linux/lockdep.h> |
| #include <linux/psi.h> |
| #include <linux/khugepaged.h> |
| #include <linux/delayacct.h> |
| #include <linux/cacheinfo.h> |
| #include <linux/pgalloc_tag.h> |
| #include <asm/div64.h> |
| #include "internal.h" |
| #include "shuffle.h" |
| #include "page_reporting.h" |
| |
| /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ |
| typedef int __bitwise fpi_t; |
| |
| /* No special request */ |
| #define FPI_NONE ((__force fpi_t)0) |
| |
| /* |
| * Skip free page reporting notification for the (possibly merged) page. |
| * This does not hinder free page reporting from grabbing the page, |
| * reporting it and marking it "reported" - it only skips notifying |
| * the free page reporting infrastructure about a newly freed page. For |
| * example, used when temporarily pulling a page from a freelist and |
| * putting it back unmodified. |
| */ |
| #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) |
| |
| /* |
| * Place the (possibly merged) page to the tail of the freelist. Will ignore |
| * page shuffling (relevant code - e.g., memory onlining - is expected to |
| * shuffle the whole zone). |
| * |
| * Note: No code should rely on this flag for correctness - it's purely |
| * to allow for optimizations when handing back either fresh pages |
| * (memory onlining) or untouched pages (page isolation, free page |
| * reporting). |
| */ |
| #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) |
| |
| /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
| static DEFINE_MUTEX(pcp_batch_high_lock); |
| #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) |
| |
| #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) |
| /* |
| * On SMP, spin_trylock is sufficient protection. |
| * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. |
| */ |
| #define pcp_trylock_prepare(flags) do { } while (0) |
| #define pcp_trylock_finish(flag) do { } while (0) |
| #else |
| |
| /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ |
| #define pcp_trylock_prepare(flags) local_irq_save(flags) |
| #define pcp_trylock_finish(flags) local_irq_restore(flags) |
| #endif |
| |
| /* |
| * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid |
| * a migration causing the wrong PCP to be locked and remote memory being |
| * potentially allocated, pin the task to the CPU for the lookup+lock. |
| * preempt_disable is used on !RT because it is faster than migrate_disable. |
| * migrate_disable is used on RT because otherwise RT spinlock usage is |
| * interfered with and a high priority task cannot preempt the allocator. |
| */ |
| #ifndef CONFIG_PREEMPT_RT |
| #define pcpu_task_pin() preempt_disable() |
| #define pcpu_task_unpin() preempt_enable() |
| #else |
| #define pcpu_task_pin() migrate_disable() |
| #define pcpu_task_unpin() migrate_enable() |
| #endif |
| |
| /* |
| * Generic helper to look up and lock a per-cpu variable with an embedded |
| * spinlock. The return value should be used with the matching unlock helper. |
| */ |
| #define pcpu_spin_lock(type, member, ptr) \ |
| ({ \ |
| type *_ret; \ |
| pcpu_task_pin(); \ |
| _ret = this_cpu_ptr(ptr); \ |
| spin_lock(&_ret->member); \ |
| _ret; \ |
| }) |
| |
| #define pcpu_spin_trylock(type, member, ptr) \ |
| ({ \ |
| type *_ret; \ |
| pcpu_task_pin(); \ |
| _ret = this_cpu_ptr(ptr); \ |
| if (!spin_trylock(&_ret->member)) { \ |
| pcpu_task_unpin(); \ |
| _ret = NULL; \ |
| } \ |
| _ret; \ |
| }) |
| |
| #define pcpu_spin_unlock(member, ptr) \ |
| ({ \ |
| spin_unlock(&ptr->member); \ |
| pcpu_task_unpin(); \ |
| }) |
| |
| /* struct per_cpu_pages specific helpers. */ |
| #define pcp_spin_lock(ptr) \ |
| pcpu_spin_lock(struct per_cpu_pages, lock, ptr) |
| |
| #define pcp_spin_trylock(ptr) \ |
| pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) |
| |
| #define pcp_spin_unlock(ptr) \ |
| pcpu_spin_unlock(lock, ptr) |
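| |
| /* |
| * Typical usage sketch (illustrative only; see the actual callers further |
| * down this file): the trylock helper is paired with the UP-only IRQ |
| * guards, roughly: |
| * |
| * pcp_trylock_prepare(UP_flags); |
| * pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| * if (pcp) { |
| * ... operate on pcp->lists / pcp->count ... |
| * pcp_spin_unlock(pcp); |
| * } |
| * pcp_trylock_finish(UP_flags); |
| */ |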
| |
| #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
| DEFINE_PER_CPU(int, numa_node); |
| EXPORT_PER_CPU_SYMBOL(numa_node); |
| #endif |
| |
| DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); |
| |
| #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
| /* |
| * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
| * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. |
| * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() |
| * defined in <linux/topology.h>. |
| */ |
| DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
| EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
| #endif |
| |
| static DEFINE_MUTEX(pcpu_drain_mutex); |
| |
| #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY |
| volatile unsigned long latent_entropy __latent_entropy; |
| EXPORT_SYMBOL(latent_entropy); |
| #endif |
| |
| /* |
| * Array of node states. |
| */ |
| nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
| [N_POSSIBLE] = NODE_MASK_ALL, |
| [N_ONLINE] = { { [0] = 1UL } }, |
| #ifndef CONFIG_NUMA |
| [N_NORMAL_MEMORY] = { { [0] = 1UL } }, |
| #ifdef CONFIG_HIGHMEM |
| [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
| #endif |
| [N_MEMORY] = { { [0] = 1UL } }, |
| [N_CPU] = { { [0] = 1UL } }, |
| #endif /* NUMA */ |
| }; |
| EXPORT_SYMBOL(node_states); |
| |
| gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
| |
| #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
| unsigned int pageblock_order __read_mostly; |
| #endif |
| |
| static void __free_pages_ok(struct page *page, unsigned int order, |
| fpi_t fpi_flags); |
| |
| /* |
| * results with 256, 32 in the lowmem_reserve sysctl: |
| * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
| * 1G machine -> (16M dma, 784M normal, 224M high) |
| * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
| * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
| * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
| * |
| * TBD: should special case ZONE_DMA32 machines here - in those we normally |
| * don't need any ZONE_NORMAL reservation |
| */ |
| static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { |
| #ifdef CONFIG_ZONE_DMA |
| [ZONE_DMA] = 256, |
| #endif |
| #ifdef CONFIG_ZONE_DMA32 |
| [ZONE_DMA32] = 256, |
| #endif |
| [ZONE_NORMAL] = 32, |
| #ifdef CONFIG_HIGHMEM |
| [ZONE_HIGHMEM] = 0, |
| #endif |
| [ZONE_MOVABLE] = 0, |
| }; |
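| |
| /* |
| * For reference (illustrative summary, computed in |
| * setup_per_zone_lowmem_reserve()): for an allocation that may use zone j, |
| * a lower zone i < j keeps roughly |
| * |
| * lowmem_reserve[i][j] = (managed pages of zones i+1..j) / ratio[i] |
| * |
| * pages unavailable to that allocation, which is where the 784M/256 and |
| * 224M/32 figures in the comment above come from. |
| */ |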
| |
| char * const zone_names[MAX_NR_ZONES] = { |
| #ifdef CONFIG_ZONE_DMA |
| "DMA", |
| #endif |
| #ifdef CONFIG_ZONE_DMA32 |
| "DMA32", |
| #endif |
| "Normal", |
| #ifdef CONFIG_HIGHMEM |
| "HighMem", |
| #endif |
| "Movable", |
| #ifdef CONFIG_ZONE_DEVICE |
| "Device", |
| #endif |
| }; |
| |
| const char * const migratetype_names[MIGRATE_TYPES] = { |
| "Unmovable", |
| "Movable", |
| "Reclaimable", |
| "HighAtomic", |
| #ifdef CONFIG_CMA |
| "CMA", |
| #endif |
| #ifdef CONFIG_MEMORY_ISOLATION |
| "Isolate", |
| #endif |
| }; |
| |
| int min_free_kbytes = 1024; |
| int user_min_free_kbytes = -1; |
| static int watermark_boost_factor __read_mostly = 15000; |
| static int watermark_scale_factor = 10; |
| |
| /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| int movable_zone; |
| EXPORT_SYMBOL(movable_zone); |
| |
| #if MAX_NUMNODES > 1 |
| unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; |
| unsigned int nr_online_nodes __read_mostly = 1; |
| EXPORT_SYMBOL(nr_node_ids); |
| EXPORT_SYMBOL(nr_online_nodes); |
| #endif |
| |
| static bool page_contains_unaccepted(struct page *page, unsigned int order); |
| static void accept_page(struct page *page, unsigned int order); |
| static bool try_to_accept_memory(struct zone *zone, unsigned int order); |
| static inline bool has_unaccepted_memory(void); |
| static bool __free_unaccepted(struct page *page); |
| |
| int page_group_by_mobility_disabled __read_mostly; |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| /* |
| * During boot we initialize deferred pages on-demand, as needed, but once |
| * page_alloc_init_late() has finished, the deferred pages are all initialized, |
| * and we can permanently disable that path. |
| */ |
| DEFINE_STATIC_KEY_TRUE(deferred_pages); |
| |
| static inline bool deferred_pages_enabled(void) |
| { |
| return static_branch_unlikely(&deferred_pages); |
| } |
| |
| /* |
| * deferred_grow_zone() is __init, but it is called from |
| * get_page_from_freelist() during early boot until deferred_pages permanently |
| * disables this call. This is why we have the __ref wrapper: to avoid the |
| * section mismatch warning, and to ensure that the function body gets unloaded. |
| */ |
| static bool __ref |
| _deferred_grow_zone(struct zone *zone, unsigned int order) |
| { |
| return deferred_grow_zone(zone, order); |
| } |
| #else |
| static inline bool deferred_pages_enabled(void) |
| { |
| return false; |
| } |
| #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
| |
| /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
| static inline unsigned long *get_pageblock_bitmap(const struct page *page, |
| unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| return section_to_usemap(__pfn_to_section(pfn)); |
| #else |
| return page_zone(page)->pageblock_flags; |
| #endif /* CONFIG_SPARSEMEM */ |
| } |
| |
| static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| pfn &= (PAGES_PER_SECTION-1); |
| #else |
| pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); |
| #endif /* CONFIG_SPARSEMEM */ |
| return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
| } |
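| |
| /* |
| * Worked example (illustrative only, values depend on configuration): with |
| * CONFIG_SPARSEMEM, PAGES_PER_SECTION == 32768, pageblock_order == 9 and |
| * NR_PAGEBLOCK_BITS == 4, pfn 0x12345 becomes 0x2345 within its section, |
| * which is pageblock 17 of that section, so pfn_to_bitidx() returns |
| * 17 * 4 == 68: the block's 4 flag bits live in bits 4..7 of bitmap word 1. |
| */ |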
| |
| /** |
| * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages |
| * @page: The page within the block of interest |
| * @pfn: The target page frame number |
| * @mask: mask of bits that the caller is interested in |
| * |
| * Return: pageblock_bits flags |
| */ |
| unsigned long get_pfnblock_flags_mask(const struct page *page, |
| unsigned long pfn, unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long word; |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| /* |
| * This races, without locks, with set_pfnblock_flags_mask(). Ensure |
| * a consistent read of the memory array, so that results, even though |
| * racy, are not corrupted. |
| */ |
| word = READ_ONCE(bitmap[word_bitidx]); |
| return (word >> bitidx) & mask; |
| } |
| |
| static __always_inline int get_pfnblock_migratetype(const struct page *page, |
| unsigned long pfn) |
| { |
| return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); |
| } |
| |
| /** |
| * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
| * @page: The page within the block of interest |
| * @flags: The flags to set |
| * @pfn: The target page frame number |
| * @mask: mask of bits that the caller is interested in |
| */ |
| void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
| unsigned long pfn, |
| unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long word; |
| |
| BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
| BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| |
| VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); |
| |
| mask <<= bitidx; |
| flags <<= bitidx; |
| |
| word = READ_ONCE(bitmap[word_bitidx]); |
| do { |
| } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); |
| } |
| |
| void set_pageblock_migratetype(struct page *page, int migratetype) |
| { |
| if (unlikely(page_group_by_mobility_disabled && |
| migratetype < MIGRATE_PCPTYPES)) |
| migratetype = MIGRATE_UNMOVABLE; |
| |
| set_pfnblock_flags_mask(page, (unsigned long)migratetype, |
| page_to_pfn(page), MIGRATETYPE_MASK); |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| { |
| int ret; |
| unsigned seq; |
| unsigned long pfn = page_to_pfn(page); |
| unsigned long sp, start_pfn; |
| |
| do { |
| seq = zone_span_seqbegin(zone); |
| start_pfn = zone->zone_start_pfn; |
| sp = zone->spanned_pages; |
| ret = !zone_spans_pfn(zone, pfn); |
| } while (zone_span_seqretry(zone, seq)); |
| |
| if (ret) |
| pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
| pfn, zone_to_nid(zone), zone->name, |
| start_pfn, start_pfn + sp); |
| |
| return ret; |
| } |
| |
| /* |
| * Temporary debugging check for pages not lying within a given zone. |
| */ |
| static bool __maybe_unused bad_range(struct zone *zone, struct page *page) |
| { |
| if (page_outside_zone_boundaries(zone, page)) |
| return true; |
| if (zone != page_zone(page)) |
| return true; |
| |
| return false; |
| } |
| #else |
| static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) |
| { |
| return false; |
| } |
| #endif |
| |
| static void bad_page(struct page *page, const char *reason) |
| { |
| static unsigned long resume; |
| static unsigned long nr_shown; |
| static unsigned long nr_unshown; |
| |
| /* |
| * Allow a burst of 60 reports, then keep quiet for that minute; |
| * or allow a steady drip of one report per second. |
| */ |
| if (nr_shown == 60) { |
| if (time_before(jiffies, resume)) { |
| nr_unshown++; |
| goto out; |
| } |
| if (nr_unshown) { |
| pr_alert( |
| "BUG: Bad page state: %lu messages suppressed\n", |
| nr_unshown); |
| nr_unshown = 0; |
| } |
| nr_shown = 0; |
| } |
| if (nr_shown++ == 0) |
| resume = jiffies + 60 * HZ; |
| |
| pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
| current->comm, page_to_pfn(page)); |
| dump_page(page, reason); |
| |
| print_modules(); |
| dump_stack(); |
| out: |
| /* Leave bad fields for debug, except PageBuddy could make trouble */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
| } |
| |
| static inline unsigned int order_to_pindex(int migratetype, int order) |
| { |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (order > PAGE_ALLOC_COSTLY_ORDER) { |
| VM_BUG_ON(order != HPAGE_PMD_ORDER); |
| return NR_LOWORDER_PCP_LISTS; |
| } |
| #else |
| VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); |
| #endif |
| |
| return (MIGRATE_PCPTYPES * order) + migratetype; |
| } |
| |
| static inline int pindex_to_order(unsigned int pindex) |
| { |
| int order = pindex / MIGRATE_PCPTYPES; |
| |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (pindex == NR_LOWORDER_PCP_LISTS) |
| order = HPAGE_PMD_ORDER; |
| #else |
| VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); |
| #endif |
| |
| return order; |
| } |
| |
| static inline bool pcp_allowed_order(unsigned int order) |
| { |
| if (order <= PAGE_ALLOC_COSTLY_ORDER) |
| return true; |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (order == HPAGE_PMD_ORDER) |
| return true; |
| #endif |
| return false; |
| } |
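| |
| /* |
| * Illustrative mapping (exact values depend on configuration): with |
| * MIGRATE_PCPTYPES == 3 and PAGE_ALLOC_COSTLY_ORDER == 3, an order-2 |
| * MIGRATE_MOVABLE page lands on pcp list index 3 * 2 + 1 == 7, and with |
| * CONFIG_TRANSPARENT_HUGEPAGE the single HPAGE_PMD_ORDER list is the extra |
| * NR_LOWORDER_PCP_LISTS slot at the end. |
| */ |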
| |
| /* |
| * Higher-order pages are called "compound pages". They are structured thusly: |
| * |
| * The first PAGE_SIZE page is called the "head page" and has PG_head set. |
| * |
| * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded |
| * in bit 0 of page->compound_head. The remaining bits are a pointer to the |
| * head page. |
| * |
| * The first tail page's ->compound_order holds the order of allocation. |
| * This usage means that zero-order pages may not be compound. |
| */ |
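| |
| /* |
| * For illustration: for a tail page "tail" of head page "head", |
| * tail->compound_head == (unsigned long)head | 1, so compound_head(tail) |
| * simply clears bit 0 to recover the head, and PageTail() tests bit 0. |
| */ |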
| |
| void prep_compound_page(struct page *page, unsigned int order) |
| { |
| int i; |
| int nr_pages = 1 << order; |
| |
| __SetPageHead(page); |
| for (i = 1; i < nr_pages; i++) |
| prep_compound_tail(page, i); |
| |
| prep_compound_head(page, order); |
| } |
| |
| static inline void set_buddy_order(struct page *page, unsigned int order) |
| { |
| set_page_private(page, order); |
| __SetPageBuddy(page); |
| } |
| |
| #ifdef CONFIG_COMPACTION |
| static inline struct capture_control *task_capc(struct zone *zone) |
| { |
| struct capture_control *capc = current->capture_control; |
| |
| return unlikely(capc) && |
| !(current->flags & PF_KTHREAD) && |
| !capc->page && |
| capc->cc->zone == zone ? capc : NULL; |
| } |
| |
| static inline bool |
| compaction_capture(struct capture_control *capc, struct page *page, |
| int order, int migratetype) |
| { |
| if (!capc || order != capc->cc->order) |
| return false; |
| |
| /* Do not accidentally pollute CMA or isolated regions */ |
| if (is_migrate_cma(migratetype) || |
| is_migrate_isolate(migratetype)) |
| return false; |
| |
| /* |
| * Do not let lower order allocations pollute a movable pageblock |
| * unless compaction is also requesting movable pages. |
| * This might let an unmovable request use a reclaimable pageblock |
| * and vice-versa but no more than normal fallback logic which can |
| * have trouble finding a high-order free page. |
| */ |
| if (order < pageblock_order && migratetype == MIGRATE_MOVABLE && |
| capc->cc->migratetype != MIGRATE_MOVABLE) |
| return false; |
| |
| capc->page = page; |
| return true; |
| } |
| |
| #else |
| static inline struct capture_control *task_capc(struct zone *zone) |
| { |
| return NULL; |
| } |
| |
| static inline bool |
| compaction_capture(struct capture_control *capc, struct page *page, |
| int order, int migratetype) |
| { |
| return false; |
| } |
| #endif /* CONFIG_COMPACTION */ |
| |
| static inline void account_freepages(struct zone *zone, int nr_pages, |
| int migratetype) |
| { |
| if (is_migrate_isolate(migratetype)) |
| return; |
| |
| __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); |
| |
| if (is_migrate_cma(migratetype)) |
| __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); |
| } |
| |
| /* Used for pages not on another list */ |
| static inline void __add_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype, |
| bool tail) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, |
| "page type is %lu, passed migratetype is %d (nr=%d)\n", |
| get_pageblock_migratetype(page), migratetype, 1 << order); |
| |
| if (tail) |
| list_add_tail(&page->buddy_list, &area->free_list[migratetype]); |
| else |
| list_add(&page->buddy_list, &area->free_list[migratetype]); |
| area->nr_free++; |
| } |
| |
| /* |
| * Used for pages which are on another list. Move the pages to the tail |
| * of the list - so the moved pages won't immediately be considered for |
| * allocation again (e.g., optimization for memory onlining). |
| */ |
| static inline void move_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int old_mt, int new_mt) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| /* Free page moving can fail, so it happens before the type update */ |
| VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, |
| "page type is %lu, passed migratetype is %d (nr=%d)\n", |
| get_pageblock_migratetype(page), old_mt, 1 << order); |
| |
| list_move_tail(&page->buddy_list, &area->free_list[new_mt]); |
| |
| account_freepages(zone, -(1 << order), old_mt); |
| account_freepages(zone, 1 << order, new_mt); |
| } |
| |
| static inline void __del_page_from_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, |
| "page type is %lu, passed migratetype is %d (nr=%d)\n", |
| get_pageblock_migratetype(page), migratetype, 1 << order); |
| |
| /* clear reported state and update reported page count */ |
| if (page_reported(page)) |
| __ClearPageReported(page); |
| |
| list_del(&page->buddy_list); |
| __ClearPageBuddy(page); |
| set_page_private(page, 0); |
| zone->free_area[order].nr_free--; |
| } |
| |
| static inline void del_page_from_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| __del_page_from_free_list(page, zone, order, migratetype); |
| account_freepages(zone, -(1 << order), migratetype); |
| } |
| |
| static inline struct page *get_page_from_free_area(struct free_area *area, |
| int migratetype) |
| { |
| return list_first_entry_or_null(&area->free_list[migratetype], |
| struct page, buddy_list); |
| } |
| |
| /* |
| * If this is not the largest possible page, check if the buddy |
| * of the next-highest order is free. If it is, it's possible |
| * that pages are being freed that will coalesce soon. If that is |
| * happening, add the free page to the tail of the list so it's less |
| * likely to be used soon and more likely to be merged as a |
| * higher-order page. |
| */ |
| static inline bool |
| buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, |
| struct page *page, unsigned int order) |
| { |
| unsigned long higher_page_pfn; |
| struct page *higher_page; |
| |
| if (order >= MAX_PAGE_ORDER - 1) |
| return false; |
| |
| higher_page_pfn = buddy_pfn & pfn; |
| higher_page = page + (higher_page_pfn - pfn); |
| |
| return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, |
| NULL) != NULL; |
| } |
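| |
| /* |
| * Worked example (illustrative): an order-3 page at pfn 0x108 is freed but |
| * its buddy at pfn 0x100 is still allocated, so no merge happened. The |
| * order-4 page the pair would eventually form starts at 0x108 & 0x100 == |
| * 0x100, and that page's own order-4 buddy sits at pfn 0x110; if it is |
| * already free, the area is likely to coalesce, so queue at the tail. |
| */ |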
| |
| /* |
| * Freeing function for a buddy system allocator. |
| * |
| * The concept of a buddy system is to maintain a direct-mapped table |
| * (containing bit values) for memory blocks of various "orders". |
| * The bottom level table contains the map for the smallest allocatable |
| * units of memory (here, pages), and each level above it describes |
| * pairs of units from the levels below, hence, "buddies". |
| * At a high level, all that happens here is marking the table entry |
| * at the bottom level available, and propagating the changes upward |
| * as necessary, plus some accounting needed to play nicely with other |
| * parts of the VM system. |
| * At each level, we keep a list of pages, which are heads of contiguous |
| * free pages of length (1 << order) and marked with PageBuddy. |
| * A page's order is recorded in the page_private(page) field. |
| * So when we are allocating or freeing one, we can derive the state of the |
| * other. That is, if we allocate a small block, and both were |
| * free, the remainder of the region must be split into blocks. |
| * If a block is freed, and its buddy is also free, then this |
| * triggers coalescing into a block of larger size. |
| * |
| * -- nyc |
| */ |
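| |
| /* |
| * Worked example (illustrative): the buddy of a block is found by flipping |
| * one pfn bit, __find_buddy_pfn(pfn, order) == pfn ^ (1 << order). Freeing |
| * the order-0 page at pfn 8 merges with a free pfn 9 into an order-1 block |
| * at pfn 8, whose buddy is pfn 10; if that is free too, they merge into an |
| * order-2 block at pfn 8, and so on up the orders. |
| */ |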
| |
| static inline void __free_one_page(struct page *page, |
| unsigned long pfn, |
| struct zone *zone, unsigned int order, |
| int migratetype, fpi_t fpi_flags) |
| { |
| struct capture_control *capc = task_capc(zone); |
| unsigned long buddy_pfn = 0; |
| unsigned long combined_pfn; |
| struct page *buddy; |
| bool to_tail; |
| |
| VM_BUG_ON(!zone_is_initialized(zone)); |
| VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
| |
| VM_BUG_ON(migratetype == -1); |
| VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); |
| VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| |
| account_freepages(zone, 1 << order, migratetype); |
| |
| while (order < MAX_PAGE_ORDER) { |
| int buddy_mt = migratetype; |
| |
| if (compaction_capture(capc, page, order, migratetype)) { |
| account_freepages(zone, -(1 << order), migratetype); |
| return; |
| } |
| |
| buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); |
| if (!buddy) |
| goto done_merging; |
| |
| if (unlikely(order >= pageblock_order)) { |
| /* |
| * We want to prevent merging between freepages in a pageblock |
| * without fallbacks and those in a normal pageblock. Without this, |
| * pageblock isolation could cause incorrect freepage, CMA or |
| * HIGHATOMIC accounting. |
| */ |
| buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); |
| |
| if (migratetype != buddy_mt && |
| (!migratetype_is_mergeable(migratetype) || |
| !migratetype_is_mergeable(buddy_mt))) |
| goto done_merging; |
| } |
| |
| /* |
| * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page; |
| * merge with it and move up one order. |
| */ |
| if (page_is_guard(buddy)) |
| clear_page_guard(zone, buddy, order); |
| else |
| __del_page_from_free_list(buddy, zone, order, buddy_mt); |
| |
| if (unlikely(buddy_mt != migratetype)) { |
| /* |
| * Match buddy type. This ensures that an |
| * expand() down the line puts the sub-blocks |
| * on the right freelists. |
| */ |
| set_pageblock_migratetype(buddy, migratetype); |
| } |
| |
| combined_pfn = buddy_pfn & pfn; |
| page = page + (combined_pfn - pfn); |
| pfn = combined_pfn; |
| order++; |
| } |
| |
| done_merging: |
| set_buddy_order(page, order); |
| |
| if (fpi_flags & FPI_TO_TAIL) |
| to_tail = true; |
| else if (is_shuffle_order(order)) |
| to_tail = shuffle_pick_tail(); |
| else |
| to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); |
| |
| __add_to_free_list(page, zone, order, migratetype, to_tail); |
| |
| /* Notify page reporting subsystem of freed page */ |
| if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) |
| page_reporting_notify_free(order); |
| } |
| |
| /* |
| * A bad page could be due to a number of fields. Instead of multiple branches, |
| * try and check multiple fields with one check. The caller must do a detailed |
| * check if necessary. |
| */ |
| static inline bool page_expected_state(struct page *page, |
| unsigned long check_flags) |
| { |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| return false; |
| |
| if (unlikely((unsigned long)page->mapping | |
| page_ref_count(page) | |
| #ifdef CONFIG_MEMCG |
| page->memcg_data | |
| #endif |
| #ifdef CONFIG_PAGE_POOL |
| ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | |
| #endif |
| (page->flags & check_flags))) |
| return false; |
| |
| return true; |
| } |
| |
| static const char *page_bad_reason(struct page *page, unsigned long flags) |
| { |
| const char *bad_reason = NULL; |
| |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| bad_reason = "nonzero mapcount"; |
| if (unlikely(page->mapping != NULL)) |
| bad_reason = "non-NULL mapping"; |
| if (unlikely(page_ref_count(page) != 0)) |
| bad_reason = "nonzero _refcount"; |
| if (unlikely(page->flags & flags)) { |
| if (flags == PAGE_FLAGS_CHECK_AT_PREP) |
| bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; |
| else |
| bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
| } |
| #ifdef CONFIG_MEMCG |
| if (unlikely(page->memcg_data)) |
| bad_reason = "page still charged to cgroup"; |
| #endif |
| #ifdef CONFIG_PAGE_POOL |
| if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) |
| bad_reason = "page_pool leak"; |
| #endif |
| return bad_reason; |
| } |
| |
| static void free_page_is_bad_report(struct page *page) |
| { |
| bad_page(page, |
| page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); |
| } |
| |
| static inline bool free_page_is_bad(struct page *page) |
| { |
| if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) |
| return false; |
| |
| /* Something has gone sideways, find it */ |
| free_page_is_bad_report(page); |
| return true; |
| } |
| |
| static inline bool is_check_pages_enabled(void) |
| { |
| return static_branch_unlikely(&check_pages_enabled); |
| } |
| |
| static int free_tail_page_prepare(struct page *head_page, struct page *page) |
| { |
| struct folio *folio = (struct folio *)head_page; |
| int ret = 1; |
| |
| /* |
| * We rely on page->lru.next never having bit 0 set, unless the page |
| * is PageTail(). Let's make sure that's true even for poisoned ->lru. |
| */ |
| BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); |
| |
| if (!is_check_pages_enabled()) { |
| ret = 0; |
| goto out; |
| } |
| switch (page - head_page) { |
| case 1: |
| /* the first tail page: these may be in place of ->mapping */ |
| if (unlikely(folio_entire_mapcount(folio))) { |
| bad_page(page, "nonzero entire_mapcount"); |
| goto out; |
| } |
| if (unlikely(folio_large_mapcount(folio))) { |
| bad_page(page, "nonzero large_mapcount"); |
| goto out; |
| } |
| if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { |
| bad_page(page, "nonzero nr_pages_mapped"); |
| goto out; |
| } |
| if (unlikely(atomic_read(&folio->_pincount))) { |
| bad_page(page, "nonzero pincount"); |
| goto out; |
| } |
| break; |
| case 2: |
| /* the second tail page: deferred_list overlaps ->mapping */ |
| if (unlikely(!list_empty(&folio->_deferred_list))) { |
| bad_page(page, "on deferred list"); |
| goto out; |
| } |
| break; |
| default: |
| if (page->mapping != TAIL_MAPPING) { |
| bad_page(page, "corrupted mapping in tail page"); |
| goto out; |
| } |
| break; |
| } |
| if (unlikely(!PageTail(page))) { |
| bad_page(page, "PageTail not set"); |
| goto out; |
| } |
| if (unlikely(compound_head(page) != head_page)) { |
| bad_page(page, "compound_head not consistent"); |
| goto out; |
| } |
| ret = 0; |
| out: |
| page->mapping = NULL; |
| clear_compound_head(page); |
| return ret; |
| } |
| |
| /* |
| * Skip KASAN memory poisoning when either: |
| * |
| * 1. For generic KASAN: deferred memory initialization has not yet completed. |
| * Tag-based KASAN modes skip pages freed via deferred memory initialization |
| * using page tags instead (see below). |
| * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating |
| * that error detection is disabled for accesses via the page address. |
| * |
| * Pages will have match-all tags in the following circumstances: |
| * |
| * 1. Pages are being initialized for the first time, including during deferred |
| * memory init; see the call to page_kasan_tag_reset in __init_single_page. |
| * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the |
| * exception of pages unpoisoned by kasan_unpoison_vmalloc. |
| * 3. The allocation was excluded from being checked due to sampling, |
| * see the call to kasan_unpoison_pages. |
| * |
| * Poisoning pages during deferred memory init will greatly lengthen the |
| * process and cause problems on large-memory systems, as the deferred page |
| * initialization is done with interrupts disabled. |
| * |
| * Assuming that there will be no reference to those newly initialized |
| * pages before they are ever allocated, this should have no effect on |
| * KASAN memory tracking as the poison will be properly inserted at page |
| * allocation time. The only corner case is when pages are allocated by |
| * on-demand allocation and then freed again before the deferred pages |
| * initialization is done, but this is not likely to happen. |
| */ |
| static inline bool should_skip_kasan_poison(struct page *page) |
| { |
| if (IS_ENABLED(CONFIG_KASAN_GENERIC)) |
| return deferred_pages_enabled(); |
| |
| return page_kasan_tag(page) == KASAN_TAG_KERNEL; |
| } |
| |
| void kernel_init_pages(struct page *page, int numpages) |
| { |
| int i; |
| |
| /* s390's use of memset() could override KASAN redzones. */ |
| kasan_disable_current(); |
| for (i = 0; i < numpages; i++) |
| clear_highpage_kasan_tagged(page + i); |
| kasan_enable_current(); |
| } |
| |
| __always_inline bool free_pages_prepare(struct page *page, |
| unsigned int order) |
| { |
| int bad = 0; |
| bool skip_kasan_poison = should_skip_kasan_poison(page); |
| bool init = want_init_on_free(); |
| bool compound = PageCompound(page); |
| |
| VM_BUG_ON_PAGE(PageTail(page), page); |
| |
| trace_mm_page_free(page, order); |
| kmsan_free_page(page, order); |
| |
| if (memcg_kmem_online() && PageMemcgKmem(page)) |
| __memcg_kmem_uncharge_page(page, order); |
| |
| if (unlikely(PageHWPoison(page)) && !order) { |
| /* Do not let hwpoison pages hit pcplists/buddy */ |
| reset_page_owner(page, order); |
| page_table_check_free(page, order); |
| pgalloc_tag_sub(page, 1 << order); |
| return false; |
| } |
| |
| VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
| |
| /* |
| * Check tail pages before head page information is cleared to |
| * avoid checking PageCompound for order-0 pages. |
| */ |
| if (unlikely(order)) { |
| int i; |
| |
| if (compound) |
| page[1].flags &= ~PAGE_FLAGS_SECOND; |
| for (i = 1; i < (1 << order); i++) { |
| if (compound) |
| bad += free_tail_page_prepare(page, page + i); |
| if (is_check_pages_enabled()) { |
| if (free_page_is_bad(page + i)) { |
| bad++; |
| continue; |
| } |
| } |
| (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| } |
| } |
| if (PageMappingFlags(page)) |
| page->mapping = NULL; |
| if (is_check_pages_enabled()) { |
| if (free_page_is_bad(page)) |
| bad++; |
| if (bad) |
| return false; |
| } |
| |
| page_cpupid_reset_last(page); |
| page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| reset_page_owner(page, order); |
| page_table_check_free(page, order); |
| pgalloc_tag_sub(page, 1 << order); |
| |
| if (!PageHighMem(page)) { |
| debug_check_no_locks_freed(page_address(page), |
| PAGE_SIZE << order); |
| debug_check_no_obj_freed(page_address(page), |
| PAGE_SIZE << order); |
| } |
| |
| kernel_poison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN poisoning and memory initialization code must be |
| * kept together to avoid discrepancies in behavior. |
| * |
| * With hardware tag-based KASAN, memory tags must be set before the |
| * page becomes unavailable via debug_pagealloc or arch_free_page. |
| */ |
| if (!skip_kasan_poison) { |
| kasan_poison_pages(page, order, init); |
| |
| /* Memory is already initialized if KASAN did it internally. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } |
| if (init) |
| kernel_init_pages(page, 1 << order); |
| |
| /* |
| * arch_free_page() can make the page's contents inaccessible. s390 |
| * does this. So nothing which can access the page's contents should |
| * happen after this. |
| */ |
| arch_free_page(page, order); |
| |
| debug_pagealloc_unmap_pages(page, 1 << order); |
| |
| return true; |
| } |
| |
| /* |
| * Frees a number of pages from the PCP lists |
| * Assumes all pages on list are in same zone. |
| * count is the number of pages to free. |
| */ |
| static void free_pcppages_bulk(struct zone *zone, int count, |
| struct per_cpu_pages *pcp, |
| int pindex) |
| { |
| unsigned long flags; |
| unsigned int order; |
| struct page *page; |
| |
| /* |
| * Ensure a proper count is passed; otherwise we would get stuck in the |
| * while (list_empty(list)) loop below. |
| */ |
| count = min(pcp->count, count); |
| |
| /* Ensure requested pindex is drained first. */ |
| pindex = pindex - 1; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| while (count > 0) { |
| struct list_head *list; |
| int nr_pages; |
| |
| /* Remove pages from lists in a round-robin fashion. */ |
| do { |
| if (++pindex > NR_PCP_LISTS - 1) |
| pindex = 0; |
| list = &pcp->lists[pindex]; |
| } while (list_empty(list)); |
| |
| order = pindex_to_order(pindex); |
| nr_pages = 1 << order; |
| do { |
| unsigned long pfn; |
| int mt; |
| |
| page = list_last_entry(list, struct page, pcp_list); |
| pfn = page_to_pfn(page); |
| mt = get_pfnblock_migratetype(page, pfn); |
| |
| /* must delete to avoid corrupting pcp list */ |
| list_del(&page->pcp_list); |
| count -= nr_pages; |
| pcp->count -= nr_pages; |
| |
| __free_one_page(page, pfn, zone, order, mt, FPI_NONE); |
| trace_mm_page_pcpu_drain(page, order, mt); |
| } while (count > 0 && !list_empty(list)); |
| } |
| |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| static void free_one_page(struct zone *zone, struct page *page, |
| unsigned long pfn, unsigned int order, |
| fpi_t fpi_flags) |
| { |
| unsigned long flags; |
| int migratetype; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| static void __free_pages_ok(struct page *page, unsigned int order, |
| fpi_t fpi_flags) |
| { |
| unsigned long pfn = page_to_pfn(page); |
| struct zone *zone = page_zone(page); |
| |
| if (!free_pages_prepare(page, order)) |
| return; |
| |
| free_one_page(zone, page, pfn, order, fpi_flags); |
| |
| __count_vm_events(PGFREE, 1 << order); |
| } |
| |
| void __free_pages_core(struct page *page, unsigned int order) |
| { |
| unsigned int nr_pages = 1 << order; |
| struct page *p = page; |
| unsigned int loop; |
| |
| /* |
| * When initializing the memmap, __init_single_page() sets the refcount |
| * of all pages to 1 ("allocated"/"not free"). We have to set the |
| * refcount of all involved pages to 0. |
| */ |
| prefetchw(p); |
| for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
| prefetchw(p + 1); |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| } |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| |
| atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
| |
| if (page_contains_unaccepted(page, order)) { |
| if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) |
| return; |
| |
| accept_page(page, order); |
| } |
| |
| /* |
| * Bypass PCP and place fresh pages right to the tail, primarily |
| * relevant for memory onlining. |
| */ |
| __free_pages_ok(page, order, FPI_TO_TAIL); |
| } |
| |
| /* |
| * Check that the whole (or subset of) a pageblock given by the interval of |
| * [start_pfn, end_pfn) is valid and within the same zone, before scanning it |
| * with the migration or free scanner of compaction. |
| * |
| * Return struct page pointer of start_pfn, or NULL if checks were not passed. |
| * |
| * It's possible on some configurations to have a setup like node0 node1 node0 |
| * i.e. it's possible that all pages within a zone's range of pages do not |
| * belong to a single zone. We assume that a border between node0 and node1 |
| * can occur within a single pageblock, but not a node0 node1 node0 |
| * interleaving within a single pageblock. It is therefore sufficient to check |
| * the first and last page of a pageblock and avoid checking each individual |
| * page in a pageblock. |
| * |
| * Note: the function may return non-NULL struct page even for a page block |
| * which contains a memory hole (i.e. there is no physical memory for a subset |
| * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, the |
| * block will fall into 2 sub-sections, and the end pfn of the pageblock may be a |
| * hole even though the start pfn is online and valid. This should be safe most of |
| * the time because struct pages are still initialized via init_unavailable_range() |
| * and pfn walkers shouldn't touch any physical memory range for which they do |
| * not recognize any specific metadata in struct pages. |
| */ |
| struct page *__pageblock_pfn_to_page(unsigned long start_pfn, |
| unsigned long end_pfn, struct zone *zone) |
| { |
| struct page *start_page; |
| struct page *end_page; |
| |
| /* end_pfn is one past the range we are checking */ |
| end_pfn--; |
| |
| if (!pfn_valid(end_pfn)) |
| return NULL; |
| |
| start_page = pfn_to_online_page(start_pfn); |
| if (!start_page) |
| return NULL; |
| |
| if (page_zone(start_page) != zone) |
| return NULL; |
| |
| end_page = pfn_to_page(end_pfn); |
| |
| /* This gives a shorter code than deriving page_zone(end_page) */ |
| if (page_zone_id(start_page) != page_zone_id(end_page)) |
| return NULL; |
| |
| return start_page; |
| } |
| |
| /* |
| * The order of subdivision here is critical for the IO subsystem. |
| * Please do not alter this order without good reasons and regression |
| * testing. Specifically, as large blocks of memory are subdivided, |
| * the order in which smaller blocks are delivered depends on the order |
| * they're subdivided in this function. This is the primary factor |
| * influencing the order in which pages are delivered to the IO |
| * subsystem according to empirical testing, and this is also justified |
| * by considering the behavior of a buddy system containing a single |
| * large block of memory acted on by a series of small allocations. |
| * This behavior is a critical factor in sglist merging's success. |
| * |
| * -- nyc |
| */ |
| static inline void expand(struct zone *zone, struct page *page, |
| int low, int high, int migratetype) |
| { |
| unsigned long size = 1 << high; |
| unsigned long nr_added = 0; |
| |
| while (high > low) { |
| high--; |
| size >>= 1; |
| VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| |
| /* |
| * Mark as guard page(s); this allows the memory to be merged |
| * back into the allocator when the buddy is freed. The |
| * corresponding page table entries will not be touched; the |
| * pages will stay not-present in the virtual address space. |
| */ |
| if (set_page_guard(zone, &page[size], high)) |
| continue; |
| |
| __add_to_free_list(&page[size], zone, high, migratetype, false); |
| set_buddy_order(&page[size], high); |
| nr_added += size; |
| } |
| account_freepages(zone, nr_added, migratetype); |
| } |
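| |
| /* |
| * Worked example (illustrative): expand(zone, page, low = 2, high = 5, mt) |
| * splits an order-5 block of 32 pages just taken off the free list. The |
| * loop returns pages [16..31] as an order-4 block, [8..15] as order-3 and |
| * [4..7] as order-2 to the mt freelists, leaving pages [0..3] for the |
| * caller's order-2 allocation. |
| */ |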
| |
| static void check_new_page_bad(struct page *page) |
| { |
| if (unlikely(page->flags & __PG_HWPOISON)) { |
| /* Don't complain about hwpoisoned pages */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| return; |
| } |
| |
| bad_page(page, |
| page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); |
| } |
| |
| /* |
| * This page is about to be returned from the page allocator |
| */ |
| static bool check_new_page(struct page *page) |
| { |
| if (likely(page_expected_state(page, |
| PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) |
| return false; |
| |
| check_new_page_bad(page); |
| return true; |
| } |
| |
| static inline bool check_new_pages(struct page *page, unsigned int order) |
| { |
| if (is_check_pages_enabled()) { |
| for (int i = 0; i < (1 << order); i++) { |
| struct page *p = page + i; |
| |
| if (check_new_page(p)) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| static inline bool should_skip_kasan_unpoison(gfp_t flags) |
| { |
| /* Don't skip if a software KASAN mode is enabled. */ |
| if (IS_ENABLED(CONFIG_KASAN_GENERIC) || |
| IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
| return false; |
| |
| /* Skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return true; |
| |
| /* |
| * With hardware tag-based KASAN enabled, skip if this has been |
| * requested via __GFP_SKIP_KASAN. |
| */ |
| return flags & __GFP_SKIP_KASAN; |
| } |
| |
| static inline bool should_skip_init(gfp_t flags) |
| { |
| /* Don't skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return false; |
| |
| /* For hardware tag-based KASAN, skip if requested. */ |
| return (flags & __GFP_SKIP_ZERO); |
| } |
| |
| inline void post_alloc_hook(struct page *page, unsigned int order, |
| gfp_t gfp_flags) |
| { |
| bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && |
| !should_skip_init(gfp_flags); |
| bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); |
| int i; |
| |
| set_page_private(page, 0); |
| set_page_refcounted(page); |
| |
| arch_alloc_page(page, order); |
| debug_pagealloc_map_pages(page, 1 << order); |
| |
| /* |
| * Page unpoisoning must happen before memory initialization. |
| * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
| * allocations and the page unpoisoning code will complain. |
| */ |
| kernel_unpoison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN unpoisoning and memory initialization code must be |
| * kept together to avoid discrepancies in behavior. |
| */ |
| |
| /* |
| * If memory tags should be zeroed |
| * (which happens only when memory should be initialized as well). |
| */ |
| if (zero_tags) { |
| /* Initialize both memory and memory tags. */ |
| for (i = 0; i != 1 << order; ++i) |
| tag_clear_highpage(page + i); |
| |
| /* Take note that memory was initialized by the loop above. */ |
| init = false; |
| } |
| if (!should_skip_kasan_unpoison(gfp_flags) && |
| kasan_unpoison_pages(page, order, init)) { |
| /* Take note that memory was initialized by KASAN. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } else { |
| /* |
| * If memory tags have not been set by KASAN, reset the page |
| * tags to ensure page_address() dereferencing does not fault. |
| */ |
| for (i = 0; i != 1 << order; ++i) |
| page_kasan_tag_reset(page + i); |
| } |
| /* If memory is still not initialized, initialize it now. */ |
| if (init) |
| kernel_init_pages(page, 1 << order); |
| |
| set_page_owner(page, order, gfp_flags); |
| page_table_check_alloc(page, order); |
| pgalloc_tag_add(page, current, 1 << order); |
| } |
| |
| static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| unsigned int alloc_flags) |
| { |
| post_alloc_hook(page, order, gfp_flags); |
| |
| if (order && (gfp_flags & __GFP_COMP)) |
| prep_compound_page(page, order); |
| |
| /* |
| * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to |
| * allocate the page. The expectation is that the caller is taking |
| * steps that will free more memory. The caller should avoid the page |
| * being used for !PFMEMALLOC purposes. |
| */ |
| if (alloc_flags & ALLOC_NO_WATERMARKS) |
| set_page_pfmemalloc(page); |
| else |
| clear_page_pfmemalloc(page); |
| } |
| |
| /* |
| * Go through the free lists for the given migratetype and remove |
| * the smallest available page from the freelists |
| */ |
| static __always_inline |
| struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| int migratetype) |
| { |
| unsigned int current_order; |
| struct free_area *area; |
| struct page *page; |
| |
| /* Find a page of the appropriate size in the preferred list */ |
| for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { |
| area = &(zone->free_area[current_order]); |
| page = get_page_from_free_area(area, migratetype); |
| if (!page) |
| continue; |
| del_page_from_free_list(page, zone, current_order, migratetype); |
| expand(zone, page, order, current_order, migratetype); |
| trace_mm_page_alloc_zone_locked(page, order, migratetype, |
| pcp_allowed_order(order) && |
| migratetype < MIGRATE_PCPTYPES); |
| return page; |
| } |
| |
| return NULL; |
| } |
| |
| |
| /* |
| * This array describes the order in which the free lists are fallen back |
| * on when the free lists for the desired migratetype are depleted. |
| * |
| * The other migratetypes do not have fallbacks. |
| */ |
| static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = { |
| [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, |
| [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, |
| [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, |
| }; |
| |
| #ifdef CONFIG_CMA |
| static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) |
| { |
| return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
| } |
| #else |
| static inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) { return NULL; } |
| #endif |
| |
| /* |
| * Change the type of a block and move all its free pages to that |
| * type's freelist. |
| */ |
| static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, |
| int old_mt, int new_mt) |
| { |
| struct page *page; |
| unsigned long pfn, end_pfn; |
| unsigned int order; |
| int pages_moved = 0; |
| |
| VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); |
| end_pfn = pageblock_end_pfn(start_pfn); |
| |
| for (pfn = start_pfn; pfn < end_pfn;) { |
| page = pfn_to_page(pfn); |
| if (!PageBuddy(page)) { |
| pfn++; |
| continue; |
| } |
| |
| /* Make sure we are not inadvertently changing nodes */ |
| VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
| VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
| |
| order = buddy_order(page); |
| |
| move_to_free_list(page, zone, order, old_mt, new_mt); |
| |
| pfn += 1 << order; |
| pages_moved += 1 << order; |
| } |
| |
| set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); |
| |
| return pages_moved; |
| } |
| |
| static bool prep_move_freepages_block(struct zone *zone, struct page *page, |
| unsigned long *start_pfn, |
| int *num_free, int *num_movable) |
| { |
| unsigned long pfn, start, end; |
| |
| pfn = page_to_pfn(page); |
| start = pageblock_start_pfn(pfn); |
| end = pageblock_end_pfn(pfn); |
| |
| /* |
| * The caller only has the lock for @zone, don't touch ranges |
| * that straddle into other zones. While we could move part of |
| * the range that's inside the zone, this call is usually |
| * accompanied by other operations such as migratetype updates |
| * which also should be locked. |
| */ |
| if (!zone_spans_pfn(zone, start)) |
| return false; |
| if (!zone_spans_pfn(zone, end - 1)) |
| return false; |
| |
| *start_pfn = start; |
| |
| if (num_free) { |
| *num_free = 0; |
| *num_movable = 0; |
| for (pfn = start; pfn < end;) { |
| page = pfn_to_page(pfn); |
| if (PageBuddy(page)) { |
| int nr = 1 << buddy_order(page); |
| |
| *num_free += nr; |
| pfn += nr; |
| continue; |
| } |
| /* |
| * We assume that pages that could be isolated for |
| * migration are movable. But we don't actually try |
| * isolating, as that would be expensive. |
| */ |
| if (PageLRU(page) || __PageMovable(page)) |
| (*num_movable)++; |
| pfn++; |
| } |
| } |
| |
| return true; |
| } |
| |
| static int move_freepages_block(struct zone *zone, struct page *page, |
| int old_mt, int new_mt) |
| { |
| unsigned long start_pfn; |
| |
| if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) |
| return -1; |
| |
| return __move_freepages_block(zone, start_pfn, old_mt, new_mt); |
| } |
| |
| #ifdef CONFIG_MEMORY_ISOLATION |
| /* Look for a buddy that straddles start_pfn */ |
| static unsigned long find_large_buddy(unsigned long start_pfn) |
| { |
| int order = 0; |
| struct page *page; |
| unsigned long pfn = start_pfn; |
| |
| while (!PageBuddy(page = pfn_to_page(pfn))) { |
| /* Nothing found */ |
| if (++order > MAX_PAGE_ORDER) |
| return start_pfn; |
| pfn &= ~0UL << order; |
| } |
| |
| /* |
| * Found a preceding buddy, but does it straddle? |
| */ |
| if (pfn + (1 << buddy_order(page)) > start_pfn) |
| return pfn; |
| |
| /* Nothing found */ |
| return start_pfn; |
| } |
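| |
| /* |
| * Worked example (illustrative, assuming MAX_PAGE_ORDER == 10): with |
| * start_pfn == 0x1234 inside a free order-10 buddy at pfn 0x1000, the loop |
| * rounds pfn down order by order (0x1234, 0x1230, 0x1220, ..., 0x1000) |
| * until PageBuddy() is found at 0x1000; since 0x1000 + (1 << 10) > 0x1234, |
| * that buddy straddles start_pfn and 0x1000 is returned. |
| */ |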
| |
| /* Split a multi-block free page into its individual pageblocks */ |
| static void split_large_buddy(struct zone *zone, struct page *page, |
| unsigned long pfn, int order) |
| { |
| unsigned long end_pfn = pfn + (1 << order); |
| |
| VM_WARN_ON_ONCE(order <= pageblock_order); |
| VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); |
| |
| /* Caller removed page from freelist, buddy info cleared! */ |
| VM_WARN_ON_ONCE(PageBuddy(page)); |
| |
| while (pfn != end_pfn) { |
| int mt = get_pfnblock_migratetype(page, pfn); |
| |
| __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); |
| pfn += pageblock_nr_pages; |
| page = pfn_to_page(pfn); |
| } |
| } |
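| |
| /* |
| * Worked example (illustrative, assuming pageblock_order == 9 and |
| * MAX_PAGE_ORDER == 10): splitting an order-10 buddy at pfn 0x1000 frees |
| * the two constituent pageblocks at pfn 0x1000 and pfn 0x1200 separately, |
| * each with whatever migratetype its own pageblock bits carry. |
| */ |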
| |
| /** |
| * move_freepages_block_isolate - move free pages in block for page isolation |
| * @zone: the zone |
| * @page: the pageblock page |
| * @migratetype: migratetype to set on the pageblock |
| * |
| * This is similar to move_freepages_block(), but handles the special |
| * case encountered in page isolation, where the block of interest |
| * might be part of a larger buddy spanning multiple pageblocks. |
| * |
| * Unlike the regular page allocator path, which moves pages while |
| * stealing buddies off the freelist, page isolation is interested in |
| * arbitrary pfn ranges that may have overlapping buddies on both ends. |
| * |
| * This function handles that. Straddling buddies are split into |
| * individual pageblocks. Only the block of interest is moved. |
| * |
| * Returns %true if pages could be moved, %false otherwise. |
| */ |
| bool move_freepages_block_isolate(struct zone *zone, struct page *page, |
| int migratetype) |
| { |
| unsigned long start_pfn, pfn; |
| |
| if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) |
| return false; |
| |
| /* No splits needed if buddies can't span multiple blocks */ |
| if (pageblock_order == MAX_PAGE_ORDER) |
| goto move; |
| |
| /* We're a tail block in a larger buddy */ |
| pfn = find_large_buddy(start_pfn); |
| if (pfn != start_pfn) { |
| struct page *buddy = pfn_to_page(pfn); |
| int order = buddy_order(buddy); |
| |
| del_page_from_free_list(buddy, zone, order, |
| get_pfnblock_migratetype(buddy, pfn)); |
| set_pageblock_migratetype(page, migratetype); |
| split_large_buddy(zone, buddy, pfn, order); |
| return true; |
| } |
| |
| /* We're the starting block of a larger buddy */ |
| if (PageBuddy(page) && buddy_order(page) > pageblock_order) { |
| int order = buddy_order(page); |
| |
| del_page_from_free_list(page, zone, order, |
| get_pfnblock_migratetype(page, pfn)); |
| set_pageblock_migratetype(page, migratetype); |
| split_large_buddy(zone, page, pfn, order); |
| return true; |
| } |
| move: |
| __move_freepages_block(zone, start_pfn, |
| get_pfnblock_migratetype(page, start_pfn), |
| migratetype); |
| return true; |
| } |
| #endif /* CONFIG_MEMORY_ISOLATION */ |
| |
| static void change_pageblock_range(struct page *pageblock_page, |
| int start_order, int migratetype) |
| { |
| int nr_pageblocks = 1 << (start_order - pageblock_order); |
| |
| while (nr_pageblocks--) { |
| set_pageblock_migratetype(pageblock_page, migratetype); |
| pageblock_page += pageblock_nr_pages; |
| } |
| } |
| |
| /* |
| * When we are falling back to another migratetype during allocation, try to |
| * steal extra free pages from the same pageblocks to satisfy further |
| * allocations, instead of polluting multiple pageblocks. |
| * |
| * If we are stealing a relatively large buddy page, it is likely there will |
| * be more free pages in the pageblock, so try to steal them all. For |
| * reclaimable and unmovable allocations, we steal regardless of page size, |
| * as fragmentation caused by those allocations polluting movable pageblocks |
| * is worse than movable allocations stealing from unmovable and reclaimable |
| * pageblocks. |
| */ |
| static bool can_steal_fallback(unsigned int order, int start_mt) |
| { |
| /* |
| * Leaving this order check in is intentional, although there is a more |
| * relaxed order check in the next test. The reason is that we can |
| * actually steal the whole pageblock if this condition is met, while |
| * the check below doesn't guarantee that; it is just a heuristic and |
| * could be changed at any time. |
| */ |
| if (order >= pageblock_order) |
| return true; |
| |
| if (order >= pageblock_order / 2 || |
| start_mt == MIGRATE_RECLAIMABLE || |
| start_mt == MIGRATE_UNMOVABLE || |
| page_group_by_mobility_disabled) |
| return true; |
| |
| return false; |
| } |
| |
| static inline bool boost_watermark(struct zone *zone) |
| { |
| unsigned long max_boost; |
| |
| if (!watermark_boost_factor) |
| return false; |
| /* |
| * Don't bother in zones that are unlikely to produce results. |
| * On small machines, including kdump capture kernels running |
| * in a small area, boosting the watermark can cause an out of |
| * memory situation immediately. |
| */ |
| if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
| return false; |
| |
| max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
| watermark_boost_factor, 10000); |
| |
| /* |
| * The high watermark may be uninitialised if fragmentation occurs |
| * very early in boot, so do not boost. We do not fall through and |
| * boost by pageblock_nr_pages because failing allocations that |
| * early means reclaim is not going to help, and it may even be |
| * impossible to reclaim up to the boosted watermark, resulting in |
| * a hang. |
| */ |
| if (!max_boost) |
| return false; |
| |
| max_boost = max(pageblock_nr_pages, max_boost); |
| |
| zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
| max_boost); |
| |
| return true; |
| } |
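| |
| /* |
| * Worked example of the boost above (a sketch; the numbers are |
| * illustrative): with watermark_boost_factor at its usual default of |
| * 15000 and a zone whose high watermark is 12800 pages, max_boost = |
| * 12800 * 15000 / 10000 = 19200 pages. Each call then raises |
| * zone->watermark_boost by pageblock_nr_pages (512 pages with 2MB |
| * pageblocks on x86-64), clamped to max_boost. |
| */ |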
| |
| /* |
| * This function implements actual steal behaviour. If order is large enough, we |
| * can claim the whole pageblock for the requested migratetype. If not, we check |
| * the pageblock for constituent pages; if at least half of the pages are free |
| * or compatible, we can still claim the whole block, so pages freed in the |
| * future will be put on the correct free list. Otherwise, we isolate exactly |
| * the order we need from the fallback block and leave its migratetype alone. |
| */ |
| static struct page * |
| steal_suitable_fallback(struct zone *zone, struct page *page, |
| int current_order, int order, int start_type, |
| unsigned int alloc_flags, bool whole_block) |
| { |
| int free_pages, movable_pages, alike_pages; |
| unsigned long start_pfn; |
| int block_type; |
| |
| block_type = get_pageblock_migratetype(page); |
| |
| /* |
| * This can happen due to races and we want to prevent broken |
| * highatomic accounting. |
| */ |
| if (is_migrate_highatomic(block_type)) |
| goto single_page; |
| |
| /* Take ownership for orders >= pageblock_order */ |
| if (current_order >= pageblock_order) { |
| del_page_from_free_list(page, zone, current_order, block_type); |
| change_pageblock_range(page, current_order, start_type); |
| expand(zone, page, order, current_order, start_type); |
| return page; |
| } |
| |
| /* |
| * Boost watermarks to increase reclaim pressure to reduce the |
| * likelihood of future fallbacks. Wake kswapd now as the node |
| * may be balanced overall and kswapd will not wake naturally. |
| */ |
| if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
| set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
| |
| /* We are not allowed to try stealing from the whole block */ |
| if (!whole_block) |
| goto single_page; |
| |
| /* Moving the whole block can fail due to zone boundary conditions */ |
| if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, |
| &movable_pages)) |
| goto single_page; |
| |
| /* |
| * Determine how many pages are compatible with our allocation. |
| * For movable allocation, it's the number of movable pages which |
| * we just obtained. For other types it's a bit more tricky. |
| */ |
| if (start_type == MIGRATE_MOVABLE) { |
| alike_pages = movable_pages; |
| } else { |
| /* |
| * If we are falling back a RECLAIMABLE or UNMOVABLE allocation |
| * to MOVABLE pageblock, consider all non-movable pages as |
| * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or |
| * vice versa, be conservative since we can't distinguish the |
| * exact migratetype of non-movable pages. |
| */ |
| if (block_type == MIGRATE_MOVABLE) |
| alike_pages = pageblock_nr_pages |
| - (free_pages + movable_pages); |
| else |
| alike_pages = 0; |
| } |
| /* |
| * If a sufficient number of pages in the block are either free or have |
| * a migratetype compatible with our allocation, claim the whole block. |
| */ |
| if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || |
| page_group_by_mobility_disabled) { |
| __move_freepages_block(zone, start_pfn, block_type, start_type); |
| return __rmqueue_smallest(zone, order, start_type); |
| } |
| |
| single_page: |
| del_page_from_free_list(page, zone, current_order, block_type); |
| expand(zone, page, order, current_order, block_type); |
| return page; |
| } |
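| |
| /* |
| * Worked example of the claim threshold above (illustrative, assuming |
| * pageblock_order == 9, i.e. pageblock_nr_pages == 512): the whole block |
| * is claimed when free_pages + alike_pages >= 1 << (9 - 1) = 256, i.e. |
| * when at least half of the pageblock is free or allocated to compatible |
| * migratetypes. Otherwise only the requested order is taken and the |
| * block keeps its migratetype. |
| */ |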
| |
| /* |
| * Check whether there is a suitable fallback freepage with requested order. |
| * If only_stealable is true, this function returns fallback_mt only if |
| * we can steal the pageblock's other free pages along with it, which |
| * helps reduce fragmentation from mixing migratetypes in one pageblock. |
| */ |
| int find_suitable_fallback(struct free_area *area, unsigned int order, |
| int migratetype, bool only_stealable, bool *can_steal) |
| { |
| int i; |
| int fallback_mt; |
| |
| if (area->nr_free == 0) |
| return -1; |
| |
| *can_steal = false; |
| for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { |
| fallback_mt = fallbacks[migratetype][i]; |
| if (free_area_empty(area, fallback_mt)) |
| continue; |
| |
| if (can_steal_fallback(order, migratetype)) |
| *can_steal = true; |
| |
| if (!only_stealable) |
| return fallback_mt; |
| |
| if (*can_steal) |
| return fallback_mt; |
| } |
| |
| return -1; |
| } |
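| |
| /* |
| * Example of the contract above (illustrative): with migratetype == |
| * MIGRATE_UNMOVABLE the loop consults the fallbacks[] table defined |
| * earlier in this file, typically trying MIGRATE_RECLAIMABLE before |
| * MIGRATE_MOVABLE, and returns the first fallback migratetype whose |
| * free list at this order is non-empty, or -1 if none is found. |
| * *can_steal reports whether can_steal_fallback() allowed whole-block |
| * stealing for this order and migratetype. |
| */ |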
| |
| /* |
| * Reserve a pageblock for exclusive use of high-order atomic allocations if |
| * there are no empty page blocks that contain a page with a suitable order |
| */ |
| static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) |
| { |
| int mt; |
| unsigned long max_managed, flags; |
| |
| /* |
| * Reserve at minimum one pageblock and at most roughly 1% of the |
| * zone. If 1% of the zone falls below one pageblock, don't reserve |
| * any pageblocks at all. The check is race-prone but harmless. |
| */ |
| if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) |
| return; |
| max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); |
| if (zone->nr_reserved_highatomic >= max_managed) |
| return; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| /* Recheck the nr_reserved_highatomic limit under the lock */ |
| if (zone->nr_reserved_highatomic >= max_managed) |
| goto out_unlock; |
| |
| /* Yoink! */ |
| mt = get_pageblock_migratetype(page); |
| /* Only reserve normal pageblocks (i.e., they can merge with others) */ |
| if (migratetype_is_mergeable(mt)) |
| if (move_freepages_block(zone, page, mt, |
| MIGRATE_HIGHATOMIC) != -1) |
| zone->nr_reserved_highatomic += pageblock_nr_pages; |
| |
| out_unlock: |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
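| |
| /* |
| * Worked example of the limit above (illustrative numbers): for a zone |
| * managing 4GB, i.e. 1048576 4K pages, 1% is 10485 pages; with 512-page |
| * pageblocks this is aligned up to 10752 pages, so at most 21 pageblocks |
| * end up reserved as MIGRATE_HIGHATOMIC. |
| */ |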
| |
| /* |
| * Used when an allocation is about to fail under memory pressure. This |
| * potentially hurts the reliability of high-order allocations when under |
| * intense memory pressure but failed atomic allocations should be easier |
| * to recover from than an OOM. |
| * |
| * If @force is true, try to unreserve pageblocks even if that would |
| * exhaust the highatomic reserve entirely. |
| */ |
| static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, |
| bool force) |
| { |
| struct zonelist *zonelist = ac->zonelist; |
| unsigned long flags; |
| struct zoneref *z; |
| struct zone *zone; |
| struct page *page; |
| int order; |
| int ret; |
| |
| for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
| ac->nodemask) { |
| /* |
| * Preserve at least one pageblock unless memory pressure |
| * is really high. |
| */ |
| if (!force && zone->nr_reserved_highatomic <= |
| pageblock_nr_pages) |
| continue; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| for (order = 0; order < NR_PAGE_ORDERS; order++) { |
| struct free_area *area = &(zone->free_area[order]); |
| int mt; |
| |
| page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); |
| if (!page) |
| continue; |
| |
| mt = get_pageblock_migratetype(page); |
| /* |
| * In the page freeing path, migratetype changes are racy, so |
| * we may encounter several free pages of one pageblock in this |
| * loop even after we have changed the pageblock type from |
| * highatomic to ac->migratetype. Therefore, adjust the count |
| * only once per pageblock. |
| */ |
| if (is_migrate_highatomic(mt)) { |
| /* |
| * It should never happen but changes to |
| * locking could inadvertently allow a per-cpu |
| * drain to add pages to MIGRATE_HIGHATOMIC |
| * while unreserving so be safe and watch for |
| * underflows. |
| */ |
| zone->nr_reserved_highatomic -= min( |
| pageblock_nr_pages, |
| zone->nr_reserved_highatomic); |
| } |
| |
| /* |
| * Convert to ac->migratetype and avoid the normal |
| * pageblock stealing heuristics. Minimally, the caller |
| * is doing the work and needs the pages. More |
| * importantly, if the block was always converted to |
| * MIGRATE_UNMOVABLE or another type then the number |
| * of pageblocks that cannot be completely freed |
| * may increase. |
| */ |
| ret = move_freepages_block(zone, page, mt, |
| ac->migratetype); |
| /* |
| * Reserving this block already succeeded, so this should |
| * not fail on zone boundaries. |
| */ |
| WARN_ON_ONCE(ret == -1); |
| if (ret > 0) { |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return ret; |
| } |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Try finding a free buddy page on the fallback list and put it on the free |
| * list of requested migratetype, possibly along with other pages from the same |
| * block, depending on fragmentation avoidance heuristics. Returns the |
| * allocated page, or NULL if no suitable fallback could be found. |
| * |
| * The use of signed ints for order and current_order is a deliberate |
| * deviation from the rest of this file, to make the for loop |
| * condition simpler. |
| */ |
| static __always_inline struct page * |
| __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
| unsigned int alloc_flags) |
| { |
| struct free_area *area; |
| int current_order; |
| int min_order = order; |
| struct page *page; |
| int fallback_mt; |
| bool can_steal; |
| |
| /* |
| * Do not steal pages from freelists belonging to other pageblocks |
| * i.e. orders < pageblock_order. If there are no local zones free, |
| * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
| */ |
| if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) |
| min_order = pageblock_order; |
| |
| /* |
| * Find the largest available free page in the other list. This roughly |
| * approximates finding the pageblock with the most free pages, which |
| * would be too costly to do exactly. |
| */ |
| for (current_order = MAX_PAGE_ORDER; current_order >= min_order; |
| --current_order) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt == -1) |
| continue; |
| |
| /* |
| * If we cannot steal all free pages from the pageblock and the |
| * requested migratetype is movable, it's better to steal and |
| * split the smallest available page instead of the largest one: |
| * even if the next movable allocation then falls back into a |
| * different pageblock, it won't cause permanent fragmentation. |
| */ |
| if (!can_steal && start_migratetype == MIGRATE_MOVABLE |
| && current_order > order) |
| goto find_smallest; |
| |
| goto do_steal; |
| } |
| |
| return NULL; |
| |
| find_smallest: |
| for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt != -1) |
| break; |
| } |
| |
| /* |
| * This should not happen - we already found a suitable fallback |
| * when looking for the largest page. |
| */ |
| VM_BUG_ON(current_order > MAX_PAGE_ORDER); |
| |
| do_steal: |
| page = get_page_from_free_area(area, fallback_mt); |
| |
| /* take off list, maybe claim block, expand remainder */ |
| page = steal_suitable_fallback(zone, page, current_order, order, |
| start_migratetype, alloc_flags, can_steal); |
| |
| trace_mm_page_alloc_extfrag(page, order, current_order, |
| start_migratetype, fallback_mt); |
| |
| return page; |
| } |
| |
| /* |
| * Do the hard work of removing an element from the buddy allocator. |
| * Call me with the zone->lock already held. |
| */ |
| static __always_inline struct page * |
| __rmqueue(struct zone *zone, unsigned int order, int migratetype, |
| unsigned int alloc_flags) |
| { |
| struct page *page; |
| |
| if (IS_ENABLED(CONFIG_CMA)) { |
| /* |
| * Balance movable allocations between regular and CMA areas by |
| * allocating from CMA when over half of the zone's free memory |
| * is in the CMA area. |
| */ |
| if (alloc_flags & ALLOC_CMA && |
| zone_page_state(zone, NR_FREE_CMA_PAGES) > |
| zone_page_state(zone, NR_FREE_PAGES) / 2) { |
| page = __rmqueue_cma_fallback(zone, order); |
| if (page) |
| return page; |
| } |
| } |
| |
| page = __rmqueue_smallest(zone, order, migratetype); |
| if (unlikely(!page)) { |
| if (alloc_flags & ALLOC_CMA) |
| page = __rmqueue_cma_fallback(zone, order); |
| |
| if (!page) |
| page = __rmqueue_fallback(zone, order, migratetype, |
| alloc_flags); |
| } |
| return page; |
| } |
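| |
| /* |
| * Worked example of the CMA balancing above (illustrative): if a zone |
| * has 100000 free pages of which 60000 are free CMA pages, then |
| * 60000 > 100000 / 2, so ALLOC_CMA movable allocations are satisfied |
| * from the CMA area first; once free CMA drops to half or less of the |
| * zone's free memory, the regular free lists are preferred again. |
| */ |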
| |
| /* |
| * Obtain a specified number of elements from the buddy allocator, all under |
| * a single hold of the lock, for efficiency. Add them to the supplied list. |
| * Returns the number of new pages which were placed at *list. |
| */ |
| static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| unsigned long count, struct list_head *list, |
| int migratetype, unsigned int alloc_flags) |
| { |
| unsigned long flags; |
| int i; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| for (i = 0; i < count; ++i) { |
| struct page *page = __rmqueue(zone, order, migratetype, |
| alloc_flags); |
| if (unlikely(page == NULL)) |
| break; |
| |
| /* |
| * Split buddy pages returned by expand() are received here in |
| * physical page order. Each page is added to the tail of the |
| * caller's list, so from the caller's perspective the linked |
| * list is ordered by page number under some conditions. This is |
| * useful for IO devices that can merge IO requests when the |
| * physical pages are ordered properly. |
| */ |
| list_add_tail(&page->pcp_list, list); |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| |
| return i; |
| } |
| |
| /* |
| * Called from the vmstat counter updater to decay the PCP high. |
| * Return whether there is additional work to do. |
| */ |
| int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) |
| { |
| int high_min, to_drain, batch; |
| int todo = 0; |
| |
| high_min = READ_ONCE(pcp->high_min); |
| batch = READ_ONCE(pcp->batch); |
| /* |
| * Decrease pcp->high periodically to try to free possible |
| * idle PCP pages. To control latency, avoid freeing too many |
| * pages at once; this also caps the pcp->high decrement. |
| */ |
| if (pcp->high > high_min) { |
| pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), |
| pcp->high - (pcp->high >> 3), high_min); |
| if (pcp->high > high_min) |
| todo++; |
| } |
| |
| to_drain = pcp->count - pcp->high; |
| if (to_drain > 0) { |
| spin_lock(&pcp->lock); |
| free_pcppages_bulk(zone, to_drain, pcp, 0); |
| spin_unlock(&pcp->lock); |
| todo++; |
| } |
| |
| return todo; |
| } |
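| |
| /* |
| * Worked example of the decay above (illustrative, assuming |
| * CONFIG_PCP_BATCH_SCALE_MAX == 5 and batch == 64): with pcp->high == |
| * 2048, pcp->count == 1000 and high_min == 128, the new pcp->high is |
| * max3(1000 - 2048, 2048 - 256, 128) = 1792, and 1000 - 1792 <= 0 so |
| * nothing is drained this round; the decay continues on later calls. |
| */ |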
| |
| #ifdef CONFIG_NUMA |
| /* |
| * Called from the vmstat counter updater to drain pagesets of this |
| * currently executing processor on remote nodes after they have |
| * expired. |
| */ |
| void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
| { |
| int to_drain, batch; |
| |
| batch = READ_ONCE(pcp->batch); |
| to_drain = min(pcp->count, batch); |
| if (to_drain > 0) { |
| spin_lock(&pcp->lock); |
| free_pcppages_bulk(zone, to_drain, pcp, 0); |
| spin_unlock(&pcp->lock); |
| } |
| } |
| #endif |
| |
| /* |
| * Drain pcplists of the indicated processor and zone. |
| */ |
| static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
| { |
| struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| int count = READ_ONCE(pcp->count); |
| |
| while (count) { |
| int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); |
| count -= to_drain; |
| |
| spin_lock(&pcp->lock); |
| free_pcppages_bulk(zone, to_drain, pcp, 0); |
| spin_unlock(&pcp->lock); |
| } |
| } |
| |
| /* |
| * Drain pcplists of all zones on the indicated processor. |
| */ |
| static void drain_pages(unsigned int cpu) |
| { |
| struct zone *zone; |
| |
| for_each_populated_zone(zone) { |
| drain_pages_zone(cpu, zone); |
| } |
| } |
| |
| /* |
| * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
| */ |
| void drain_local_pages(struct zone *zone) |
| { |
| int cpu = smp_processor_id(); |
| |
| if (zone) |
| drain_pages_zone(cpu, zone); |
| else |
| drain_pages(cpu); |
| } |
| |
| /* |
| * The implementation of drain_all_pages(), exposing an extra parameter to |
| * drain on all cpus. |
| * |
| * drain_all_pages() is optimized to only execute on cpus where pcplists are |
| * not empty. The check for non-emptiness can however race with a free to |
| * pcplist that has not yet increased the pcp->count from 0 to 1. Callers |
| * that need the guarantee that every CPU has drained can disable the |
| * optimizing racy check. |
| */ |
| static void __drain_all_pages(struct zone *zone, bool force_all_cpus) |
| { |
| int cpu; |
| |
| /* |
| * Allocate in the BSS so we won't require allocation in |
| * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
| */ |
| static cpumask_t cpus_with_pcps; |
| |
| /* |
| * Do not drain if one is already in progress unless it's specific to |
| * a zone. Such callers are primarily CMA and memory hotplug and need |
| * the drain to be complete when the call returns. |
| */ |
| if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { |
| if (!zone) |
| return; |
| mutex_lock(&pcpu_drain_mutex); |
| } |
| |
| /* |
| * We don't care about racing with CPU hotplug events, |
| * as the offline notification will cause the notified |
| * CPU to drain its own pcps, and on_each_cpu_mask() |
| * disables preemption as part of its processing. |
| */ |
| for_each_online_cpu(cpu) { |
| struct per_cpu_pages *pcp; |
| struct zone *z; |
| bool has_pcps = false; |
| |
| if (force_all_cpus) { |
| /* |
| * The pcp.count check is racy, some callers need a |
| * guarantee that no cpu is missed. |
| */ |
| has_pcps = true; |
| } else if (zone) { |
| pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| if (pcp->count) |
| has_pcps = true; |
| } else { |
| for_each_populated_zone(z) { |
| pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); |
| if (pcp->count) { |
| has_pcps = true; |
| break; |
| } |
| } |
| } |
| |
| if (has_pcps) |
| cpumask_set_cpu(cpu, &cpus_with_pcps); |
| else |
| cpumask_clear_cpu(cpu, &cpus_with_pcps); |
| } |
| |
| for_each_cpu(cpu, &cpus_with_pcps) { |
| if (zone) |
| drain_pages_zone(cpu, zone); |
| else |
| drain_pages(cpu); |
| } |
| |
| mutex_unlock(&pcpu_drain_mutex); |
| } |
| |
| /* |
| * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
| * |
| * When zone parameter is non-NULL, spill just the single zone's pages. |
| */ |
| void drain_all_pages(struct zone *zone) |
| { |
| __drain_all_pages(zone, false); |
| } |
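| |
| /* |
| * Usage sketch (illustrative only): callers such as memory hotplug or |
| * CMA typically drain either every zone or a single zone: |
| */ |
| #if 0 /* example only, never compiled */ |
| drain_all_pages(NULL); /* spill pcplists of all zones on all CPUs */ |
| drain_all_pages(zone); /* spill only this zone's pcplists */ |
| #endif |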
| |
| static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) |
| { |
| int min_nr_free, max_nr_free; |
| |
| /* Free as much as possible if batch freeing high-order pages. */ |
| if (unlikely(free_high)) |
| return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); |
| |
| /* Check for PCP disabled or boot pageset */ |
| if (unlikely(high < batch)) |
| return 1; |
| |
| /* Leave at least pcp->batch pages on the list */ |
| min_nr_free = batch; |
| max_nr_free = high - batch; |
| |
| /* |
| * Increase the batch size to the number of consecutively |
| * freed pages to reduce zone lock contention. |
| */ |
| batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free); |
| |
| return batch; |
| } |
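| |
| /* |
| * Worked example of the clamping above (illustrative): with batch == 64, |
| * high == 512 and free_high == false, the freeing batch is |
| * clamp(pcp->free_count, 64, 448); a CPU that has recently freed many |
| * pages therefore returns larger batches to the zone per lock |
| * acquisition, while still leaving at least one batch on the pcplist. |
| */ |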
| |
| static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, |
| int batch, bool free_high) |
| { |
| int high, high_min, high_max; |
| |
| high_min = READ_ONCE(pcp->high_min); |
| high_max = READ_ONCE(pcp->high_max); |
| high = pcp->high = clamp(pcp->high, high_min, high_max); |
| |
| if (unlikely(!high)) |
| return 0; |
| |
| if (unlikely(free_high)) { |
| pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), |
| high_min); |
| return 0; |
| } |
| |
| /* |
| * If reclaim is active, limit the number of pages that can be |
| * stored on pcp lists |
| */ |
| if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { |
| int free_count = max_t(int, pcp->free_count, batch); |
| |
| pcp->high = max(high - free_count, high_min); |
| return min(batch << 2, pcp->high); |
| } |
| |
| if (high_min == high_max) |
| return high; |
| |
| if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { |
| int free_count = max_t(int, pcp->free_count, batch); |
| |
| pcp->high = max(high - free_count, high_min); |
| high = max(pcp->count, high_min); |
| } else if (pcp->count >= high) { |
| int need_high = pcp->free_count + batch; |
| |
| /* pcp->high should be large enough to hold batch freed pages */ |
| if (pcp->high < need_high) |
| pcp->high = clamp(need_high, high_min, high_max); |
| } |
| |
| return high; |
| } |
| |
| static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, |
| struct page *page, int migratetype, |
| unsigned int order) |
| { |
| int high, batch; |
| int pindex; |
| bool free_high = false; |
| |
| /* |
| * On freeing, reduce the number of pages that are batch allocated. |
| * See nr_pcp_alloc() where alloc_factor is increased for subsequent |
| * allocations. |
| */ |
| pcp->alloc_factor >>= 1; |
| __count_vm_events(PGFREE, 1 << order); |
| pindex = order_to_pindex(migratetype, order); |
| list_add(&page->pcp_list, &pcp->lists[pindex]); |
| pcp->count += 1 << order; |
| |
| batch = READ_ONCE(pcp->batch); |
| /* |
| * As high-order pages other than THPs stored on the PCP can contribute |
| * to fragmentation, limit the number stored when the PCP is heavily |
| * freeing without allocation. The remainder after bulk freeing |
| * stops will be drained from vmstat refresh context. |
| */ |
| if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { |
| free_high = (pcp->free_count >= batch && |
| (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && |
| (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || |
| pcp->count >= READ_ONCE(batch))); |
| pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; |
| } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { |
| pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; |
| } |
| if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) |
| pcp->free_count += (1 << order); |
| high = nr_pcp_high(pcp, zone, batch, free_high); |
| if (pcp->count >= high) { |
| free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), |
| pcp, pindex); |
| if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && |
| zone_watermark_ok(zone, 0, high_wmark_pages(zone), |
| ZONE_MOVABLE, 0)) |
| clear_bit(ZONE_BELOW_HIGH, &zone->flags); |
| } |
| } |
| |
| /* |
| * Free a pcp page |
| */ |
| void free_unref_page(struct page *page, unsigned int order) |
| { |
| unsigned long __maybe_unused UP_flags; |
| struct per_cpu_pages *pcp; |
| struct zone *zone; |
| unsigned long pfn = page_to_pfn(page); |
| int migratetype; |
| |
| if (!pcp_allowed_order(order)) { |
| __free_pages_ok(page, order, FPI_NONE); |
| return; |
| } |
| |
| if (!free_pages_prepare(page, order)) |
| return; |
| |
| /* |
| * We only track unmovable, reclaimable and movable on pcp lists. |
| * Place ISOLATE pages on the isolated list because they are being |
| * offlined but treat HIGHATOMIC and CMA as movable pages so we can |
| * get those areas back if necessary. Otherwise, we may have to free |
| * excessively into the page allocator |
| */ |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { |
| if (unlikely(is_migrate_isolate(migratetype))) { |
| free_one_page(page_zone(page), page, pfn, order, FPI_NONE); |
| return; |
| } |
| migratetype = MIGRATE_MOVABLE; |
| } |
| |
| zone = page_zone(page); |
| pcp_trylock_prepare(UP_flags); |
| pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| if (pcp) { |
| free_unref_page_commit(zone, pcp, page, migratetype, order); |
| pcp_spin_unlock(pcp); |
| } else { |
| free_one_page(zone, page, pfn, order, FPI_NONE); |
| } |
| pcp_trylock_finish(UP_flags); |
| } |
| |
| /* |
| * Free a batch of folios |
| */ |
| void free_unref_folios(struct folio_batch *folios) |
| { |
| unsigned long __maybe_unused UP_flags; |
| struct per_cpu_pages *pcp = NULL; |
| struct zone *locked_zone = NULL; |
| int i, j; |
| |
| /* Prepare folios for freeing */ |
| for (i = 0, j = 0; i < folios->nr; i++) { |
| struct folio *folio = folios->folios[i]; |
| unsigned long pfn = folio_pfn(folio); |
| unsigned int order = folio_order(folio); |
| |
| if (order > 0 && folio_test_large_rmappable(folio)) |
| folio_undo_large_rmappable(folio); |
| if (!free_pages_prepare(&folio->page, order)) |
| continue; |
| /* |
| * Free orders not handled on the PCP directly to the |
| * allocator. |
| */ |
| if (!pcp_allowed_order(order)) { |
| free_one_page(folio_zone(folio), &folio->page, |
| pfn, order, FPI_NONE); |
| continue; |
| } |
| folio->private = (void *)(unsigned long)order; |
| if (j != i) |
| folios->folios[j] = folio; |
| j++; |
| } |
| folios->nr = j; |
| |
| for (i = 0; i < folios->nr; i++) { |
| struct folio *folio = folios->folios[i]; |
| struct zone *zone = folio_zone(folio); |
| unsigned long pfn = folio_pfn(folio); |
| unsigned int order = (unsigned long)folio->private; |
| int migratetype; |
| |
| folio->private = NULL; |
| migratetype = get_pfnblock_migratetype(&folio->page, pfn); |
| |
| /* Different zone requires a different pcp lock */ |
| if (zone != locked_zone || |
| is_migrate_isolate(migratetype)) { |
| if (pcp) { |
| pcp_spin_unlock(pcp); |
| pcp_trylock_finish(UP_flags); |
| locked_zone = NULL; |
| pcp = NULL; |
| } |
| |
| /* |
| * Free isolated pages directly to the |
| * allocator, see comment in free_unref_page. |
| */ |
| if (is_migrate_isolate(migratetype)) { |
| free_one_page(zone, &folio->page, pfn, |
| order, FPI_NONE); |
| continue; |
| } |
| |
| /* |
| * trylock is necessary as folios may be getting freed |
| * from IRQ or SoftIRQ context after an IO completion. |
| */ |
| pcp_trylock_prepare(UP_flags); |
| pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| if (unlikely(!pcp)) { |
| pcp_trylock_finish(UP_flags); |
| free_one_page(zone, &folio->page, pfn, |
| order, FPI_NONE); |
| continue; |
| } |
| locked_zone = zone; |
| } |
| |
| /* |
| * Non-isolated types over MIGRATE_PCPTYPES get added |
| * to the MIGRATE_MOVABLE pcp list. |
| */ |
| if (unlikely(migratetype >= MIGRATE_PCPTYPES)) |
| migratetype = MIGRATE_MOVABLE; |
| |
| trace_mm_page_free_batched(&folio->page); |
| free_unref_page_commit(zone, pcp, &folio->page, migratetype, |
| order); |
| } |
| |
| if (pcp) { |
| pcp_spin_unlock(pcp); |
| pcp_trylock_finish(UP_flags); |
| } |
| folio_batch_reinit(folios); |
| } |
| |
| /* |
| * split_page takes a non-compound higher-order page, and splits it into |
| * n (1 << order) sub-pages: page[0..n-1]. |
| * Each sub-page must be freed individually. |
| * |
| * Note: this is probably too low level an operation for use in drivers. |
| * Please consult with lkml before using this in your driver. |
| */ |
| void split_page(struct page *page, unsigned int order) |
| { |
| int i; |
| |
| VM_BUG_ON_PAGE(PageCompound(page), page); |
| VM_BUG_ON_PAGE(!page_count(page), page); |
| |
| for (i = 1; i < (1 << order); i++) |
| set_page_refcounted(page + i); |
| split_page_owner(page, order, 0); |
| pgalloc_tag_split(page, 1 << order); |
| split_page_memcg(page, order, 0); |
| } |
| EXPORT_SYMBOL_GPL(split_page); |
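| |
| /* |
| * Usage sketch (illustrative only; most users should prefer plain |
| * order-0 allocations): a driver that needs physically contiguous |
| * memory but wants to free it page by page could do: |
| */ |
| #if 0 /* example only, never compiled */ |
| struct page *page = alloc_pages(GFP_KERNEL, 2); /* 4 contiguous pages */ |
| |
| if (page) { |
| split_page(page, 2); /* now 4 independent order-0 pages */ |
| __free_page(page + 3); /* each sub-page can be freed on its own */ |
| } |
| #endif |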
| |
| int __isolate_free_page(struct page *page, unsigned int order) |
| { |
| struct zone *zone = page_zone(page); |
| int mt = get_pageblock_migratetype(page); |
| |
| if (!is_migrate_isolate(mt)) { |
| unsigned long watermark; |
| /* |
| * Obey watermarks as if the page was being allocated. We can |
| * emulate a high-order watermark check with a raised order-0 |
| * watermark, because we already know our high-order page |
| * exists. |
| */ |
| watermark = zone->_watermark[WMARK_MIN] + (1UL << order); |
| if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) |
| return 0; |
| } |
| |
| del_page_from_free_list(page, zone, order, mt); |
| |
| /* |
| * Set the pageblock's migratetype if the isolated page covers at |
| * least half of a pageblock. |
| */ |
| if (order >= pageblock_order - 1) { |
| struct page *endpage = page + (1 << order) - 1; |
| for (; page < endpage; page += pageblock_nr_pages) { |
| int mt = get_pageblock_migratetype(page); |
| /* |
| * Only change normal pageblocks (i.e., they can merge |
| * with others) |
| */ |
| if (migratetype_is_mergeable(mt)) |
| move_freepages_block(zone, page, mt, |
| MIGRATE_MOVABLE); |
| } |
| } |
| |
| return 1UL << order; |
| } |
| |
| /** |
| * __putback_isolated_page - Return a now-isolated page back where we got it |
| * @page: Page that was isolated |
| * @order: Order of the isolated page |
| * @mt: The page's pageblock's migratetype |
| * |
| * This function is meant to return a page pulled from the free lists via |
| * __isolate_free_page back to the free lists they were pulled from. |
| */ |
| void __putback_isolated_page(struct page *page, unsigned int order, int mt) |
| { |
| struct zone *zone = page_zone(page); |
| |
| /* zone lock should be held when this function is called */ |
| lockdep_assert_held(&zone->lock); |
| |
| /* Return isolated page to tail of freelist. */ |
| __free_one_page(page, page_to_pfn(page), zone, order, mt, |
| FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); |
| } |
| |
| /* |
| * Update NUMA hit/miss statistics |
| */ |
| static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, |
| long nr_account) |
| { |
| #ifdef CONFIG_NUMA |
| enum numa_stat_item local_stat = NUMA_LOCAL; |
| |
| /* Skip NUMA counter updates if NUMA stats are disabled */ |
| if (!static_branch_likely(&vm_numa_stat_key)) |
| return; |
| |
| if (zone_to_nid(z) != numa_node_id()) |
| local_stat = NUMA_OTHER; |
| |
| if (zone_to_nid(z) == zone_to_nid(preferred_zone)) |
| __count_numa_events(z, NUMA_HIT, nr_account); |
| else { |
| __count_numa_events(z, NUMA_MISS, nr_account); |
| __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); |
| } |
| __count_numa_events(z, local_stat, nr_account); |
| #endif |
| } |
| |
| static __always_inline |
| struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, |
| unsigned int order, unsigned int alloc_flags, |
| int migratetype) |
| { |
| struct page *page; |
| unsigned long flags; |
| |
| do { |
| page = NULL; |
| spin_lock_irqsave(&zone->lock, flags); |
| if (alloc_flags & ALLOC_HIGHATOMIC) |
| page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); |
| if (!page) { |
| page = __rmqueue(zone, order, migratetype, alloc_flags); |
| |
| /* |
| * If the allocation fails, allow OOM handling access |
| * to HIGHATOMIC reserves as failing now is worse than |
| * failing a high-order atomic allocation in the |
| * future. |
| */ |
| if (!page && (alloc_flags & ALLOC_OOM)) |
| page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); |
| |
| if (!page) { |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return NULL; |
| } |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } while (check_new_pages(page, order)); |
| |
| __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
| zone_statistics(preferred_zone, zone, 1); |
| |
| return page; |
| } |
| |
| static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) |
| { |
| int high, base_batch, batch, max_nr_alloc; |
| int high_max, high_min; |
| |
| base_batch = READ_ONCE(pcp->batch); |
| high_min = READ_ONCE(pcp->high_min); |
| high_max = READ_ONCE(pcp->high_max); |
| high = pcp->high = clamp(pcp->high, high_min, high_max); |
| |
| /* Check for PCP disabled or boot pageset */ |
| if (unlikely(high < base_batch)) |
| return 1; |
| |
| if (order) |
| batch = base_batch; |
| else |
| batch = (base_batch << pcp->alloc_factor); |
| |
| /* |
| * With a larger pcp->high, we could avoid allocating from the |
| * zone, so grow it opportunistically here. |
| */ |
| if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) |
| high = pcp->high = min(high + batch, high_max); |
| |
| if (!order) { |
| max_nr_alloc = max(high - pcp->count - base_batch, base_batch); |
| /* |
| * Double the number of pages allocated each time there is |
| * subsequent allocation of order-0 pages without any freeing. |
| */ |
| if (batch <= max_nr_alloc && |
| pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) |
| pcp->alloc_factor++; |
| batch = min(batch, max_nr_alloc); |
| } |
| |
| /* |
| * Scale batch relative to order if batch implies free pages |
| * can be stored on the PCP. Batch can be 1 for small zones or |
| * for boot pagesets which should never store free pages as |
| * the pages may belong to arbitrary zones. |
| */ |
| if (batch > 1) |
| batch = max(batch >> order, 2); |
| |
| return batch; |
| } |
| |
| /* Remove page from the per-cpu list, caller must protect the list */ |
| static inline |
| struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, |
| int migratetype, |
| unsigned int alloc_flags, |
| struct per_cpu_pages *pcp, |
| struct list_head *list) |
| { |
| struct page *page; |
| |
| do { |
| if (list_empty(list)) { |
| int batch = nr_pcp_alloc(pcp, zone, order); |
| int alloced; |
| |
| alloced = rmqueue_bulk(zone, order, |
| batch, list, |
| migratetype, alloc_flags); |
| |
| pcp->count += alloced << order; |
| if (unlikely(list_empty(list))) |
| return NULL; |
| } |
| |
| page = list_first_entry(list, struct page, pcp_list); |
| list_del(&page->pcp_list); |
| pcp->count -= 1 << order; |
| } while (check_new_pages(page, order)); |
| |
| return page; |
| } |
| |
| /* Lock and remove page from the per-cpu list */ |
| static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
| struct zone *zone, unsigned int order, |
| int migratetype, unsigned int alloc_flags) |
| { |
| struct per_cpu_pages *pcp; |
| struct list_head *list; |
| struct page *page; |
| unsigned long __maybe_unused UP_flags; |
| |
| /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ |
| pcp_trylock_prepare(UP_flags); |
| pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| if (!pcp) { |
| pcp_trylock_finish(UP_flags); |
| return NULL; |
| } |
| |
| /* |
| * On allocation, reduce the number of pages that are batch freed. |
| * See nr_pcp_free() where free_count is used to scale up the |
| * batch for subsequent frees. |
| */ |
| pcp->free_count >>= 1; |
| list = &pcp->lists[order_to_pindex(migratetype, order)]; |
| page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); |
| pcp_spin_unlock(pcp); |
| pcp_trylock_finish(UP_flags); |
| if (page) { |
| __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
| zone_statistics(preferred_zone, zone, 1); |
| } |
| return page; |
| } |
| |
| /* |
| * Allocate a page from the given zone. |
| * Use pcplists for THP or "cheap" high-order allocations. |
| */ |
| |
| /* |
| * Do not instrument rmqueue() with KMSAN. This function may call |
| * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). |
| * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it |
| * may call rmqueue() again, which will result in a deadlock. |
| */ |
| __no_sanitize_memory |
| static inline |
| struct page *rmqueue(struct zone *preferred_zone, |
| struct zone *zone, unsigned int order, |
| gfp_t gfp_flags, unsigned int alloc_flags, |
| int migratetype) |
| { |
| struct page *page; |
| |
| /* |
| * We most definitely don't want callers attempting to |
| * allocate greater than order-1 page units with __GFP_NOFAIL. |
| */ |
| WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); |
| |
| if (likely(pcp_allowed_order(order))) { |
| page = rmqueue_pcplist(preferred_zone, zone, order, |
| migratetype, alloc_flags); |
| if (likely(page)) |
| goto out; |
| } |
| |
| page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, |
| migratetype); |
| |
| out: |
| /* Separate test+clear to avoid unnecessary atomics */ |
| if ((alloc_flags & ALLOC_KSWAPD) && |
| unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { |
| clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
| wakeup_kswapd(zone, 0, 0, zone_idx(zone)); |
| } |
| |
| VM_BUG_ON_PAGE(page && bad_range(zone, page), page); |
| return page; |
| } |
| |
| noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
| { |
| return __should_fail_alloc_page(gfp_mask, order); |
| } |
| ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); |
| |
| static inline long __zone_watermark_unusable_free(struct zone *z, |
| unsigned int order, unsigned int alloc_flags) |
| { |
| long unusable_free = (1 << order) - 1; |
| |
| /* |
| * If the caller does not have rights to reserves below the min |
| * watermark then subtract the high-atomic reserves. This will |
| * over-estimate the size of the atomic reserve but it avoids a search. |
| */ |
| if (likely(!(alloc_flags & ALLOC_RESERVES))) |
| unusable_free += z->nr_reserved_highatomic; |
| |
| #ifdef CONFIG_CMA |
| /* If allocation can't use CMA areas don't use free CMA pages */ |
| if (!(alloc_flags & ALLOC_CMA)) |
| unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); |
| #endif |
| #ifdef CONFIG_UNACCEPTED_MEMORY |
| unusable_free += zone_page_state(z, NR_UNACCEPTED); |
| #endif |
| |
| return unusable_free; |
| } |
| |
| /* |
| * Return true if free base pages are above 'mark'. For high-order checks it |
| * will return true if the order-0 watermark is reached and there is at least |
| * one free page of a suitable size. Checking now avoids taking the zone lock |
| * to check in the allocation paths if no pages are free. |
| */ |
| bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
| int highest_zoneidx, unsigned int alloc_flags, |
| long free_pages) |
| { |
| long min = mark; |
| int o; |
| |
| /* free_pages may go negative - that's OK */ |
| free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); |
| |
| if (unlikely(alloc_flags & ALLOC_RESERVES)) { |
| /* |
| * __GFP_HIGH allows access to 50% of the min reserve as well |
| * as OOM. |
| */ |
| if (alloc_flags & ALLOC_MIN_RESERVE) { |
| min -= min / 2; |
| |
| /* |
| * Non-blocking allocations (e.g. GFP_ATOMIC) can |
| * access more reserves than just __GFP_HIGH. Other |
| * non-blocking allocation requests such as GFP_NOWAIT |
| * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get |
| * access to the min reserve. |
| */ |
| if (alloc_flags & ALLOC_NON_BLOCK) |
| min -= min / 4; |
| } |
| |
| /* |
| * OOM victims can try even harder than the normal reserve |
| * users on the grounds that it's definitely going to be in |
| * the exit path shortly and free memory. Any allocation it |
| * makes during the free path will be small and short-lived. |
| */ |
| if (alloc_flags & ALLOC_OOM) |
| min -= min / 2; |
| } |
| |
| /* |
| * Check watermarks for an order-0 allocation request. If these |
| * are not met, then a high-order request also cannot go ahead |
| * even if a suitable page happened to be free. |
| */ |
| if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) |
| return false; |
| |
| /* If this is an order-0 request then the watermark is fine */ |
| if (!order) |
| return true; |
| |
| /* For a high-order request, check at least one suitable page is free */ |
| for (o = order; o < NR_PAGE_ORDERS; o++) { |
| struct free_area *area = &z->free_area[o]; |
| int mt; |
| |
| if (!area->nr_free) |
| continue; |
| |
| for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { |
| if (!free_area_empty(area, mt)) |
| return true; |
| } |
| |
| #ifdef CONFIG_CMA |
| if ((alloc_flags & ALLOC_CMA) && |
| !free_area_empty(area, MIGRATE_CMA)) { |
| return true; |
| } |
| #endif |
| if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && |
| !free_area_empty(area, MIGRATE_HIGHATOMIC)) { |
| return true; |
| } |
| } |
| return false; |
| } |
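| |
| /* |
| * Worked example of the reserve handling above (illustrative): with |
| * mark == 1024 pages, an ALLOC_MIN_RESERVE request checks against |
| * 1024 - 512 = 512 pages; adding ALLOC_NON_BLOCK lowers that to |
| * 512 - 128 = 384 pages; an ALLOC_OOM victim would check against half |
| * of the remaining value again. The lowmem_reserve for the requested |
| * zone index is always added on top. |
| */ |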
| |
| bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
| int highest_zoneidx, unsigned int alloc_flags) |
| { |
| return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
| zone_page_state(z, NR_FREE_PAGES)); |
| } |
| |
| static inline bool zone_watermark_fast(struct zone *z, unsigned int order, |
| unsigned long mark, int highest_zoneidx, |
| unsigned int alloc_flags, gfp_t gfp_mask) |
| { |
| long free_pages; |
| |
| free_pages = zone_page_state(z, NR_FREE_PAGES); |
| |
| /* |
| * Fast check for order-0 only. If this fails then the reserves |
| * need to be calculated. |
| */ |
| if (!order) { |
| long usable_free; |
| long reserved; |
| |
| usable_free = free_pages; |
| reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); |
| |
| /* reserved may overestimate high-atomic reserves. */ |
| usable_free -= min(usable_free, reserved); |
| if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) |
| return true; |
| } |
| |
| if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
| free_pages)) |
| return true; |
| |
| /* |
| * Ignore watermark boosting for __GFP_HIGH order-0 allocations |
| * when checking the min watermark. The min watermark is the |
| * point where boosting is ignored so that kswapd is woken up |
| * when below the low watermark. |
| */ |
| if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost |
| && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { |
| mark = z->_watermark[WMARK_MIN]; |
| return __zone_watermark_ok(z, order, mark, highest_zoneidx, |
| alloc_flags, free_pages); |
| } |
| |
| return false; |
| } |
| |
| bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
| unsigned long mark, int highest_zoneidx) |
| { |
| long free_pages = zone_page_state(z, NR_FREE_PAGES); |
| |
| if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
| free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
| |
| return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, |
| free_pages); |
| } |
| |
| #ifdef CONFIG_NUMA |
| int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; |
| |
| static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
| { |
| return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= |
| node_reclaim_distance; |
| } |
| #else /* CONFIG_NUMA */ |
| static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
| { |
| return true; |
| } |
| #endif /* CONFIG_NUMA */ |
| |
| /* |
| * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid |
| * fragmentation is subtle. If the preferred zone was HIGHMEM then |
| * premature use of a lower zone may cause lowmem pressure problems that |
| * are worse than fragmentation. If the next zone is ZONE_DMA then it is |
| * probably too small. It only makes sense to spread allocations to avoid |
| * fragmentation between the Normal and DMA32 zones. |
| */ |
| static inline unsigned int |
| alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) |
| { |
| unsigned int alloc_flags; |
| |
| /* |
| * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
| * to save a branch. |
| */ |
| alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); |
| |
| #ifdef CONFIG_ZONE_DMA32 |
| if (!zone) |
| return alloc_flags; |
|