blob: 14d39f34d3367fbb9683683be7e6ed150f97fae2 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
* Manages the free list, the system allocates free pages here.
* Note that kmalloc() lives in slab.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
*/
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/pagevec.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
#include <linux/fault-inject.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
/* No special request */
#define FPI_NONE ((__force fpi_t)0)
/*
* Skip free page reporting notification for the (possibly merged) page.
* This does not hinder free page reporting from grabbing the page,
* reporting it and marking it "reported" - it only skips notifying
* the free page reporting infrastructure about a newly freed page. For
* example, used when temporarily pulling a page from a freelist and
* putting it back unmodified.
*/
#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
/*
* Place the (possibly merged) page to the tail of the freelist. Will ignore
* page shuffling (relevant code - e.g., memory onlining - is expected to
* shuffle the whole zone).
*
* Note: No code should rely on this flag for correctness - it's purely
* to allow for optimizations when handing back either fresh pages
* (memory onlining) or untouched pages (page isolation, free page
* reporting).
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
* On SMP, spin_trylock is sufficient protection.
* On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
*/
#define pcp_trylock_prepare(flags) do { } while (0)
#define pcp_trylock_finish(flag) do { } while (0)
#else
/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags) local_irq_save(flags)
#define pcp_trylock_finish(flags) local_irq_restore(flags)
#endif
/*
* Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
* a migration causing the wrong PCP to be locked and remote memory being
* potentially allocated, pin the task to the CPU for the lookup+lock.
* preempt_disable is used on !RT because it is faster than migrate_disable.
* migrate_disable is used on RT because otherwise RT spinlock usage is
* interfered with and a high priority task cannot preempt the allocator.
*/
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin() preempt_disable()
#define pcpu_task_unpin() preempt_enable()
#else
#define pcpu_task_pin() migrate_disable()
#define pcpu_task_unpin() migrate_enable()
#endif
/*
* Generic helper to lookup and a per-cpu variable with an embedded spinlock.
* Return value should be used with equivalent unlock helper.
*/
#define pcpu_spin_lock(type, member, ptr) \
({ \
type *_ret; \
pcpu_task_pin(); \
_ret = this_cpu_ptr(ptr); \
spin_lock(&_ret->member); \
_ret; \
})
#define pcpu_spin_trylock(type, member, ptr) \
({ \
type *_ret; \
pcpu_task_pin(); \
_ret = this_cpu_ptr(ptr); \
if (!spin_trylock(&_ret->member)) { \
pcpu_task_unpin(); \
_ret = NULL; \
} \
_ret; \
})
#define pcpu_spin_unlock(member, ptr) \
({ \
spin_unlock(&ptr->member); \
pcpu_task_unpin(); \
})
/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr) \
pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
#define pcp_spin_trylock(ptr) \
pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
#define pcp_spin_unlock(ptr) \
pcpu_spin_unlock(lock, ptr)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
* defined in <linux/topology.h>.
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
static DEFINE_MUTEX(pcpu_drain_mutex);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif
/*
* Array of node states.
*/
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
/*
* A cached value of the page's pageblock's migratetype, used when the page is
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
* Also the migratetype set in the page does not necessarily match the pcplist
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
* other index - this ensures that it will be put on the correct CMA freelist.
*/
static inline int get_pcppage_migratetype(struct page *page)
{
return page->index;
}
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
page->index = migratetype;
}
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256,
#endif
[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0,
#endif
[ZONE_MOVABLE] = 0,
};
char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
"DMA32",
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
"HighMem",
#endif
"Movable",
#ifdef CONFIG_ZONE_DEVICE
"Device",
#endif
};
const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
"HighAtomic",
#ifdef CONFIG_CMA
"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
"Isolate",
#endif
};
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
DEFINE_STATIC_KEY_TRUE(deferred_pages);
static inline bool deferred_pages_enabled(void)
{
return static_branch_unlikely(&deferred_pages);
}
/*
* deferred_grow_zone() is __init, but it is called from
* get_page_from_freelist() during early boot until deferred_pages permanently
* disables this call. This is why we have refdata wrapper to avoid warning,
* and to ensure that the function body gets unloaded.
*/
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
{
return false;
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
unsigned long get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn, unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
/*
* This races, without locks, with set_pfnblock_flags_mask(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
static __always_inline int get_pfnblock_migratetype(const struct page *page,
unsigned long pfn)
{
return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
mask <<= bitidx;
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
do {
} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pfnblock_flags_mask(page, (unsigned long)migratetype,
page_to_pfn(page), MIGRATETYPE_MASK);
}
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
ret = !zone_spans_pfn(zone, pfn);
} while (zone_span_seqretry(zone, seq));
if (ret)
pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
pfn, zone_to_nid(zone), zone->name,
start_pfn, start_pfn + sp);
return ret;
}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return true;
if (zone != page_zone(page))
return true;
return false;
}
#else
static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return false;
}
#endif
static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
goto out;
}
if (nr_unshown) {
pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page(page, reason);
print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
page_mapcount_reset(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static inline unsigned int order_to_pindex(int migratetype, int order)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
return NR_LOWORDER_PCP_LISTS;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
return (MIGRATE_PCPTYPES * order) + migratetype;
}
static inline int pindex_to_order(unsigned int pindex)
{
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pindex == NR_LOWORDER_PCP_LISTS)
order = pageblock_order;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
return order;
}
static inline bool pcp_allowed_order(unsigned int order)
{
if (order <= PAGE_ALLOC_COSTLY_ORDER)
return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order == pageblock_order)
return true;
#endif
return false;
}
static inline void free_the_page(struct page *page, unsigned int order)
{
if (pcp_allowed_order(order)) /* Via pcp? */
free_unref_page(page, order);
else
__free_pages_ok(page, order, FPI_NONE);
}
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page" and have PG_head set.
*
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
*
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
__SetPageHead(page);
for (i = 1; i < nr_pages; i++)
prep_compound_tail(page, i);
prep_compound_head(page, order);
}
void destroy_large_folio(struct folio *folio)
{
if (folio_test_hugetlb(folio)) {
free_huge_folio(folio);
return;
}
if (folio_test_large_rmappable(folio))
folio_undo_large_rmappable(folio);
mem_cgroup_uncharge(folio);
free_the_page(&folio->page, folio_order(folio));
}
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
}
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
capc->cc->zone == zone ? capc : NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
if (!capc || order != capc->cc->order)
return false;
/* Do not accidentally pollute CMA or isolated regions*/
if (is_migrate_cma(migratetype) ||
is_migrate_isolate(migratetype))
return false;
/*
* Do not let lower order allocations pollute a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
*/
if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
return false;
capc->page = page;
return true;
}
#else
static inline struct capture_control *task_capc(struct zone *zone)
{
return NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
return false;
}
#endif /* CONFIG_COMPACTION */
/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
/*
* Used for pages which are on another list. Move the pages to the tail
* of the list - so the moved pages won't immediately be considered for
* allocation again (e.g., optimization for memory onlining).
*/
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order)
{
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
}
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
return list_first_entry_or_null(&area->free_list[migratetype],
struct page, buddy_list);
}
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
unsigned long higher_page_pfn;
struct page *higher_page;
if (order >= MAX_PAGE_ORDER - 1)
return false;
higher_page_pfn = buddy_pfn & pfn;
higher_page = page + (higher_page_pfn - pfn);
return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
NULL) != NULL;
}
/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PageBuddy.
* Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- nyc
*/
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
struct page *buddy;
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
while (order < MAX_PAGE_ORDER) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
goto done_merging;
if (unlikely(order >= pageblock_order)) {
/*
* We want to prevent merge between freepages on pageblock
* without fallbacks and normal pageblock. Without this,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
if (migratetype != buddy_mt
&& (!migratetype_is_mergeable(migratetype) ||
!migratetype_is_mergeable(buddy_mt)))
goto done_merging;
}
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
done_merging:
set_buddy_order(page, order);
if (fpi_flags & FPI_TO_TAIL)
to_tail = true;
else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
if (to_tail)
add_to_free_list_tail(page, zone, order, migratetype);
else
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
/**
* split_free_page() -- split a free page at split_pfn_offset
* @free_page: the original free page
* @order: the order of the page
* @split_pfn_offset: split offset within the page
*
* Return -ENOENT if the free page is changed, otherwise 0
*
* It is used when the free page crosses two pageblocks with different migratetypes
* at split_pfn_offset within the page. The split free page will be put into
* separate migratetype lists afterwards. Otherwise, the function achieves
* nothing.
*/
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset)
{
struct zone *zone = page_zone(free_page);
unsigned long free_page_pfn = page_to_pfn(free_page);
unsigned long pfn;
unsigned long flags;
int free_page_order;
int mt;
int ret = 0;
if (split_pfn_offset == 0)
return ret;
spin_lock_irqsave(&zone->lock, flags);
if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
ret = -ENOENT;
goto out;
}
mt = get_pfnblock_migratetype(free_page, free_page_pfn);
if (likely(!is_migrate_isolate(mt)))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
del_page_from_free_list(free_page, zone, order);
for (pfn = free_page_pfn;
pfn < free_page_pfn + (1UL << order);) {
int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
free_page_order = min_t(unsigned int,
pfn ? __ffs(pfn) : order,
__fls(split_pfn_offset));
__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
mt, FPI_NONE);
pfn += 1UL << free_page_order;
split_pfn_offset -= (1UL << free_page_order);
/* we have done the first part, now switch to second part */
if (split_pfn_offset == 0)
split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
}
out:
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
* check if necessary.
*/
static inline bool page_expected_state(struct page *page,
unsigned long check_flags)
{
if (unlikely(atomic_read(&page->_mapcount) != -1))
return false;
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
#ifdef CONFIG_PAGE_POOL
((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
#endif
(page->flags & check_flags)))
return false;
return true;
}
static const char *page_bad_reason(struct page *page, unsigned long flags)
{
const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
if (unlikely(page->flags & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
#ifdef CONFIG_PAGE_POOL
if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
bad_reason = "page_pool leak";
#endif
return bad_reason;
}
static void free_page_is_bad_report(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
free_page_is_bad_report(page);
return true;
}
static inline bool is_check_pages_enabled(void)
{
return static_branch_unlikely(&check_pages_enabled);
}
static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
struct folio *folio = (struct folio *)head_page;
int ret = 1;
/*
* We rely page->lru.next never has bit 0 set, unless the page
* is PageTail(). Let's make sure that's true even for poisoned ->lru.
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
if (!is_check_pages_enabled()) {
ret = 0;
goto out;
}
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
if (unlikely(folio_entire_mapcount(folio))) {
bad_page(page, "nonzero entire_mapcount");
goto out;
}
if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
if (unlikely(atomic_read(&folio->_pincount))) {
bad_page(page, "nonzero pincount");
goto out;
}
break;
case 2:
/*
* the second tail page: ->mapping is
* deferred_list.next -- ignore value.
*/
break;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
out:
page->mapping = NULL;
clear_compound_head(page);
return ret;
}
/*
* Skip KASAN memory poisoning when either:
*
* 1. For generic KASAN: deferred memory initialization has not yet completed.
* Tag-based KASAN modes skip pages freed via deferred memory initialization
* using page tags instead (see below).
* 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
* that error detection is disabled for accesses via the page address.
*
* Pages will have match-all tags in the following circumstances:
*
* 1. Pages are being initialized for the first time, including during deferred
* memory init; see the call to page_kasan_tag_reset in __init_single_page.
* 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
* exception of pages unpoisoned by kasan_unpoison_vmalloc.
* 3. The allocation was excluded from being checked due to sampling,
* see the call to kasan_unpoison_pages.
*
* Poisoning pages during deferred memory init will greatly lengthen the
* process and cause problem in large memory systems as the deferred pages
* initialization is done with interrupt disabled.
*
* Assuming that there will be no reference to those newly initialized
* pages before they are ever allocated, this should have no effect on
* KASAN memory tracking as the poison will be properly inserted at page
* allocation time. The only corner case is when pages are allocated by
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
static inline bool should_skip_kasan_poison(struct page *page)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return deferred_pages_enabled();
return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
static void kernel_init_pages(struct page *page, int numpages)
{
int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++)
clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
}
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
kmsan_free_page(page, order);
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
if (unlikely(PageHWPoison(page)) && !order) {
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
return false;
}
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
int i;
if (compound)
page[1].flags &= ~PAGE_FLAGS_SECOND;
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
if (is_check_pages_enabled()) {
if (free_page_is_bad(page + i)) {
bad++;
continue;
}
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (PageMappingFlags(page))
page->mapping = NULL;
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
if (bad)
return false;
}
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
kernel_poison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* KASAN poisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
if (!skip_kasan_poison) {
kasan_poison_pages(page, order, init);
/* Memory is already initialized if KASAN did it internally. */
if (kasan_has_integrated_init())
init = false;
}
if (init)
kernel_init_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
* happen after this.
*/
arch_free_page(page, order);
debug_pagealloc_unmap_pages(page, 1 << order);
return true;
}
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone.
* count is the number of pages to free.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
unsigned long flags;
unsigned int order;
bool isolated_pageblocks;
struct page *page;
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
spin_lock_irqsave(&zone->lock, flags);
isolated_pageblocks = has_isolate_pageblock(zone);
while (count > 0) {
struct list_head *list;
int nr_pages;
/* Remove pages from lists in a round-robin fashion. */
do {
if (++pindex > NR_PCP_LISTS - 1)
pindex = 0;
list = &pcp->lists[pindex];
} while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order;
do {
int mt;
page = list_last_entry(list, struct page, pcp_list);
mt = get_pcppage_migratetype(page);
/* must delete to avoid corrupting pcp list */
list_del(&page->pcp_list);
count -= nr_pages;
pcp->count -= nr_pages;
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
spin_unlock_irqrestore(&zone->lock, flags);
}
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
}
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
int migratetype;
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
if (!free_pages_prepare(page, order))
return;
/*
* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
* is used to avoid calling get_pfnblock_migratetype() under the lock.
* This will reduce the lock holding time.
*/
migratetype = get_pfnblock_migratetype(page, pfn);
free_one_page(zone, page, pfn, order, migratetype, fpi_flags);
__count_vm_events(PGFREE, 1 << order);
}
void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;
/*
* When initializing the memmap, __init_single_page() sets the refcount
* of all pages to 1 ("allocated"/"not free"). We have to set the
* refcount of all involved pages to 0.
*/
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
if (page_contains_unaccepted(page, order)) {
if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
return;
accept_page(page, order);
}
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
__free_pages_ok(page, order, FPI_TO_TAIL);
}
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
* with the migration of free compaction scanner.
*
* Return struct page pointer of start_pfn, or NULL if checks were not passed.
*
* It's possible on some configurations to have a setup like node0 node1 node0
* i.e. it's possible that all pages within a zones range of pages do not
* belong to a single zone. We assume that a border between node0 and node1
* can occur within a single pageblock, but not a node0 node1 node0
* interleaving within a single pageblock. It is therefore sufficient to check
* the first and last page of a pageblock and avoid checking each individual
* page in a pageblock.
*
* Note: the function may return non-NULL struct page even for a page block
* which contains a memory hole (i.e. there is no physical memory for a subset
* of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
* will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
* even though the start pfn is online and valid. This should be safe most of
* the time because struct pages are still initialized via init_unavailable_range()
* and pfn walkers shouldn't touch any physical memory range for which they do
* not recognize any specific metadata in struct pages.
*/
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
{
struct page *start_page;
struct page *end_page;
/* end_pfn is one past the range we are checking */
end_pfn--;
if (!pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
if (!start_page)
return NULL;
if (page_zone(start_page) != zone)
return NULL;
end_page = pfn_to_page(end_pfn);
/* This gives a shorter code than deriving page_zone(end_page) */
if (page_zone_id(start_page) != page_zone_id(end_page))
return NULL;
return start_page;
}
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}
static void check_new_page_bad(struct page *page)
{
if (unlikely(page->flags & __PG_HWPOISON)) {
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
* This page is about to be returned from the page allocator
*/
static bool check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
return false;
check_new_page_bad(page);
return true;
}
static inline bool check_new_pages(struct page *page, unsigned int order)
{
if (is_check_pages_enabled()) {
for (int i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (check_new_page(p))
return true;
}
}
return false;
}
static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
/* Don't skip if a software KASAN mode is enabled. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
IS_ENABLED(CONFIG_KASAN_SW_TAGS))
return false;
/* Skip, if hardware tag-based KASAN is not enabled. */
if (!kasan_hw_tags_enabled())
return true;
/*
* With hardware tag-based KASAN enabled, skip if this has been
* requested via __GFP_SKIP_KASAN.
*/
return flags & __GFP_SKIP_KASAN;
}
static inline bool should_skip_init(gfp_t flags)
{
/* Don't skip, if hardware tag-based KASAN is not enabled. */
if (!kasan_hw_tags_enabled())
return false;
/* For hardware tag-based KASAN, skip if requested. */
return (flags & __GFP_SKIP_ZERO);
}
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
int i;
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
/*
* Page unpoisoning must happen before memory initialization.
* Otherwise, the poison pattern will be overwritten for __GFP_ZERO
* allocations and the page unpoisoning code will complain.
*/
kernel_unpoison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* KASAN unpoisoning and memory initializion code must be
* kept together to avoid discrepancies in behavior.
*/
/*
* If memory tags should be zeroed
* (which happens only when memory should be initialized as well).
*/
if (zero_tags) {
/* Initialize both memory and memory tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
/* Take note that memory was initialized by the loop above. */
init = false;
}
if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) {
/* Take note that memory was initialized by KASAN. */
if (kasan_has_integrated_init())
init = false;
} else {
/*
* If memory tags have not been set by KASAN, reset the page
* tags to ensure page_address() dereferencing does not fault.
*/
for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
}
/* If memory is still not initialized, initialize it now. */
if (init)
kernel_init_pages(page, 1 << order);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
post_alloc_hook(page, order, gfp_flags);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
}
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
return page;
}
return NULL;
}
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*
* The other migratetypes do not have fallbacks.
*/
static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
};
#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
/*
* Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
unsigned long pfn;
unsigned int order;
int pages_moved = 0;
for (pfn = start_pfn; pfn <= end_pfn;) {
page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
pfn++;
continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
pfn += 1 << order;
pages_moved += 1 << order;
}
return pages_moved;
}
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
pfn = page_to_pfn(page);
start_pfn = pageblock_start_pfn(pfn);
end_pfn = pageblock_end_pfn(pfn) - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
int start_order, int migratetype)
{
int nr_pageblocks = 1 << (start_order - pageblock_order);
while (nr_pageblocks--) {
set_pageblock_migratetype(pageblock_page, migratetype);
pageblock_page += pageblock_nr_pages;
}
}
/*
* When we are falling back to another migratetype during allocation, try to
* steal extra free pages from the same pageblocks to satisfy further
* allocations, instead of polluting multiple pageblocks.
*
* If we are stealing a relatively large buddy page, it is likely there will
* be more free pages in the pageblock, so try to steal them all. For
* reclaimable and unmovable allocations, we steal regardless of page size,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
*/
static bool can_steal_fallback(unsigned int order, int start_mt)
{
/*
* Leaving this order check is intended, although there is
* relaxed order check in next check. The reason is that
* we can actually steal whole pageblock if this condition met,
* but, below check doesn't guarantee it and that is just heuristic
* so could be changed anytime.
*/
if (order >= pageblock_order)
return true;
if (order >= pageblock_order / 2 ||
start_mt == MIGRATE_RECLAIMABLE ||
start_mt == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled)
return true;
return false;
}
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
return false;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
* in a small area, boosting the watermark can cause an out of
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
/*
* high watermark may be uninitialised if fragmentation occurs
* very early in boot so do not boost. We do not fall
* through and boost by pageblock_nr_pages as failing
* allocations that early means that reclaim is not going
* to help and it may even be impossible to reclaim the
* boosted watermark resulting in a hang.
*/
if (!max_boost)
return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
return true;
}
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
* pageblock to our migratetype and determine how many already-allocated pages
* are there in the pageblock with a compatible migratetype. If at least half
* of pages are free or compatible, we can change migratetype of the pageblock
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
old_block_type = get_pageblock_migratetype(page);
/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
if (is_migrate_highatomic(old_block_type))
goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
goto single_page;
}
/*
* Boost watermarks to increase reclaim pressure to reduce the
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
/* moving whole block can fail due to zone boundary conditions */
if (!free_pages)
goto single_page;
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
* we just obtained. For other types it's a bit more tricky.
*/
if (start_type == MIGRATE_MOVABLE) {
alike_pages = movable_pages;
} else {
/*
* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
* to MOVABLE pageblock, consider all non-movable pages as
* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
if (old_block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
alike_pages = 0;
}
/*
* If a sufficient number of pages in the block are either free or of
* compatible migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
return;
single_page:
move_to_free_list(page, zone, current_order, start_type);
}
/*
* Check whether there is a suitable fallback freepage with requested order.
* If only_stealable is true, this function returns fallback_mt only if
* we can steal other freepages all together. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal)
{
int i;
int fallback_mt;
if (area->nr_free == 0)
return -1;
*can_steal = false;
for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
fallback_mt = fallbacks[migratetype][i];
if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
*can_steal = true;
if (!only_stealable)
return fallback_mt;
if (*can_steal)
return fallback_mt;
}
return -1;
}
/*
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
{
int mt;
unsigned long max_managed, flags;
/*
* The number reserved as: minimum is 1 pageblock, maximum is
* roughly 1% of a zone. But if 1% of a zone falls below a
* pageblock size, then don't reserve any pageblocks.
* Check is race-prone but harmless.
*/
if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
return;
max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
if (zone->nr_reserved_highatomic >= max_managed)
return;
spin_lock_irqsave(&zone->lock, flags);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
goto out_unlock;
/* Yoink! */
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
if (migratetype_is_mergeable(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* Used when an allocation is about to fail under memory pressure. This
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
*
* If @force is true, try to unreserve a pageblock even though highatomic
* pageblock is exhausted.
*/
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
int order;
bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
* is really high.
*/
if (!force && zone->nr_reserved_highatomic <=
pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
* in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
* drain to add pages to MIGRATE_HIGHATOMIC
* while unreserving so be safe and watch for
* underflows.
*/
zone->nr_reserved_highatomic -= min(
pageblock_nr_pages,
zone->nr_reserved_highatomic);
}
/*
* Convert to ac->migratetype and avoid the normal
* pageblock stealing heuristics. Minimally, the caller
* is doing the work and needs the pages. More
* importantly, if the block was always converted to
* MIGRATE_UNMOVABLE or another type then the number
* of pageblocks that cannot be completely freed
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
ret = move_freepages_block(zone, page, ac->migratetype,
NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
}
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order > MAX_PAGE_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;
if (IS_ENABLED(CONFIG_CMA)) {
/*
* Balance movable allocations between regular and CMA areas by
* allocating from CMA when over half of the zone's free memory
* is in the CMA area.
*/
if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
return page;
}
}
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
}
return page;
}
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
unsigned long flags;
int i;
spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->pcp_list, list);
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock_irqrestore(&zone->lock, flags);
return i;
}
/*
* Called from the vmstat counter updater to decay the PCP high.
* Return whether there are addition works to do.
*/
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
int high_min, to_drain, batch;
int todo = 0;
high_min = READ_ONCE(pcp->high_min);
batch = READ_ONCE(pcp->batch);
/*
* Decrease pcp->high periodically to try to free possible
* idle PCP pages. And, avoid to free too many pages to
* control latency. This caps pcp->high decrement too.
*/
if (pcp->high > high_min) {
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
pcp->high - (pcp->high >> 3), high_min);
if (pcp->high > high_min)
todo++;
}
to_drain = pcp->count - pcp->high;
if (to_drain > 0) {
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
todo++;
}
return todo;
}
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
}
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count) {
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, pcp->count, pcp, 0);
spin_unlock(&pcp->lock);
}
}
/*
* Drain pcplists of all zones on the indicated processor.
*/
static void drain_pages(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone) {
drain_pages_zone(cpu, zone);
}
}
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
void drain_local_pages(struct zone *zone)
{
int cpu = smp_processor_id();
if (zone)
drain_pages_zone(cpu, zone);
else
drain_pages(cpu);
}
/*
* The implementation of drain_all_pages(), exposing an extra parameter to
* drain on all cpus.
*
* drain_all_pages() is optimized to only execute on cpus where pcplists are
* not empty. The check for non-emptiness can however race with a free to
* pcplist that has not yet increased the pcp->count from 0 to 1. Callers
* that need the guarantee that every CPU has drained can disable the
* optimizing racy check.
*/
static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{
int cpu;
/*
* Allocate in the BSS so we won't require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
*/
if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
if (!zone)
return;
mutex_lock(&pcpu_drain_mutex);
}
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
if (force_all_cpus) {
/*
* The pcp.count check is racy, some callers need a
* guarantee that no cpu is missed.
*/
has_pcps = true;
} else if (zone) {
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
if (pcp->count) {
has_pcps = true;
break;
}
}
}
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
for_each_cpu(cpu, &cpus_with_pcps) {
if (zone)
drain_pages_zone(cpu, zone);
else
drain_pages(cpu);
}
mutex_unlock(&pcpu_drain_mutex);
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
*/
void drain_all_pages(struct zone *zone)
{
__drain_all_pages(zone, false);
}
static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
unsigned int order)
{
int migratetype;
if (!free_pages_prepare(page, order))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
return true;
}
static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
int min_nr_free, max_nr_free;
/* Free as much as possible if batch freeing high-order pages. */
if (unlikely(free_high))
return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
return 1;
/* Leave at least pcp->batch pages on the list */
min_nr_free = batch;
max_nr_free = high - batch;
/*
* Increase the batch number to the number of the consecutive
* freed pages to reduce zone lock contention.
*/
batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
return batch;
}
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
int batch, bool free_high)
{
int high, high_min, high_max;
high_min = READ_ONCE(pcp->high_min);
high_max = READ_ONCE(pcp->high_max);
high = pcp->high = clamp(pcp->high, high_min, high_max);
if (unlikely(!high))
return 0;
if (unlikely(free_high)) {
pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
high_min);
return 0;
}
/*
* If reclaim is active, limit the number of pages that can be
* stored on pcp lists
*/
if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
int free_count = max_t(int, pcp->free_count, batch);
pcp->high = max(high - free_count, high_min);
return min(batch << 2, pcp->high);
}
if (high_min == high_max)
return high;
if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
int free_count = max_t(int, pcp->free_count, batch);
pcp->high = max(high - free_count, high_min);
high = max(pcp->count, high_min);
} else if (pcp->count >= high) {
int need_high = pcp->free_count + batch;
/* pcp->high should be large enough to hold batch freed pages */
if (pcp->high < need_high)
pcp->high = clamp(need_high, high_min, high_max);
}
return high;
}
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
struct page *page, int migratetype,
unsigned int order)
{
int high, batch;
int pindex;
bool free_high = false;
/*
* On freeing, reduce the number of pages that are batch allocated.
* See nr_pcp_alloc() where alloc_factor is increased for subsequent
* allocations.
*/
pcp->alloc_factor >>= 1;
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
batch = READ_ONCE(pcp->batch);
/*
* As high-order pages other than THP's stored on PCP can contribute
* to fragmentation, limit the number stored when PCP is heavily
* freeing without allocation. The remainder after bulk freeing
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
free_high = (pcp->free_count >= batch &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
pcp->count >= READ_ONCE(batch)));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
pcp, pindex);
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
ZONE_MOVABLE, 0))
clear_bit(ZONE_BELOW_HIGH, &zone->flags);
}
}
/*
* Free a pcp page
*/
void free_unref_page(struct page *page, unsigned int order)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype, pcpmigratetype;
if (!free_unref_page_prepare(page, pfn, order))
return;
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Place ISOLATE pages on the isolated list because they are being
* offlined but treat HIGHATOMIC and CMA as movable pages so we can
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
migratetype = pcpmigratetype = get_pcppage_migratetype(page);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
pcpmigratetype = MIGRATE_MOVABLE;
}
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
free_unref_page_commit(zone, pcp, page, pcpmigratetype, order);
pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
}
pcp_trylock_finish(UP_flags);
}
/*
* Free a batch of folios
*/
void free_unref_folios(struct folio_batch *folios)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j, migratetype;
/* Prepare folios for freeing */
for (i = 0, j = 0; i < folios->nr; i++) {
struct folio *folio = folios->folios[i];
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
if (order > 0 && folio_test_large_rmappable(folio))
folio_undo_large_rmappable(folio);
if (!free_unref_page_prepare(&folio->page, pfn, order))
continue;
/*
* Free isolated folios and orders not handled on the PCP
* directly to the allocator, see comment in free_unref_page.
*/
migratetype = get_pcppage_migratetype(&folio->page);
if (!pcp_allowed_order(order) ||
is_migrate_isolate(migratetype)) {
free_one_page(folio_zone(folio), &folio->page, pfn,
order, migratetype, FPI_NONE);
continue;
}
folio->private = (void *)(unsigned long)order;
if (j != i)
folios->folios[j] = folio;
j++;
}
folios->nr = j;
for (i = 0; i < folios->nr; i++) {
struct folio *folio = folios->folios[i];
struct zone *zone = folio_zone(folio);
unsigned int order = (unsigned long)folio->private;
folio->private = NULL;
migratetype = get_pcppage_migratetype(&folio->page);
/* Different zone requires a different pcp lock */
if (zone != locked_zone) {
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
/*
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
pcp_trylock_finish(UP_flags);
free_one_page(zone, &folio->page,
folio_pfn(folio), order,
migratetype, FPI_NONE);
locked_zone = NULL;
continue;
}
locked_zone = zone;
}
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
*/
if (unlikely(migratetype >= MIGRATE_PCPTYPES))
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
free_unref_page_commit(zone, pcp, &folio->page, migratetype,
order);
}
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
folio_batch_reinit(folios);
}
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
void split_page(struct page *page, unsigned int order)
{
int i;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
struct zone *zone = page_zone(page);
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
unsigned long watermark;
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
* watermark, because we already know our high-order page
* exists.
*/
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
del_page_from_free_list(page, zone, order);
/*
* Set the pageblock if the isolated page is at least half of a
* pageblock
*/
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
/*
* Only change normal pageblocks (i.e., they can merge
* with others)
*/
if (migratetype_is_mergeable(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
}
return 1UL << order;
}
/**
* __putback_isolated_page - Return a now-isolated page back where we got it
* @page: Page that was isolated
* @order: Order of the isolated page
* @mt: The page's pageblock's migratetype
*
* This function is meant to return a page pulled from the free lists via
* __isolate_free_page back to the free lists they were pulled from.
*/
void __putback_isolated_page(struct page *page, unsigned int order, int mt)
{
struct zone *zone = page_zone(page);
/* zone lock should be held when this function is called */
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
__free_one_page(page, page_to_pfn(page), zone, order, mt,
FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
* Update NUMA hit/miss statistics
*/
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
long nr_account)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
/* skip numa counters update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return;
if (zone_to_nid(z) != numa_node_id())
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
__count_numa_events(z, NUMA_HIT, nr_account);
else {
__count_numa_events(z, NUMA_MISS, nr_account);
__count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
__count_numa_events(z, local_stat, nr_account);
#endif
}
static __always_inline
struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
unsigned int order, unsigned int alloc_flags,
int migratetype)
{
struct page *page;
unsigned long flags;
do {
page = NULL;
spin_lock_irqsave(&zone->lock, flags);
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
* If the allocation fails, allow OOM handling access
* to HIGHATOMIC reserves as failing now is worse than
* failing a high-order atomic allocation in the
* future.
*/
if (!page && (alloc_flags & ALLOC_OOM))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
}
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
return page;
}
static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
{
int high, base_batch, batch, max_nr_alloc;
int high_max, high_min;
base_batch = READ_ONCE(pcp->batch);
high_min = READ_ONCE(pcp->high_min);
high_max = READ_ONCE(pcp->high_max);
high = pcp->high = clamp(pcp->high, high_min, high_max);
/* Check for PCP disabled or boot pageset */
if (unlikely(high < base_batch))
return 1;
if (order)
batch = base_batch;
else
batch = (base_batch << pcp->alloc_factor);
/*
* If we had larger pcp->high, we could avoid to allocate from
* zone.
*/
if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
high = pcp->high = min(high + batch, high_max);
if (!order) {
max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
/*
* Double the number of pages allocated each time there is
* subsequent allocation of order-0 pages without any freeing.
*/
if (batch <= max_nr_alloc &&
pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
pcp->alloc_factor++;
batch = min(batch, max_nr_alloc);
}
/*
* Scale batch relative to order if batch implies free pages
* can be stored on the PCP. Batch can be 1 for small zones or
* for boot pagesets which should never store free pages as
* the pages may belong to arbitrary zones.
*/
if (batch > 1)
batch = max(batch >> order, 2);
return batch;
}
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
do {
if (list_empty(list)) {
int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;
alloced = rmqueue_bulk(zone, order,
batch, list,
migratetype, alloc_flags);
pcp->count += alloced << order;
if (unlikely(list_empty(list)))
return NULL;
}
page = list_first_entry(list, struct page, pcp_list);
list_del(&page->pcp_list);
pcp->count -= 1 << order;
} while (check_new_pages(page, order));
return page;
}
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long __maybe_unused UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
pcp_trylock_finish(UP_flags);
return NULL;
}
/*
* On allocation, reduce the number of pages that are batch freed.
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
}
/*
* Allocate a page from the given zone.
* Use pcplists for THP or "cheap" high-order allocations.
*/
/*
* Do not instrument rmqueue() with KMSAN. This function may call
* __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
* may call rmqueue() again, which will result in a deadlock.
*/
__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
struct page *page;
/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
if (likely(pcp_allowed_order(order))) {
page = rmqueue_pcplist(preferred_zone, zone, order,
migratetype, alloc_flags);
if (likely(page))
goto out;
}
page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
if ((alloc_flags & ALLOC_KSWAPD) &&
unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
}
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
long unusable_free = (1 << order) - 1;
/*
* If the caller does not have rights to reserves below the min
* watermark then subtract the high-atomic reserves. This will
* over-estimate the size of the atomic reserve but it avoids a search.
*/
if (likely(!(alloc_flags & ALLOC_RESERVES)))
unusable_free += z->nr_reserved_highatomic;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
unusable_free += zone_page_state(z, NR_UNACCEPTED);
#endif
return unusable_free;
}
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
* one free page of a suitable size. Checking now avoids taking the zone lock
* to check in the allocation paths if no pages are free.
*/
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
int o;
/* free_pages may go negative - that's OK */
free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
if (unlikely(alloc_flags & ALLOC_RESERVES)) {
/*
* __GFP_HIGH allows access to 50% of the min reserve as well
* as OOM.
*/
if (alloc_flags & ALLOC_MIN_RESERVE) {
min -= min / 2;
/*
* Non-blocking allocations (e.g. GFP_ATOMIC) can
* access more reserves than just __GFP_HIGH. Other
* non-blocking allocations requests such as GFP_NOWAIT
* or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
* access to the min reserve.
*/
if (alloc_flags & ALLOC_NON_BLOCK)
min -= min / 4;
}
/*
* OOM victims can try even harder than the normal reserve
* users on the grounds that it's definitely going to be in
* the exit path shortly and free memory. Any allocation it
* makes during the free path will be small and short-lived.
*/
if (alloc_flags & ALLOC_OOM)
min -= min / 2;
}
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
return false;
/* If this is an order-0 request then the watermark is fine */
if (!order)
return true;
/* For a high-order request, check at least one suitable page is free */
for (o = order; o < NR_PAGE_ORDERS; o++) {
struct free_area *area = &z->free_area[o];
int mt;
if (!area->nr_free)
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
!free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
!free_area_empty(area, MIGRATE_HIGHATOMIC)) {
return true;
}
}
return false;
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags, gfp_t gfp_mask)
{
long free_pages;
free_pages = zone_page_state(z, NR_FREE_PAGES);
/*
* Fast check for order-0 only. If this fails then the reserves
* need to be calculated.
*/
if (!order) {
long usable_free;
long reserved;
usable_free = free_pages;
reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
/* reserved may over estimate high-atomic reserves. */
usable_free -= min(usable_free, reserved);
if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages))
return true;
/*
* Ignore watermark boosting for __GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
alloc_flags, free_pages);
}
return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
free_pages);
}
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
/*
* The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
* fragmentation is subtle. If the preferred zone was HIGHMEM then
* premature use of a lower zone may cause lowmem pressure problems that
* are worse than fragmentation. If the next zone is ZONE_DMA then it is
* probably too small. It only makes sense to spread allocations to avoid
* fragmentation between the Normal and DMA32 zones.
*/
static inline unsigned int
alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{
unsigned int alloc_flags;
/*
* __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save a branch.
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
if (zone_idx(zone) != ZONE_NORMAL)
return alloc_flags;
/*
* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
* the pointer is within zone->zone_pgdat->node_zones[]. Also assume
* on UMA that if Normal is populated then so is DMA32.
*/
BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
if (nr_online_nodes > 1 && !populated_zone(--zone))
return alloc_flags;
alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
return alloc_flags;
}
/* Must be called after current_gfp_context() which can change gfp_mask */
static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat != zone->zone_pgdat) {
last_pgdat = zone->zone_pgdat;
last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
}
if (!last_pgdat_dirty_ok)
continue;
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.