- Feb 2025
-
elixir.bootlin.com elixir.bootlin.com
-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) wakeup_kcompactd(pgdat, order, highest_zoneidx); return; }
The function would call compaction once: 1. the reclamation has failed many times 2. the problem comes from external fragmentation
-
-
elixir.bootlin.com elixir.bootlin.com
-
static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; int min_priority; bool ret = false; int retries = *compaction_retries; enum compact_priority priority = *compact_priority; if (!order) return false; if (fatal_signal_pending(current)) return false; /* * Compaction was skipped due to a lack of free order-0 * migration targets. Continue if reclaim can help. */ if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* * Compaction managed to coalesce some page blocks, but the * allocation failed presumably due to a race. Retry some. */ if (compact_result == COMPACT_SUCCESS) { /* * !costly requests are much more important than * __GFP_RETRY_MAYFAIL costly ones because they are de * facto nofail and invoke OOM killer to move on while * costly can fail and users are ready to cope with * that. 1/4 retries is rather arbitrary but we would * need much more detailed feedback from compaction to * make a better decision. */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; if (++(*compaction_retries) <= max_retries) { ret = true; goto out; } } /* * Compaction failed. Retry with increasing priority. */ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; ret = true; } out: trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); return ret; }
This function determines whether the system should retry memory compaction during page allocation. 1. If the allocation order is 0, no need for compaction 2. If compaction was skipped because of a lack of free order-0 pages: Check if memory reclaim can help. 3. If compaction succeeded but allocation still failed: retry 1/4 times. 4. If the compaction failed. retry with increase priority
-
-
elixir.bootlin.com elixir.bootlin.com
-
/* * A fragmentation index only makes sense if an allocation of a requested * size would fail. If that is true, the fragmentation index indicates * whether external fragmentation or a lack of memory was the problem. * The value can be used to determine if page reclaim or compaction * should be used */ static int __fragmentation_index(unsigned int order, struct contig_page_info *info) { unsigned long requested = 1UL << order; if (WARN_ON_ONCE(order > MAX_ORDER)) return 0; if (!info->free_blocks_total) return 0; /* Fragmentation index only makes sense when a request would fail */ if (info->free_blocks_suitable) return -1000; /* * Index is between 0 and 1 so return within 3 decimal places * * 0 => allocation would fail due to lack of memory * 1 => allocation would fail due to fragmentation */ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); }
An algorithm to calculate the current zone’s external fragmentation index, where a higher value indicates greater fragmentation.
-
-
elixir.bootlin.com elixir.bootlin.com
-
.mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
A migration mode config asynchronous migration : synchronous migration
-
/* Returns true if compaction should be skipped this time */ static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; if (order < zone->compact_order_failed) return false; /* Avoid possible overflow */ if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; return false; } trace_mm_compaction_deferred(zone, order); return true; }
Trivial policy. The function to decide if the system can defer current compaction request. defer -> if the number of page order >= threshold (defer_limit) The threshold is an integer.
-
-
elixir.bootlin.com elixir.bootlin.com
-
enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC };
Define the priority of compaction strategy. 0: COMPACT_PRIO_SYNC_FULL 1: COMPACT_PRIO_SYNC_LIGHT 2: COMPACT_PRIO_ASYNC
-
-
elixir.bootlin.com elixir.bootlin.com
-
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } if (page_mapcount(page) > 1) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } page = compound_head(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (PageLocked(page)) { result = SCAN_PAGE_LOCK; goto out_unmap; } if (!PageAnon(page)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; } else if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; }
Check if normal (anonymous) pages can be collapsed
-
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { ++swap; if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } /* * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { struct page *head = compound_head(page); result = compound_order(head) == HPAGE_PMD_ORDER && head->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so * it's safe to skip LRU and refcount checks before * returning. */ break; } node = page_to_nid(page); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; break; } if (page_count(page) != 1 + page_mapcount(page) + page_has_private(page)) { result = SCAN_PAGE_COUNT; break; } /* * We probably should check if the page is referenced here, but * nobody would transfer pte_young() to PageReferenced() for us. * And rmap walk here is just too costly... */ present++; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } }
Decide if small file pages should be collapsed. 1. swap pages 2. compound pages 3. Pinned pages
-