368 Matching Annotations
  1. Oct 2024
    1. static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed)

      The dynamic calculation of dirty_ratelimit, the use of step size adjustments, filtering of singular points to avoid spikes, and special handling for strict limits.

    2. static void domain_dirty_limits(struct dirty_throttle_control *dtc)

      Heuristic based algorithms for calculating thresh and bg_thresh. Line 419-422 adds additional threshold if the task is determined to be real time task.

    3. static void wb_position_ratio(struct dirty_throttle_control *dtc)

      dynamically calculates pos_ratio based on the state of global and per-writeback (wb) dirty limits, write bandwidth, and thresholds

    4. /* throttle according to the chosen dtc */ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); if (flags & BDP_ASYNC) { ret = -EAGAIN; break; } __set_current_state(TASK_KILLABLE); bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause;

      This part of the code makes algorithmic decision on how long a task should throttle based on the rate at which it dirties memory pages.

    1. struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { struct folio *folio = page_folio(page); struct anon_vma *anon_vma = folio_anon_vma(folio); struct page *new_page; if (PageKsm(page)) { if (page_stable_node(page) && !(ksm_run & KSM_RUN_UNMERGE)) return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ } else if (page->index == linear_page_index(vma, address) && anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!PageUptodate(page)) return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page && mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { put_page(new_page); new_page = NULL; } if (new_page) { if (copy_mc_user_highpage(new_page, page, address, vma)) { put_page(new_page); memory_failure_queue(page_to_pfn(page), 0); return ERR_PTR(-EHWPOISON); } SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_page; }

      This function decides whether a page needs to be copied based on its status as a KSM page, its association with an anonymous VMA, and its hardware-poisoned or up-to-date status.

    2. static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page; }

      This function dynamically decides when two pages should be merged into a KSM page. If the merge fails, it applies a fallback policy by breaking the COW relationship, ensuring that memory management remains consistent.

    3. static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; }

      This function checks whether the VMA is mergable and proceeds based on that condition, and it dynamically removes the reverse mapping item from the unstable tree after a successful merge.

    4. static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); }

      This function checks whether the ksmd should run based on the KSM_RUN_MERGE flag and whether there are any memory regions to process. The decision to enable or disable memory merging is controlled by the ksm_run configuration.

    5. static unsigned int ksm_thread_sleep_millisecs = 20;

      This variable sets the delay (in milliseconds) for the KSM daemon to sleep between scanning batches. It controls the scanning frequency, balancing KSM's memory merging process with system performance.

    6. static unsigned int ksm_thread_pages_to_scan = 100;

      This variable determines the number of pages the KSM daemon will scan in one batch. It controls the performance and efficiency of KSM's scanning process, balancing between thoroughness and system load.

    7. static int ksm_max_page_sharing = 256;

      This variable controls the maximum number of page slots that can share a stable node in the stable tree for KSM. It defines a limit for page sharing, affecting how KSM consolidates memory pages.

    8. static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

      This variable sets the delay (in milliseconds) for pruning stale stable_node_dups in the stable_node_chains. It controls how often KSM will perform cleanup operations on stale nodes.

    1. /* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true;

      The code includes a policy for soft offline pages (MF_SOFT_OFFLINE). This is a feature where the kernel attempts to migrate pages to avoid using faulty memory areas. The policy allows non-LRU movable pages (pages that aren’t in the Least Recently Used list) to be migrated.

    2. if (TestSetPageHWPoison(p)) { pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); if (flags & MF_COUNT_INCREASED) put_page(p); goto unlock_mutex; }

      The page isolation policy ensures that faulty pages are marked as hardware poisoned (using TestSetPageHWPoison). The kernel checks if a page has already been isolated (marked as poisoned) before taking any further action. This prevents faulty pages from being reused by the system and isolates them to avoid further corruption.

    1. /* * Should not even be attempting cluster allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP)) { VM_WARN_ON_ONCE(1); return 0; }

      The policy dictates that if huge page swapping is disabled at compile time (via the CONFIG_THP_SWAP configuration option), the kernel should not attempt to allocate swap clusters for huge pages and should fail the operation with a warning.

    2. #ifdef CONFIG_HIBERNATION

      If CONFIG_HIBERNATION is defined, the kernel includes code to write the entire system memory state to the swapfile before powering down the system. This involves allocating swap slots for the entire memory state and ensuring that the data is properly stored.

    3. if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), TTRS_UNMAPPED | TTRS_FULL);

      This policy optimizes memory usage by attempting to free both the swap entry and the page cache when they are no longer needed. The system avoids keeping cached pages in memory unnecessarily, thus freeing up resources for other processes.

    4. /* * Use percpu scan base for SSD to reduce lock contention on * cluster and swap cache. For HDD, sequential access is more * important. */ if (si->flags & SWP_SOLIDSTATE) scan_base = this_cpu_read(*si->cluster_next_cpu); else scan_base = si->cluster_next; offset = scan_base;

      Algorithmic Policy: Check if the device is HDD or SSD. If SSD, allow different cpus to scan different parts of the swap map. If HDD, check for slots sequentially.

    5. /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; goto scan; /* check next one */ }

      This is an algorithmic policy. If you're running out of swap spaces, try to reclaim space in the cache-only swap.

    6. #define LATENCY_LIMIT 256

      LATENCY_LIMIT is an upper bound on how many slots can be checked by scan_swap_map_slots before it yields the cpu back. It's set to 64. This policy prevents the function from monopolizing the cpu.

    7. node = numa_node_id();

      This algorithmic policy is choosing which node to look at for swap devices.

      In particular, it only tries swapping pages with devices that are at the same Numa Node as the core, and not any other.

    8. plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);

      This is an algorithmic policy that decides which of the swap devices should be used to get swap pages from, in that numa node. What's interesting here is that once a device is seen, it is requeued, so that the burden is spread evenly over all the devices with same priority, on that that node.

    9. n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

      SWAP_BATCH: This is a configuration policy (Macro) that caps the maximum number of entries that can be swapped in a single operation. It's set to 64. This is done to limit amount of swapping that happens.

    1. if (!sysctl_compaction_proactiveness) timeout = MAX_SCHEDULE_TIMEOUT;

      If proactive compaction is disabled (sysctl_compaction_proactiveness is 0), the daemon sets its timeout to MAX_SCHEDULE_TIMEOUT, effectively sleeping until explicitly woken up. If proactive compaction is enabled, it uses the default timeout to periodically wake up and check for work.

    1. if (movable_node_is_enabled()) {

      This policy, as the comment states, will ignore the kernelcore and movablecore options if movable nodes are enabled (skipping the logic below this if statement and jumping to "out2" instead). The logic for the "movable_node_is_enabled()" function is in "memory_hotplug.h".

    2. static bool __meminit defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; if (early_page_ext_enabled()) return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. */ if (prev_end_pfn != end_pfn) { prev_end_pfn = end_pfn; nr_initialised = 0; } /* Always populate low zones for address-constrained allocations */ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. */ nr_initialised++; if ((nr_initialised > PAGES_PER_SECTION) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { NODE_DATA(nid)->first_deferred_pfn = pfn; return true; } return false; }

      This policy decides whether to defer the remaining initialization; it's used on line 885 of this page. Note that this function definition relies on CONFIG_DEFERRED_STRUCT_PAGE_INIT being defined.

    1. if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ if (kasan_has_integrated_init()) init = false; } else { /* * If memory tags have not been set by KASAN, reset the page * tags to ensure page_address() dereferencing does not fault. */ for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); }

      This decision made by configuration (KASAN's mode)

    2. if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask))

      const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;

      The value of PAGE_ALLOC_COSTLY_ORDER is defined as 3, heuristically determining whether try direct compaction or not. However, can_direct_reclaim is determined by caller, so it is also a configuaration policy.

    3. if (!node_isset(node, *used_node_mask)) { node_set(node, *used_node_mask); return node; } for_each_node_state(n, N_MEMORY) { /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) continue; /* Use the distance array to find the distance */ val = node_distance(node, n); /* Penalize nodes under us ("prefer the next node") */ val += (n < node); /* Give preference to headless and unused nodes */ if (!cpumask_empty(cpumask_of_node(n))) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ val *= MAX_NUMNODES; val += node_load[n]; if (val < min_val) { min_val = val; best_node = n; } } if (best_node >= 0) node_set(best_node, *used_node_mask);

      Selects the best node based on a heuristic that takes into account node distance, CPU availability, and load. The code prefers unused nodes, penalizes closer nodes, and gives preference to less-loaded nodes. It then updates the used node mask to prevent reuse of the same node.

    4. /* * !costly requests are much more important than * __GFP_RETRY_MAYFAIL costly ones because they are de * facto nofail and invoke OOM killer to move on while * costly can fail and users are ready to cope with * that. 1/4 retries is rather arbitrary but we would * need much more detailed feedback from compaction to * make a better decision. */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; if (++(*compaction_retries) <= max_retries) { ret = true; goto out; }

      Adjusts the maximum number of retries for compaction based on allocation order. When handling high-order allocations, the retries are reduced to 1/4 to avoid overcommitting resources. This is a heuristic decision.

    5. if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) *no_progress_loops = 0; else (*no_progress_loops)++;

      Decision on whether to reset or increment no_progress_loops based on memory reclaim progress and allocation order. The progress is determined at runtime, with PAGE_ALLOC_COSTLY_ORDER serving as a threshold, representing a heuristic decision.

    6. /* * Do not steal pages from freelists belonging to other pageblocks * i.e. orders < pageblock_order. If there are no local zones free, * the zonelists will be reiterated without ALLOC_NOFRAGMENT. */ if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) min_order = pageblock_order;

      This prevents fragmentation by avoiding small allocations to not cause excessive fragmentation in memory. With ALLOC_NOGRAGMENT flag is set then the kernel will only allocate from larger pageblocks.

    7. /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) clear_highpage_kasan_tagged(page + i); kasan_enable_current();

      The entire functions clears memory pages but while avoiding the KASAN redzone. This redzones are used to detect memory corruption. After the clearing, the KASAN is re-enabled.

    8. /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. */ if (unlikely(order)) { bool compound = PageCompound(page); int i; VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); if (compound) page[1].flags &= ~PAGE_FLAGS_SECOND;

      This policy handles compound/big pages. Like what the comment mentions, ensuring that the order matches and that each tail page is handled accordingly. This algorithmic policy prevents possible issues when trying to free large memory blocks.

    9. if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); movable = migratetype == MIGRATE_MOVABLE; return NR_LOWORDER_PCP_LISTS + movable; }

      Manages memory pages efficiently in the buddy allocator by organizing them in page of their migratetype and order. It ensures that by organizing by types and sizes it can be easily found during allocation and deallocation.

    10. /* * Temporary debugging check for pages not lying within a given zone. */ static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; if (zone != page_zone(page)) return 1; return 0; }

      Policy that ensures that pages are assigned on the correct zones. This should be within the kernel's memory management system. Checks for potential corruption of mismanagement of pages. Enforces constraints.

    11. int ratio = sysctl_lowmem_reserve_ratio[i]; bool clear = !ratio || !zone_managed_pages(zone); unsigned long managed_pages = 0; for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; managed_pages += zone_managed_pages(upper_zone); if (clear) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; }

      This is a configuration policy since it revolves around the "sysctl_lowmem_reserve_ratio" which is a configurable setting that can be changed dynamically at runtime. The for loop applies the logic of the configuration but the parameter controls how much memory to reserve. The calculation of memory reserves based on this ratio are configuration policies.

    12. if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype);

      The decision policy checks if the free page count is updated to reflect the allocation of deallocation of memory blocks of a migratype (if the page is movable, unmovable, or reclaimable).

    13. static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find it */ free_page_is_bad_report(page); return true; }

      This policy checks the state of a page to verify if it is in the expected state. If in expected state returns false and ca be freed normally. The policy decides whether to report and handle a bad page based on its state.

    14. if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); }

      This watermark policy that determines when to ignore the watermark boost for certain types of memory allocations. Boosting is used to temporarily increase the free memory threshold, but this policy decides when the boost should be ignored.

    15. bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; int o; /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); if (unlikely(alloc_flags & ALLOC_RESERVES)) { /* * __GFP_HIGH allows access to 50% of the min reserve as well * as OOM. */ if (alloc_flags & ALLOC_MIN_RESERVE) { min -= min / 2; /* * Non-blocking allocations (e.g. GFP_ATOMIC) can * access more reserves than just __GFP_HIGH. Other * non-blocking allocations requests such as GFP_NOWAIT * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get * access to the min reserve. */ if (alloc_flags & ALLOC_NON_BLOCK) min -= min / 4; } /* * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; } /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. */ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ if (!order) return true; /* For a high-order request, check at least one suitable page is free */ for (o = order; o < NR_PAGE_ORDERS; o++) { struct free_area *area = &z->free_area[o]; int mt; if (!area->nr_free) continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!free_area_empty(area, mt)) return true; } #ifdef CONFIG_CMA if ((alloc_flags & ALLOC_CMA) && !free_area_empty(area, MIGRATE_CMA)) { return true; } #endif if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; } } return false; }

      Algorithmic policy for memory allocation. Checks if there are enough free pages for an allocation request based on a watermark-based allocation policy criteria. Is is tone by checking the number of free pages in the zone to a threshold "mark".

    1. if (prev_class) { if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { pool->size_class[i] = prev_class; continue; } }

      This is an algorithmic policy. A zs_pool maintains zs_pages of different size_class. However, some size_classes share exactly same characteristics, namely pages_per_zspage and objs_per_zspage. Recall the other annotation of mine, it searched free zspages by size_class: zspage = find_get_zspage(class);. Thus, grouping different classes improves memory utilization.

    2. zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, ZS_OBJS_INUSE, 1); goto out; }

      This is an algorithmic policy. Instead of immediately allocating new zspages for each memory request, the algorithm first attempts to find and reuse existing partially filled zspages from a given size class by invoking the find_get_zspage(class) function. It also updates the corresponding fullness groups.

    1. if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; }

      This is an algorithmic policy that determines whether to use a more efficient vl_alloc which does not involve complex virtual-to-physical mappings. Unlike the latter alloc_vmap_area, vb_alloc does not need to traverse the rb-tree of free vmap areas. It simply find a larger enough block from vmap_block_queue.

    1. /* * If the user wants hardware cache aligned objects then follow that * suggestion if the object is sufficiently large. * * The hardware cache alignment cannot override the specified * alignment though. If that is greater then use it. */ if (flags & SLAB_HWCACHE_ALIGN) { unsigned int ralign; ralign = cache_line_size(); while (size <= ralign / 2) ralign /= 2; align = max(align, ralign); }

      This SLAB_HWCACHE_ALIGN is defined by the user, making users decide whether hardware cache should align objects or not.

    1. size = count_history_pages(mapping, index, max);

      This uses the size of contigiously cached pages in cache to get a "conservative" estimation of the "length of a sequential read sequence" or the "thrashing threshold in memory tight systems". LDOS could use different policies to predict the optimal read ahead size.

    2. unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE

      This chunks the read-ahead into 2 megabyte units to avoid pinning too much memory at once. LDOS could replace this with a dynamically-sized chunk to better optimize for other use cases.

    1. static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) { int m_nid = memblock_get_region_node(m); /* we never skip regions when iterating memblock.reserved or physmem */ if (type != memblock_memory) return false; /* only memory regions are associated with nodes, check it */ if (nid != NUMA_NO_NODE && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && !(flags & MEMBLOCK_HOTPLUG)) return true; /* if we want mirror memory skip non-mirror memory regions */ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) return true; /* skip nomap memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; /* skip driver-managed memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; return false; }

      This policy determines whether a memblock region should be skipped, based on several checks that incorporate various flags. You can see this policy being used in other functions on lines 1080 and 1184 in this file; these other functions appear to be sub-functions for iterators on the memblock regions.

  2. Sep 2024
    1. if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; }

      It is a configuration policy. The policy here is to adjust the cost of current page. The cost means the overhead for kernel to operate the page if the page is swapped out. Kernel adopts a decay policy. Specifically, if current cost is greater than lrusize/4, its cost will be divided by 2. If kernel has no this policy, after a long-term running, the cost of each page is very high. Kernel might mistakenly reserve pages which are frequently visited long time ago but are inactive currently. It might cause performance degradation. The value (lrusize/4) here has a trade-off between performance and sensitivity. For example, if the value is too small, kernel will frequently adjust the priority of each page, resulting in process's performance degradation. If the value is too large, kernel might be misleaded by historical data, causing wrongly swapping currently popular pages and further performance degradation.

    2. if (megs < 16) page_cluster = 2; else page_cluster = 3;

      It is a configuration policy. The policy here is to determine the size of page cluster. "2" and "3" is determined externally to the kernel. Page cluster is the actual execution unit when swapping. If the machine is small-memory, kernel executes swap operation on 4 (2^2) pages for each time. Otherwise, kernel operats 8 (2^3) for each time. The rational here is to avoid much memory pressure for small-memory system and to improve performance for large-memory system.

    1. schedule_timeout_uninterruptible(1);

      It is a configuration policy. If kernel cannot allocate an enough virtual space with alignment, while nofail is specified to disallow failure, the kernel will let the process to sleep for 1 time slot. In this period, the process cannot be interrupted and quit the schedule queue of CPU. The "1" here is a configuration parameter, made externally to the kernel. It is not large enough for latency-sensitive process and is not small enough to retry frequently.

    2. if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node(array_size, nested_gfp, node); }

      It is an algorithmic policy and also a configuration policy. The algorithmic policy here is to maxmize the data locality of virtual memory address, by letting the space be in continuous virtual pages. If the demanded size of virtual memory is larger than one page, the kernel will allocate multiple continuous pages for it by using __vmalloc_node(). Otherwise, the kernel will call kmalloc_node() to place it in a single page. On the other hand, to allocate continuous pages, we should invoke __vmalloc_node() by declaring align = 1, which is a configuration policy.

    3. if (!counters) return; if (v->flags & VM_UNINITIALIZED) return;

      It is an algorithmic policy. The show_numa_info() is used for printing how the virtual memory of v (vm_struct) distributes its pages across numa nodes. The two if conditions are used for filtering the invalid v. The first if means the virtual memory has not been associated with physical page. The second if means the virtual memory has not been initialized yet.

    4. if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length);

      It is an algorithmic policy. The policy here is to avoid lock/unlock overhead by using copy_page_to_iter_nofault() function plus if-else branch. Because copy_page_to_iter_nofault() is based on no page fault, we need to check the physical page is still alive. So the kernel uses an if condition to guarantee copy_page_to_iter_nofault() without page fault. On the other hand, if the page is not valid, kernel chooses to fill the iter using zero value instead of returning an error. I think it is because it can make invocation more convenient. The caller does not need to do a verification of the return value and does not need further actions to handle it.

    5. if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow;

      It is an algorithmic policy. The two if conditions are used for checking the feasibility of the allocation. The first if means the adress of the allocation is beyond the allowable address, causing overflow. In this case, since the base address is decided from higher one to smaller one, reducing base address cannot work. Note that the base address is dependent on the va. The second if means we don't find a valid va which has appropriate base address. So the policy here is going to overflow branch to re-allocate va list.

    6. if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; }

      It is an algorithmic policy. It is used for checking whether the start address of current va is valid. The policy here is to change a new va, different from the solution of "end address check" (explained in my other annotation). The reason is that we find the base address from high address to low address. If we continue to reduce the base address, the if condition will be always wrong. Changing a new va with a lower start address is the only solution.

    7. if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; }

      It is an algorithmic policy. The outer loop is an endless loop until finding a vaild virtual memory space satisfying the requirements. The if condition here is used to guarantee the space in va is large enough. If not, we continue to scan the memory space from high address to low address while satisfying the alignment requirement. Specifically, tuning the base address to a lower address. And then, it will retry for verification.

    8. va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue;

      It is an algorithmic policy. The for loop is used for transferring the virtual memory managment by using from vm_struct (link list) to vm_area (tree?). Inside the for loop, if we cannot allocate a new vmap_area from vmap_area_cachep (va = NULL), the program will simply print a warning and skip current tmp, instread of retry or anything to guarantee the integrity.

    9. if (!spin_trylock(&vmap_area_lock)) return false;

      It is an algorithmic policy. The vmalloc_dump_obj() function is used for printing the information of specified virtual memory. Due to concurrency, before manipulating the critical state, we must obtain the lock. The policy here is spin_trylock. It only tries once. If fails, the function will simply returns false. I think the policy here is to avoid long-term waiting to improve the performance.

    1. static inline unsigned int calc_slab_order(unsigned int size, unsigned int min_order, unsigned int max_order, unsigned int fract_leftover) { unsigned int order; for (order = min_order; order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; } return order; }

      Code to choose how many pages to allocate for a new slab to minimize wasted space from the remainder

    1. if (folio) { pgoff_t start; rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit;

      Read ahead policy for marked folios

  3. Aug 2024
  4. Jul 2024
  5. Jun 2024
    1. /* * The larger the object size is, the more slabs we want on the partial * list to avoid pounding the page allocator excessively. */ s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);

      A policy decision about how often we may have to go to the page allocator.

    2. /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 static int calculate_sizes(struct kmem_cache *s) {

      computes a several values for the allocator based on the size and flags of the allocator being created.

    3. static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; unsigned int max_objects; unsigned int min_order; min_objects = slub_min_objects; if (!min_objects) {

      calculate the order (power of two number of pages) that each slab in this allocator should have.

    4. max_objects = max(order_objects(slub_max_order, size), 1U); min_objects = min(min_objects, max_objects); min_order = max_t(unsigned int, slub_min_order, get_order(min_objects * size)); if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1;

      Policy calculation

    5. ALLOC_FASTPATH, /* Allocation from cpu slab */ 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ ALLOC_SLAB, /* Cpu slab acquired from page allocator */ ALLOC_REFILL, /* Refill cpu slab from slab freelist */ ALLOC_NODE_MISMATCH, /* Switching cpu slab */ FREE_SLAB, /* Slab freed to the page allocator */ CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS

      Possible features

    1. int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ /* * The threshold scales with the number of processors and the amount * of memory per zone. More memory means that we can defer updates for * longer, more processors could lead to more contention. * fls() is used to have a cheap way of logarithmic scaling. * * Some sample thresholds: * * Threshold Processors (fls) Zonesize fls(mem)+1 * ------------------------------------------------------------------ * 8 1 1 0.9-1 GB 4 * 16 2 2 0.9-1 GB 4 * 20 2 2 1-2 GB 5 * 24 2 2 2-4 GB 6 * 28 2 2 4-8 GB 7 * 32 2 2 8-16 GB 8 * 4 2 2 <128M 1 * 30 4 3 2-4 GB 5 * 48 4 3 8-16 GB 8 * 32 8 4 1-2 GB 4 * 32 8 4 0.9-1GB 4 * 10 16 5 <128M 1 * 40 16 5 900M 4 * 70 64 7 2-4 GB 5 * 84 64 7 4-8 GB 6 * 108 512 9 4-8 GB 6 * 125 1024 10 8-16 GB 8 * 125 1024 10 16-32 GB 9 */ mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); /* * Maximum threshold is 125 */ threshold = min(125, threshold); return threshold; }

      a "magic" formula for computing the amount of memory per zone.

    1. khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

      bunch of "magic" numbers for hugepaged

  6. May 2024
    1. 40.45 KiB Blame Edit Open in Web IDE . Quickly and easily edit multiple files in your project. Edit single file Edit this file only. Lock Replace Delete 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 // SPDX-License-Identifier: GPL-2.0-only

      Debug only

    1. debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); debugfs_create_file("base_pfn", 0444, tmp, &cma->base_pfn, &cma_debugfs_fops); debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops); debugfs_create_file("order_per_bit", 0444, tmp, &cma->order_per_bit, &cma_debugfs_fops); debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops); debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops); cma->dfs_bitmap.array = (u32 *)cma->bitmap; cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));

      feature data

    1. bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies;

      Initial config settings

    2. static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, };

      Policy parameter data structure

    3. static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; }

      Policy function to set parameters

    4. static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; }

      Policy function to set parameters

    5. static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count;

      Policy function to set parametesr

    6. static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; }

      Policy function to set paraameters

    7. static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; }

      Policy function to set parameters

    8. static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; }

      Policy function to set parameters

    9. unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh;

      Possible features