#!/usr/bin/env bpftrace
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Red Hat, Inc.
 *
 * @nsec_alloc_pages_latency is the page allocation latency for the
 * system in nanoseconds (except for khugepaged where it's measured
 * separately). It's indexed by the page size in bytes. The base page
 * size is 4096 bytes. The THP page size is 2097152. Other
 * intermediate page sizes used for the kernel stack if VMAP_STACK=n or by
 * drivers can show up too. It's interesting to compare the latency
 * between the base page size and the THP page size. Unless the app
 * uses MADV_HUGEPAGE, if enabled=always and defrag=madvise, the
 * @nsec_alloc_pages_latency of THP and base size pages should be
 * comparable.
 *
 * @nsec_khugepaged_alloc_latency is the same as
 * @nsec_alloc_pages_latency, but it shows the page allocation latency
 * only for khugepaged. The high latency caused by defragmenting
 * memory will only be paid by khugepaged in the background. This
 * latency does not interrupt the runtime of any app.
 *
 * @nsec_khugepaged_write_mmap_sem is the time in nanoseconds khugepaged
 * will hold the mmap_sem (for writing) of an app while it collapses
 * the hugepage. During this time syscalls like mmap and page faults
 * may hang in the context of the app. So this time could result in
 * app latencies and it should remain low. This cannot be measured
 * accurately without more tracepoints. In the current version the
 * measurement is a worst case because it also includes the time
 * khugepaged has to wait to obtain the mmap_sem.
 *
 * @none_or_zero is the number of empty or zero mappings the page had
 * during the collapse of a THP. This number varies from 0 to 511. The
 * more hits there are near 511, the more memory utilization increases
 * because of background khugepaged activity.
 * "echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none"
 * will enforce this number to remain zero and it will avoid the increased
 * memory usage with THP enabled=always.
* * @unmapped has the same semantics of @none_or_zero, but it applies * to /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap and * controls how many pages have been swapped in, that weren't in RAM * in order to collapse the hugepage with khugepaged. The higher the * number the higher the RAM utilization. */ kprobe:__alloc_pages_nodemask { $level = @alloc_pages_nodemask[tid]; if (!@time[tid,arg1]) { @args[tid,$level] = arg1 + 1; @time[tid,arg1] = nsecs; } @alloc_pages_nodemask[tid] += 1; } kretprobe:__alloc_pages_nodemask /@alloc_pages_nodemask[tid]/ { @alloc_pages_nodemask[tid] -= 1; $level = @alloc_pages_nodemask[tid]; @alloc_pages_nodemask[tid] = $level; if (@args[tid,$level]) { $order = @args[tid,$level] - 1; if (comm == "khugepaged") { @nsec_khugepaged_alloc_latency[4096<<$order] = hist(nsecs - @time[tid,$order]); } else { @nsec_alloc_pages_latency[comm,4096<<$order] = hist(nsecs - @time[tid,$order]); } delete(@time[tid,$order]); delete(@args[tid,$level]); if ($level == 0) { delete(@alloc_pages_nodemask[tid]); } } } /* * mmap_sem for writing is taken after mm_collapse_huge_page_swapin * and is released before tracepoint:huge_memory:mm_collapse_huge_page. */ tracepoint:huge_memory:mm_collapse_huge_page_swapin { @time_khugepaged[tid] = nsecs; } tracepoint:huge_memory:mm_collapse_huge_page /@time_khugepaged[tid]/ { @nsec_khugepaged_write_mmap_sem = hist(nsecs - @time_khugepaged[tid]); } tracepoint:huge_memory:mm_khugepaged_scan_pmd { if (args->status == 1) { @none_or_zero = hist(args->none_or_zero); @unmapped = hist(args->unmapped); } } END { clear(@time); clear(@args); clear(@alloc_pages_nodemask); clear(@time_khugepaged); }