#!/usr/bin/env bpftrace
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Red Hat, Inc.
 *
 * @nsec_alloc_pages_latency is the page allocation latency for the
 * system in nanoseconds (except for khugepaged where it's measured
 * separately). It's indexed by the page size in bytes. The base page
 * size is 4096 bytes. The THP page size is 2097152. Other
 * intermediate page sizes used for the kernel stack if VMAP_STACK=n or by
 * drivers can show up too. It's interesting to compare the latency
 * between the base page size and the THP page size. Unless the app
 * uses MADV_HUGEPAGE, if enabled=always and defrag=madvise, the
 * @nsec_alloc_pages_latency of THP and base size pages should be
 * comparable.
 *
 * @nsec_khugepaged_alloc_latency is the same as
 * @nsec_alloc_pages_latency, but it shows the page allocation latency
 * only for khugepaged. The high latency caused by defragmenting
 * memory will only be paid by khugepaged in the background. This
 * latency does not interrupt the runtime of any app.
 *
 * @nsec_khugepaged_write_mmap_sem is the time in nanoseconds khugepaged
 * will hold the mmap_sem (for writing) of an app while it collapses
 * the hugepage. During this time syscalls like mmap and page faults
 * may hang in the context of the app. So this time could result in
 * app latencies and it should remain low. This cannot be measured
 * accurately without more tracepoints. In the current version the
 * measurement is a worst case because it also includes the time
 * khugepaged has to wait to obtain the mmap_sem.
 *
 * @none_or_zero is the number of empty or zero mappings the page had
 * during the collapse of a THP. This number varies from 0 to 511. The
 * more hits there are near 511, the more memory utilization increases
 * because of background khugepaged activity.
 * "echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none"
 * will enforce this number to remain zero and it will avoid the increased
 * memory usage with THP enabled=always.
* * @unmapped has the same semantics of @none_or_zero, but it applies * to /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap and * controls how many pages have been swapped in, that weren't in RAM * in order to collapse the hugepage with khugepaged. The higher the * number the higher the RAM utilization. */ kprobe:__alloc_pages_nodemask { $level = @alloc_pages_nodemask[tid]; if (!@time[tid,arg1]) { @args[tid,$level] = arg1 + 1; @time[tid,arg1] = nsecs; } @alloc_pages_nodemask[tid] += 1; } kretprobe:__alloc_pages_nodemask /@alloc_pages_nodemask[tid]/ { @alloc_pages_nodemask[tid] -= 1; $level = @alloc_pages_nodemask[tid]; @alloc_pages_nodemask[tid] = $level; if (@args[tid,$level]) { $order = @args[tid,$level] - 1; if (comm == "khugepaged") { @nsec_khugepaged_alloc_latency[4096<<$order] = hist(nsecs - @time[tid,$order]); } else { @nsec_alloc_pages_latency[comm,4096<<$order] = hist(nsecs - @time[tid,$order]); } delete(@time[tid,$order]); delete(@args[tid,$level]); if ($level == 0) { delete(@alloc_pages_nodemask[tid]); } } } /* * mmap_sem for writing is taken after mm_collapse_huge_page_swapin * and is released before tracepoint:huge_memory:mm_collapse_huge_page. */ tracepoint:huge_memory:mm_collapse_huge_page_swapin { @time_khugepaged[tid] = nsecs; } tracepoint:huge_memory:mm_collapse_huge_page /@time_khugepaged[tid]/ { @nsec_khugepaged_write_mmap_sem = hist(nsecs - @time_khugepaged[tid]); } tracepoint:huge_memory:mm_khugepaged_scan_pmd { if (args->status == 1) { @none_or_zero = hist(args->none_or_zero); @unmapped = hist(args->unmapped); } } END { clear(@time); clear(@args); clear(@alloc_pages_nodemask); clear(@time_khugepaged); }