#!/usr/bin/perl
# This script at it's basic usage replicates the information available from
# /proc/buddyinfo. Optionally, it can account for LRU pages as if they were
# free to give a better estimate of the current state of fragmentation in
# the system.
#
# Licensed under LGPL 2.1 as packaged with libhugetlbfs
# (c) Mel Gorman 2009

use FindBin qw($Bin);
use lib "$Bin";

use Fcntl;
use Pod::Usage;
use Getopt::Long;
use FA::Report;
use strict;
use constant KPF_BYTES         => 8;
use constant KPF_LRU           => (1<<5);
use constant KPF_BUDDY         => (1<<10);
use constant KPF_COMPOUND_HEAD => (1<<15);
use constant KPF_COMPOUND_TAIL => (1<<16);
use constant KPF_HUGE          => (1<<17);
use constant KPF_NOPAGE        => (1<<20);
use constant KPF_HIGH_RESERVED => (1);

my $p = "show-buddyinfo";
my ($opt_lru, $opt_huge, $opt_pagemap, $opt_proc, $opt_verbose);
my ($opt_man, $opt_help);
my $opt_maxorder = -1;
my %node_pfn_ranges;
my $little_endian = 0;
my $packmask = "C*";

GetOptions(
	'verbose|v'		=> \$opt_verbose,
	'use-pagemap|p'		=> \$opt_pagemap,
	'use-proc|b'		=> \$opt_proc,
	'lru-as-free|l'		=> \$opt_lru,
	'huge-as-free|s'	=> \$opt_huge,
	'max-order|m=n'         => \$opt_maxorder,
	'help|h'		=> \$opt_help,
	'man|m'			=> \$opt_man,
	);

# Print usage if requested
pod2usage(-exitstatus => 0, -verbose => 0) if $opt_help;
pod2usage(-exitstatus => 0, -verbose => 2) if $opt_man;

setVerbose if $opt_verbose;
$opt_pagemap = 1 if $opt_lru;
$opt_pagemap = 1 if $opt_huge;

if (unpack("h*", pack("s", 1)) =~ /01/) {
	$little_endian = 1;
}

# Simple case - just print out /proc/buddyinfo to show really free pages
if (!$opt_pagemap || $opt_proc) {
	#printVerbose("$p\::dumping /proc/buddyinfo\n");
	open(BUDDYINFO, "/proc/buddyinfo") || die("Failed to open /proc/buddyinfo");
	while (!eof BUDDYINFO) {
		print <BUDDYINFO>;
	}
	close BUDDYINFO;

	if (!$opt_pagemap) {
		exit(0);
	}
}

# Determine what maxorder is
if ($opt_maxorder == -1) {
	open(BUDDYINFO, "/proc/buddyinfo") || die("Failed to open /proc/buddyinfo");
	my $line = <BUDDYINFO>;
	my @info = split(/\s+/, $line);
	$opt_maxorder = $#info-4;
	close BUDDYINFO;
	#printVerbose("$p\::Setting default for max order: $opt_maxorder\n");
}

# First, establish what the range of page frames is
my $this_node;
my $this_zone = "";
my $this_start_pfn;
my $this_spanned_pfn;
my $KPAGEFLAGS;
open(ZONEINFO, "/proc/zoneinfo") || die("Failed to open /proc/zoneinfo");
while (!eof(ZONEINFO)) {
	my $line = <ZONEINFO>;

	# Check for the beginning of a new zone
	if ($line =~ /^Node ([0-9]+), zone\s+([a-zA-Z]+)/) {
		if ($this_zone ne "") {
			$node_pfn_ranges{$this_node}->{$this_zone} = $this_start_pfn . "-" . ($this_start_pfn + $this_spanned_pfn);
			#printVerbose("$p\::Node $this_node Zone $this_zone Start $this_start_pfn Spanned $this_spanned_pfn\n");
		}
		$this_node = $1;
		$this_zone = $2;
	}

	if ($line =~ /spanned\s+([0-9]+)/) {
		$this_spanned_pfn = $1;
	}

	if ($line =~ /start_pfn:\s+([0-9]+)/) {
		$this_start_pfn = $1;
	}
}

if ($this_zone ne "") {
	$node_pfn_ranges{$this_node}->{$this_zone} = $this_start_pfn . "-" . ($this_start_pfn + $this_spanned_pfn);
	#printVerbose("$p\::Node $this_node Zone $this_zone Start $this_start_pfn Spanned $this_spanned_pfn\n");
}
close(ZONEINFO);

my $pfn_flags;
my $low_flags;
my @bytes;
sub readPageFlags {

	# Read the page flags information for this PFN
	sysread(KPAGEFLAGS, $pfn_flags, 8) || die("read: $!");
	@bytes = unpack $packmask, $pfn_flags;
	if ($little_endian) {
		$low_flags = $bytes[7] | ($bytes[6] << 8) | ($bytes[5] << 16) | ($bytes[4] << 24);
	} else {
		$low_flags = $bytes[0] | ($bytes[1] << 8) | ($bytes[2] << 16) | ($bytes[3] << 24);
	}

	#my $high_flags = 0;
	#$high_flags = $bytes[4] |
	#		($bytes[5] << 8) |
	#		($bytes[6] << 16) |
	#		($bytes[7] << 24);
}

sysopen(KPAGEFLAGS, "/proc/kpageflags", O_RDONLY) || die("Failed to open /proc/kpageflags: $!");
foreach $this_node (keys %node_pfn_ranges) {
	foreach $this_zone (keys %{$node_pfn_ranges{$this_node}}) {
		my @free_blocks;
		my ($pfn, $page_idx);
		my ($start_pfn, $end_pfn) = split '-', $node_pfn_ranges{$this_node}->{$this_zone};
		my @memmap = ();
		$page_idx = 0;
		#printVerbose("$p: Processing node $this_node zone $this_zone\n");

		# Read each PFN in the zone
		sysseek(KPAGEFLAGS, KPF_BYTES * $start_pfn, 0) || die("seek: $!");
		for ($pfn = $start_pfn; $pfn < $end_pfn; $pfn++, $page_idx++) {

			sysread(KPAGEFLAGS, $pfn_flags, 8) || die("read: $!");
			@bytes = unpack $packmask, $pfn_flags;
			if ($little_endian) {
				$low_flags = $bytes[7] | ($bytes[6] << 8) | ($bytes[5] << 16) | ($bytes[4] << 24);
			} else {
				$low_flags = $bytes[0] | ($bytes[1] << 8) | ($bytes[2] << 16) | ($bytes[3] << 24);
			}
process_thispage:
			last if ($pfn == $end_pfn);

			# Sanity checks
			#if ($pfn - $start_pfn != $page_idx) {
			#	die("pfn and page_idx out of sync\n");
			#}

			# Treat LRU pages as free if requested
			if ($opt_lru && ($low_flags & KPF_LRU)) {
				#printVerbose("LRU page $pfn($page_idx)\n");
				$memmap[$page_idx] = 0;
				next;
			}

			# Treat huge pages as free if requested
			if ($opt_huge && ($low_flags & KPF_HUGE)) {
				my $contig_free = 0;
				my $orig_pfn = $pfn;
				do {
					$contig_free++;
					$pfn++;
					last if ($pfn == $end_pfn);
					readPageFlags();
					$memmap[$page_idx] = -1;
				} while ($low_flags & KPF_HUGE);

				my $order = log($contig_free)/log(2);
				$memmap[$page_idx] = $order;
				$page_idx += $contig_free;
				#printVerbose("Huge page $orig_pfn($page_idx) order $order\n");

				# We are on a new page, this avoids a seek
				goto process_thispage;
			}

			# Treat compound pages that are very large as huge
			if ($opt_huge && ($low_flags & KPF_COMPOUND_HEAD)) {
				my $contig_free = 0;
				my $orig_pfn = $pfn;
				do {
					$contig_free++;
					$pfn++;
					last if ($pfn == $end_pfn);
					readPageFlags();
					$memmap[$page_idx] = -1;
				} while ($low_flags & KPF_COMPOUND_TAIL);

				my $order = log($contig_free)/log(2);
				if ($order > 6) {
					$memmap[$page_idx] = $order;
					#printVerbose("Compound as huge $orig_pfn($page_idx) order $order\n");
				}
				$page_idx += $contig_free;

				# We are on a new page, this avoids a seek
				goto process_thispage;
			}

			# Treat buddy pages as free
			if ($low_flags & KPF_BUDDY) {
				#printVerbose("Buddy page $pfn($page_idx)\n");
				do {
					$memmap[$page_idx] = 0;

					# Move to next page
					$pfn++;
					$page_idx++;
					last if ($pfn == $end_pfn);
					readPageFlags();
				} while ($low_flags == 0);

				# We are on a new page, this avoids a seek
				goto process_thispage;
			}

			# Otherwise, treat page as not free
			$memmap[$page_idx] = -1;
		}

		# Buddy coalesce
		for ($page_idx = 0; $page_idx < $end_pfn - $start_pfn; $page_idx++) {
			if ($memmap[$page_idx] >= 0) {
				my $order = $memmap[$page_idx];
				my $this_idx = $page_idx;

				for ($order; $order < $opt_maxorder; $order++) {
					my $buddy_idx = $this_idx ^ (1 << $order);
					my $low_idx = $this_idx & ~(1 << $order);

					# If these are buddies, merge
					if ($memmap[$buddy_idx] == $order && $memmap[$this_idx] == $memmap[$buddy_idx]) {
						$memmap[$this_idx] = -1;
						$memmap[$buddy_idx] = -1;
						$memmap[$low_idx] = $order + 1;
					} else {
						$order = $opt_maxorder;
					}

					$this_idx = $low_idx;
				}
			}
		}

		# Count up free blocks in the system
		for ($page_idx = 0; $page_idx < $end_pfn - $start_pfn; $page_idx++) {
			my $order = $memmap[$page_idx];
			if ($order >= 0) {
				$free_blocks[$order]++;
			}
		}

		# Print out the buddyinfo
		my $order;
		printf("Node %d, zone %8s ", $this_node, $this_zone);
		for ($order = 0; $order <= $opt_maxorder; $order++) {
			printf("%6lu ", $free_blocks[$order]);
		}
		printf("\n");
	}
}
close KPAGEFLAGS;

# Below this line is help and manual page information __END__
=head1 NAME
show-buddyinfo - Show information about free pages in the system

=head1 SYNOPSIS

show-buddyinfo [options]

 Options:
    -b, --use-proc     Print information based on /proc/buddyinfo
    -p, --use-pagemap  Print information based on /proc/kpageflags
    -l, --lru-as-free  Treat LRU pages as if they were free (requires pagemap)
    -h, --huge-as-free Treat hugepage as if they were free (requires pagemap)
    -v, --verbose      Print debugging information
    --help             Print help messages
    --man              Print man page

=head1 OPTIONS

=over 8

=item B<--help>

Print a help message and exit

=item B<--use-proc, -b>

Print verbatim the information stored in /proc/buddyinfo

=item B<--use-pagemap, -p>

Analyse physical memory using /proc/kpageflags and figure out the number
of pages that are free based on the information in it. It'll be always
slightly out of sync with /proc/buddyinfo as that information is based on
counters stored about the free-list where as analysing /proc/kpageflags is
inherently racy and takes longer. The results should be comparable though.

=item B<--lru-as-free, -l>

Treat pages on the LRU lists as if they were free. This is principally of
use when working out the state of external fragmentation in the system
and if reclaiming all pages in the system would likely help high-order
allocations or not.

=item B<--huge-as-free, -h>

Treat huge pages as if they were free. This is principally of use when it
is known that the hugepages are being stored in reserve on private free
lists for use by hugepage aware applications.

=back

=head1 DESCRIPTION

This tool displays what pages are free at what orders throughout the
system. Optionally, it can treat LRU pages as if they were free. Note that in
the current implementation, LRU pages being treated as free are not properly
merged as buddies. This means that pages could be merged inappropriately
for a buddy allocator but it's considered acceptable considering how racy
the tool is and that the information is only approximate at best.

=head1 AUTHOR

Written by Mel Gorman (mel@csn.ul.ie)

=head1 REPORTING BUGS

Report bugs to the author

=cut
