#!perl

use strict;
use warnings;

use Getopt::Long qw(:config bundling no_ignore_case);
use POSIX qw(ceil);

our $AUTHORITY = 'cpan:PERLANCAR'; # AUTHORITY
our $DATE = '2022-05-20'; # DATE
our $DIST = 'App-freqtable'; # DIST
our $VERSION = '0.007'; # VERSION

my $tmp_freq;
my %Opts = (
    mode => 'line',
    ignore_case => 0,
    print_freq => 1,
    # XXX options to limit memory usage, e.g. max keys, max line length, --md5 (like in nauniq), ...
    min_freq => undef,
    max_freq => undef,
    sort_args => {},
    sort_sub => undef,
);

sub parse_cmdline {
    my $res = GetOptions(
        'bytes|c'   => sub { $Opts{mode} = 'byte' },
        'chars|m'   => sub { $Opts{mode} = 'char' },
        'words|w'   => sub { $Opts{mode} = 'word' },
        'lines|l'   => sub { $Opts{mode} = 'line' },
        'number|n'  => sub { $Opts{mode} = 'number' },
        'integer|i' => sub { $Opts{mode} = 'integer' },
        'ignore-case|f' => \$Opts{ignore_case},
        'no-print-freq|F' => sub { $Opts{print_freq} = 0 },
        'print-freq'      => sub { $Opts{print_freq} = 1 },
        'freq=s'    => \$tmp_freq,
        'sort-sub=s' => \$Opts{sort_sub},
        'sort-arg=s%' => $Opts{sort_args},
        'a'   => sub { $Opts{sort_sub} = 'asciibetically' },
        'help|h'  => sub {
            print <<USAGE;
Usage:
  freqtable [OPTIONS]... < INPUT
  freqtable --help (or -h)
Options:
  --bytes, -c
  --chars, -m
  --words, -w
  --lines, -l
  --number, -n
  --integer, -i
  --ignore-case, -f
  --no-print-freq, -F
  --freq N|M-N|M-|-N
  --sort-sub=SPEC
  --sort-arg=ARG=VAL
  -a
For more details, see the manpage/documentation.
USAGE
            exit 0;
        },
    );

    if (defined $tmp_freq) {
        if ($tmp_freq =~ /\A\d+\z/) {
            $Opts{min_freq} = $Opts{max_freq} = $tmp_freq;
        } elsif ($tmp_freq =~ /\A-(\d+)\z/) {
            $Opts{max_freq} = $1;
        } elsif ($tmp_freq =~ /\A(\d+)-\z/) {
            $Opts{min_freq} = $1;
        } elsif ($tmp_freq =~ /\A(\d+)-(\d+)\z/) {
            $Opts{min_freq} = $1;
            $Opts{max_freq} = $2;
        } else {
            warn "freqtable: Invalid value for --freq: '$tmp_freq', ".
                "please specify N|M-N|M-|-N\n";
            $res = 0;
        }
    }

    exit 99 if !$res;
}

sub run {
    my %occurences;
    my $numeric;

    if ($Opts{mode} eq 'byte' || $Opts{mode} eq 'char') {
        @ARGV = (\*STDIN) unless @ARGV;
        for my $fn (@ARGV) {
            my $fh;
            if (ref $fn) {
                $fh = $fn;
            } else {
                open $fh, "<", $fn or do {
                    warn "freqtable: Can't open '$fn': $!\n";
                    next;
                };
            }
            if ($Opts{mode} eq 'byte') {
                binmode $fh;
            } else {
                binmode $fh, ":encoding(utf8)";
            }
            while (1) {
                read $fh, my $block, 4096;
                last if !length $block;
                for (split //, $block) {
                    if ($Opts{ignore_case}) {
                        $occurences{lc $_}++;
                    } else {
                        $occurences{$_}++;
                    }
                }
            }
        } # fn
    } elsif ($Opts{mode} eq 'word') {
        while (defined(my $line = <>)) {
            chomp $line;
            while ($line =~ /(\w+)/g) {
                if ($Opts{ignore_case}) {
                    $occurences{lc $1}++;
                } else {
                    $occurences{$1}++;
                }
            }
        }
    } elsif ($Opts{mode} eq 'line') {
        while (defined(my $line = <>)) {
            chomp $line;
            if ($Opts{ignore_case}) {
                $occurences{lc $line}++;
            } else {
                $occurences{$line}++;
            }
        }
    } elsif ($Opts{mode} eq 'number' || $Opts{mode} eq 'integer') {
        $numeric++;
        while (defined(my $line = <>)) {
            my $num = $Opts{mode} eq 'integer' ? int($line) : $line + 0;
            $occurences{$num}++;
        }
    } else {
        die "freqtable: BUG: Unknown mode '$Opts{mode}'";
    }

    my $fmt;
    my @keys = keys %occurences;
    if (defined $Opts{sort_sub}) {
        require Sort::Sub;
        my $sorter = Sort::Sub::get_sorter($Opts{sort_sub}, $Opts{sort_args});
        @keys = sort $sorter @keys;
    } else {
        @keys = sort {
        $occurences{$b} <=> $occurences{$a} ||
            ($numeric ? $a <=> $b : $a cmp $b)
        } @keys;
    }

    for my $k (@keys) {
        my $n = $occurences{$k};
        next if defined $Opts{min_freq} && $n < $Opts{min_freq};
        next if defined $Opts{max_freq} && $n > $Opts{max_freq};
        if ($Opts{print_freq}) {
            unless (defined $fmt) {
                $fmt = "%" . ceil(log($n)/log(10)) . "s\t%s\n";
            }
            printf $fmt, $n, $k;
        } else {
            print $k, "\n";
        }
    }
}

# MAIN

parse_cmdline();
run();

1;
# ABSTRACT: Print frequency table of lines/words/characters/bytes/numbers
# PODNAME: freqtable

__END__

=pod

=encoding UTF-8

=head1 NAME

freqtable - Print frequency table of lines/words/characters/bytes/numbers

=head1 VERSION

This document describes version 0.007 of freqtable (from Perl distribution App-freqtable), released on 2022-05-20.

=head1 SYNOPSIS

 % freqtable [OPTIONS] < INPUT

Sample input:

 % cat input-lines.txt
 one
 one
 two
 three
 four
 five
 five
 five
 six
 seven
 eight
 eight
 nine

 % cat input-words.txt
 one one two three four five five five six seven eight eight nine

 % cat input-nums.txt
 9.99 cents
 9.99 dollars
 9 cents

=head2 Modes

Display frequency table (by default: lines):

 % freqtable input-lines.txt
 3       five
 2       eight
 2       one
 1       four
 1       nine
 1       seven
 1       six
 1       three
 1       two

Display frequency table (words):

 % freqtable -w input-words.txt
 3       five
 2       eight
 2       one
 1       four
 1       nine
 1       seven
 1       six
 1       three
 1       two

Display frequency table (characters):

 % freqtable -c input-words.txt
 12
 12      e
  7      i
  5      n
  4      f
  4      o
  4      t
  4      v
  3      h
  2      g
  2      r
  2      s
  1

  1      u
  1      w
  1      x

Display frequency table (nums):

 % freqtable -n input-nums.txt
 2      9.99
 1      9

Display frequency table (integers):

 % freqtable -i input-nums.txt
 3      9

=head2 -F option

Don't display the frequencies:

 % freqtable -F input-lines.txt
 five
 eight
 one
 four
 nine
 seven
 six
 three
 two

=head2 Filter by frequencies

Only display lines that appear three times:

 % freqtable -F input-lines.txt --freq 3
 3       five

Only display lines that appear more than once:

 % freqtable -F input-lines.txt --freq 2-
 3       five
 2       eight
 2       one

Only display lines that appear less than three times:

 % freqtable -F input-lines.txt --freq -2
 2       eight
 2       one
 1       four
 1       nine
 1       seven
 1       six
 1       three
 1       two

=head2 Sorting

Instead of the default sorting by frequency (descending order), if you specify
C<--sort-sub> (and optionally one or more C<--sort-arg>) you can sort by the
keys using one of L<Sort::Sub>::* subroutines. Examples:

 # sort by keys, asciibetically
 % freqtable -F input-lines.txt --sort-sub asciibetically
 2       eight
 3       five
 1       four
 1       nine
 2       one
 1       seven
 1       six
 1       three
 1       two

 # sort by keys, asciibetically (descending order)
 % freqtable -F input-lines.txt --sort-sub 'asciibetically<r>'
 1       two
 1       three
 1       six
 1       seven
 2       one
 1       nine
 1       four
 3       five
 2       eight

 # sort by keys, randomly using perl code (essentially, shuffling)
 % freqtable -F input-lines.txt --sort-sub 'by_perl_code' --sort-arg 'code=int(rand()*3)-1'
 3       five
 1       three
 2       eight
 1       seven
 2       one
 1       six
 1       nine
 1       two
 1       four

=head1 DESCRIPTION

This utility counts the occurences of lines (or words/characters) in the input
then display each unique lines along with their number of occurrences. You can
also instruct it to only show lines that have a specified number of occurrences.

You can use the following Unix command to count occurences of lines:

 % sort input-lines.txt | uniq -c | sort -nr

and with a bit more work you can also use a combination of existing Unix
commands to count occurrences of words/characters, as well as filter items that
have a specified number of occurrences; freqtable basically offers convenience.

=head1 EXIT CODES

0 on success.

255 on I/O error.

99 on command-line options error.

=head1 OPTIONS

=over

=item * --bytes, -c

=item * --chars, -m

=item * --words, -w

=item * --lines, -l

=item * --number, -n

Treat each line as a number. A line like this:

 9.99 cents

will be regarded as:

 9.99

=item * --integer, -i

Treat each line as an integer. A line like this:

 9.99 cents

will be regarded as:

 9

=item * --ignore-case, -f

=item * --no-print-freq, -F

Will not print the frequencies.

=item * --freq=s

Filter by frequencies. C<N> (e.g. --freq 5) means only display items that occur
N times. C<M-N> (e.g. --freq 5-10) means only display items that occur between M
and N times. C<M-> (e.g. --freq 5-) means only display items that occur at least
M times. C<-N> (e.g. --freq -10) means only display items that occur at most N
times.

=item * --sort-sub=s

This will cause C<freqtable> to sort by key name instead of by frequencies. You
pass this option to specify a L<Sort::Sub> routine, which is the name of a
C<Sort::Sub::*> module without the C<Sort::Sub::> prefix, e.g.
C<asciibetically>. The name can optionally be followed by C<< <i> >>, or C<< <r>
>>, or C<< <ir> >> to mean case-insensitive sorting, reverse order, and reverse
order case-insensitive sorting, respectively. When you use one of these suffixes
on the command-line, remember to quote since C<< < >> and C<< > >> can be
intereprted by shell.

Examples:

 asciibetically
 asciibetically<i>
 by_length<r>

=item * --sort-arg=ARGNAME=ARGVALUE

Pass argument(s) to the sort subroutine. Can be specified multiple times, once
for every argument.

=item * -a

Shortcut for C<--sort=asciibetically>.

=back

=head1 FAQ

=head1 HOMEPAGE

Please visit the project's homepage at L<https://metacpan.org/release/App-freqtable>.

=head1 SOURCE

Source repository is at L<https://github.com/perlancar/perl-App-freqtable>.

=head1 SEE ALSO

Unix commands B<wc>, B<sort>, B<uniq>

L<wordstat> from L<App::wordstat>

L<csv-freqtable> from L<App::CSVUtils>

=head1 AUTHOR

perlancar <perlancar@cpan.org>

=head1 CONTRIBUTING


To contribute, you can send patches by email/via RT, or send pull requests on
GitHub.

Most of the time, you don't need to build the distribution yourself. You can
simply modify the code, then test via:

 % prove -l

If you want to build the distribution (e.g. to try to install it locally on your
system), you can install L<Dist::Zilla>,
L<Dist::Zilla::PluginBundle::Author::PERLANCAR>, and sometimes one or two other
Dist::Zilla plugin and/or Pod::Weaver::Plugin. Any additional steps required
beyond that are considered a bug and can be reported to me.

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2022, 2018 by perlancar <perlancar@cpan.org>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=head1 BUGS

Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-freqtable>

When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.

=cut
