#!/usr/bin/perl
# script: make_synth_collection.pl
# functionality: Makes a synthetic document set
#

use strict;
use warnings;

use File::Spec;
use Getopt::Long;

use Clair::Config;
use Clair::Utils::CorpusDownload;
use Clair::SyntheticCollection;
use Clair::RandomDistribution::Gaussian;
use Clair::RandomDistribution::LogNormal;
use Clair::RandomDistribution::Poisson;
use Clair::RandomDistribution::RandomDistributionFromWeights;
use Clair::RandomDistribution::Zipfian;

sub usage;

# ---------------------------------------------------------------------------
# Command line settings.  An empty string / zero means "not supplied".
# ---------------------------------------------------------------------------
my $corpus_name = "";
my $output_name = "";
my $output_dir = "";
my $base_dir = "produced";
my $term_policy = "";
my $doclen_policy = "";
my $num_docs = 0;
my $verbose = 0;

# Distribution parameters for the term distribution
my $term_alpha = 0.0;
my $term_mean = 0.0;
my $term_variance = 0.0;
my $term_std_dev = 0.0;
my $term_lambda = 0.0;

# Distribution parameters for the document length distribution
my $doclen_alpha = 0.0;
my $doclen_mean = 0.0;
my $doclen_variance = 0.0;
my $doclen_std_dev = 0.0;
my $doclen_lambda = 0.0;

# -k: size of a generated vocabulary, -n: length of every document when the
# document length policy is "constant"
my $k = 0;
my $n = 0;

# "d" and "t" are declared as explicit aliases: on their own they are
# ambiguous abbreviations (of directory/doclen-* and of term-*), which
# Getopt::Long rejects.  An exact alias match takes precedence over
# abbreviation matching, so -d and -t select --directory and --term-policy.
my $res = GetOptions("corpus=s" => \$corpus_name,
                     "base=s" => \$base_dir,
                     "size=i" => \$num_docs,
                     "term-policy|t=s" => \$term_policy,
                     "doclen-policy=s" => \$doclen_policy,
                     "output=s" => \$output_name,
                     "directory|d=s" => \$output_dir,
                     "verbose!" => \$verbose,
                     "term-alpha:f" => \$term_alpha,
                     "term-mean:f" => \$term_mean,
                     "term-variance:f" => \$term_variance,
                     "term-std_dev:f" => \$term_std_dev,
                     "term-lambda:f" => \$term_lambda,
                     "doclen-alpha:f" => \$doclen_alpha,
                     "doclen-mean:f" => \$doclen_mean,
                     "doclen-variance:f" => \$doclen_variance,
                     "doclen-std_dev:f" => \$doclen_std_dev,
                     "doclen-lambda:f" => \$doclen_lambda,
                     "k:i" => \$k,
                     "n:i" => \$n);

# The rest of the script tests for the policy name "mirror"; accept the
# longer spelling "mirrored" as a synonym so that either form selects it.
if ($doclen_policy eq "mirrored") {
  $doclen_policy = "mirror";
}

# Policies that are backed by a Clair::RandomDistribution class
my %is_random_policy = map { $_ => 1 }
  qw(gaussian lognormal poisson randomdistributionfromweights zipfian);

# Collect every problem so the user sees all of them at once
my @problems = ();

if (!$res) {
  push @problems, "could not parse the command line";
}
if ($num_docs <= 0) {
  push @problems, "--size must be a positive number of documents";
}
if ($output_name eq "") {
  push @problems, "--output is required";
}
if ($output_dir eq "") {
  push @problems, "--directory is required";
}

if ($term_policy eq "") {
  push @problems, "--term-policy is required";
} elsif (!$is_random_policy{$term_policy}) {
  push @problems, "unknown term policy '$term_policy'";
}

if ($doclen_policy eq "") {
  push @problems, "--doclen-policy is required";
} elsif (!$is_random_policy{$doclen_policy} and
         ($doclen_policy ne "constant") and ($doclen_policy ne "mirror")) {
  push @problems, "unknown document length policy '$doclen_policy'";
}

if ($doclen_policy eq "constant") {
  # No corpus is read for the constant policy, so the vocabulary must be
  # generated (-k) and the fixed document length (-n) must be given.
  if (($k <= 0) or ($n <= 0)) {
    push @problems, "--doclen-policy constant requires positive -k and -n";
  }
} elsif ($corpus_name eq "") {
  push @problems, "--corpus is required unless --doclen-policy is constant";
}

if (@problems) {
  warn "$0: $_\n" foreach @problems;
  usage();
  exit;
}

# Cwd is a core module; loaded here because it is only needed to save and
# restore the working directory around the corpus queries below.
use Cwd ();

my $gen_dir = "$base_dir";

# NOTE(review): not referenced anywhere else in the visible script; kept in
# case code outside this view relies on it.
my $corpus_data_dir = "$gen_dir/corpus-data/$corpus_name";

# %doclen and %tc are filled from the source corpus, except for the
# "constant" policy where no corpus is read at all.
my %doclen = ();
my %tc = ();
if ($doclen_policy ne "constant") {
  my $corpus = Clair::Utils::CorpusDownload->new(corpusname => "$corpus_name",
                                                 rootdir => "$gen_dir");

  # Remember where we started so the working directory can be restored
  # after querying the corpus (presumably the corpus methods may chdir;
  # the original code restored the directory unconditionally as well).
  my $start_dir = Cwd::getcwd();
  defined $start_dir
    or die "$0: cannot determine the current directory: $!\n";

  # Get the document length distribution
  %doclen = $corpus->get_doc_len_dist();
  # Get term counts
  %tc = $corpus->get_term_counts();

  # Later relative paths (e.g. the output directory) depend on being back
  # in the starting directory, so fail loudly rather than continue elsewhere.
  chdir $start_dir
    or die "$0: cannot return to directory '$start_dir': $!\n";
} else {
  # Constant document length: every document gets the length given by -n
  foreach my $doc_index (0 .. $num_docs - 1) {
    $doclen{$doc_index} = $n;
  }
}

# Parallel arrays handed to the distributions / the collection later on:
#   @doclen_weights[i] is the weight of @lengths[i]   (non-mirror policies)
#   @term_weights[i]   is the weight of @terms[i]
my @doclen_weights = ();
my @lengths = ();
my @term_weights = ();
my @terms = ();

my $num_terms;
if ($k) {
  # Generate a synthetic vocabulary: each index is spelled with letters by
  # mapping its decimal digits 0..9 to a..j (e.g. 0 -> "a", 12 -> "bc").
  #
  # NOTE(review): this produces $k + 1 terms (indices 0 .. $k) while
  # $num_terms is set to $k.  Whether the extra entry matters depends on
  # how the distribution classes index term_map -- confirm before changing.
  foreach my $index (0 .. $k) {
    (my $term = "$index") =~ tr/0-9/a-j/;
    push @terms, $term;
  }
  $num_terms = $k;
} else {
  $num_terms = scalar(keys %tc);
}

# Get document length weights
if ($doclen_policy ne "constant") {
  # The values of %doclen are counts, so they must be compared numerically;
  # a string comparison would put e.g. 10 before 9.
  foreach my $length (sort { $doclen{$a} <=> $doclen{$b} } keys %doclen) {
    push @doclen_weights, $doclen{$length};
    if ($doclen_policy eq "mirror") {
      # When mirroring the document length distribution, list each length
      # once per occurrence count ($doclen{$length} times).
      push @lengths, ($length) x $doclen{$length};
    } else {
      push @lengths, $length;
    }
  }
}

# Get term weights (only when the vocabulary comes from the corpus)
if (!$k) {
  # Most frequent term first; counts are compared numerically.
  foreach my $term (sort { $tc{$b} <=> $tc{$a} } keys %tc) {
    push @term_weights, $tc{$term};
    push @terms, $term;
  }
}

#
# Construct the random distribution object named by a policy.
#
# Arguments: the policy name followed by named parameters; only the ones the
# selected policy needs are read:
#   randomdistributionfromweights : weights (array reference)
#   gaussian                      : mean, variance, dist_size
#   lognormal                     : mean, std_dev, dist_size
#   poisson                       : lambda, dist_size
#   zipfian                       : alpha, dist_size
#
# Returns the distribution object, or undef for any other policy name
# (e.g. "constant" and "mirror", which are not built here).
#
sub build_distribution
{
  my ($policy, %param) = @_;

  if ($policy eq "randomdistributionfromweights") {
    return Clair::RandomDistribution::RandomDistributionFromWeights->new(
      weights => $param{weights});
  }
  if ($policy eq "gaussian") {
    return Clair::RandomDistribution::Gaussian->new(
      mean => $param{mean},
      variance => $param{variance},
      dist_size => $param{dist_size});
  }
  if ($policy eq "lognormal") {
    return Clair::RandomDistribution::LogNormal->new(
      mean => $param{mean},
      std_dev => $param{std_dev},
      dist_size => $param{dist_size});
  }
  if ($policy eq "poisson") {
    return Clair::RandomDistribution::Poisson->new(
      lambda => $param{lambda},
      dist_size => $param{dist_size});
  }
  if ($policy eq "zipfian") {
    return Clair::RandomDistribution::Zipfian->new(
      alpha => $param{alpha},
      dist_size => $param{dist_size});
  }

  return;
}

if ($verbose) { print "Reading in term distribution...\n"; }
if ($verbose) { print "Reading in document length distribution...\n"; }

# These are deliberately not named $a and $b: lexical variables with those
# names hide the package variables that sort() uses for its comparisons.
my $term_dist = build_distribution($term_policy,
                                   weights => \@term_weights,
                                   alpha => $term_alpha,
                                   mean => $term_mean,
                                   variance => $term_variance,
                                   std_dev => $term_std_dev,
                                   lambda => $term_lambda,
                                   dist_size => $num_terms);

my $doclen_dist = build_distribution($doclen_policy,
                                     weights => \@doclen_weights,
                                     alpha => $doclen_alpha,
                                     mean => $doclen_mean,
                                     variance => $doclen_variance,
                                     std_dev => $doclen_std_dev,
                                     lambda => $doclen_lambda,
                                     dist_size => $num_docs);

if ($verbose) { print "Creating collection\n"; }

# Arguments shared by every kind of collection
my @collection_args = (name => $output_name,
                       base => $output_dir,
                       mode => "create_new",
                       term_map => \@terms,
                       term_dist => $term_dist);

if ($doclen_policy eq "constant") {
  # All documents have the same length
  push @collection_args, doc_length => $n;
} else {
  # Document lengths are drawn from the length table
  push @collection_args, doclen_dist => $doclen_dist,
                         doclen_map => \@lengths;
  if ($doclen_policy eq "mirror") {
    # Mirror the lengths of the existing corpus
    push @collection_args, mirror_doclen => 1;
  }
}
push @collection_args, size => $num_docs;

my $col = Clair::SyntheticCollection->new(@collection_args);

if ($verbose) { print "Generating documents\n"; }
$col->create_documents();
#
# Print out usage message
#
# Writes the help text to standard output and then terminates the script
# (this sub does not return to its caller).
#
sub usage
{
  # Only option spellings that Getopt::Long resolves unambiguously are shown:
  # a bare -d or -t would be an ambiguous prefix of the doclen-* / term-*
  # options, so --directory and --term-policy are listed in long form.
  print <<"END_USAGE";
$0
Generate a synthetic corpus

usage: $0 --term-policy policy --doclen-policy policy -o name
       --directory dir -s size [-c corpus_name] [-b base_dir] [options]

  --output,-o name
       Name of the generated corpus
  --directory output_directory
       Directory to output generated corpus in
  --corpus,-c corpus_name
       Name of the source corpus (not needed for --doclen-policy constant)
  --base,-b base_dir
       Base directory filename.  The corpus is loaded from here
  --term-policy policy
       Term distribution: {gaussian, lognormal, poisson, randomdistributionfromweights, zipfian}
  --doclen-policy policy
       Document length distribution: {gaussian, lognormal, poisson, randomdistributionfromweights, zipfian, constant, mirror}
  --size,-s number_of_documents
       Number of documents to generate
  --verbose,-v
       Increase debugging verbosity

 The following arguments are required by the specified policies.
 Distribution parameters are given per distribution by prefixing them with
 --term- or --doclen- (for example --term-alpha or --doclen-mean):
Option and value    Policy               Argument Type
alpha               zipfian              positive float
mean                gaussian,lognormal   positive float
variance            gaussian             positive float
std_dev             lognormal            positive float
lambda              poisson              positive float
k                   constant             integer
   vocabulary size
n                   constant             integer
   number of terms in each document

example: $0 --term-policy zipfian --term-alpha 1 --doclen-policy randomdistributionfromweights -o synthy --directory synth_out -c lexrank-sample -b produced -s 10 --verbose
END_USAGE

  exit;
}
