#!/usr/bin/perl

=head1 NAME

gate2format.pl - converts gate xml format to opennlp or plain text 

=head1 SYNOPSIS
 
This program takes as input a directory of Gate XML NER files and converts 
them to opennlp or plain text format for each entity type annotated in the 
Gate XML files.

=head1 USAGE

Usage: gate2format.pl OUTPUT_DIR INPUT_DIR [OPTIONS]

=head1 Optional Arguments:

The following options may be given in addition to the required arguments.

=head2 --format [plain|opennlp]

The desired conversion format. Default is: opennlp

=head2 --debug

Prints debug information to command line. 

=head2 --version

Displays the version information.


=head2 --help

Displays the quick summary of program options.

=head1 OUTPUT

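One file per annotation type is written to OUTPUT_DIR. Each file is named 
after the normalized, lower-cased type and holds one converted sentence per 
line.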

=head1 SYSTEM REQUIREMENTS

=over

=item * Perl (version 5.8.5 or better) - http://www.perl.org

=back

=head1 CONTACT US

    If you have trouble installing and executing this program
    please contact us at
    
    btmcinnes at vcu dot edu.

=head1 Author

 Bridget T. McInnes, Virginia Commonwealth University

=head1 COPYRIGHT

Copyright (c) 2017

 Bridget T. McInnes, Virginia Commonwealth University 
 bthomson at vcu dot edu
                     
This program is free software; you can redistribute it and/or modify it 
under the terms of the GNU General Public License as published by the Free 
Software Foundation; either version 2 of the License, or (at your option) 
any later version.

This program is distributed in the hope that it will be useful, but WITHOUT 
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with 
this program; if not, write to:

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,               
 Boston, MA  02111-1307, USA.          
  
=cut

##########################################################################

                        #   CODE STARTS HERE

##########################################################################
#  reference the getOption cpan page

use Getopt::Long;

# ---------------------------------------------------------------------------
# Command-line handling and input discovery.
#
# Produces the file-level variables used by the rest of the script:
#   $output  - directory the converted files are written to
#   $dir     - directory holding the Gate XML input files
#   @files   - plain files found in $dir (sorted by name)
#   $format  - "opennlp" (default) or "plain"
# ---------------------------------------------------------------------------

# No destinations are passed to GetOptions, so Getopt::Long stores each option
# in the package variable $opt_<name>. Declaring them makes that visible.
our ( $opt_version, $opt_help, $opt_format, $opt_debug );

# GetOptions prints its own diagnostics and returns false on a bad option.
GetOptions( "version", "help", "format=s", "debug" )
    or die("Please check the above mentioned option(s).\n");

#   if help is requested, print out help
if ( defined $opt_help ) {
    &showHelp();
    exit;
}

# if version is requested, show version
if ( defined $opt_version ) {
    &showVersion();
    exit;
}

# Both positional arguments are required.
if ( @ARGV < 2 ) {
    print STDERR "OUTPUT_DIR and INPUT_DIR must be specified on the command line.\n";
    &askHelp();
    exit 1;
}

my $output = shift;
my $dir    = shift;

if ( -e $output ) {

    # Files are later written as "$output/<type>", so a non-directory here
    # could never work; report it now instead of after all the processing.
    if ( !-d $output ) {
        print STDERR "Output path ($output) exists but is not a directory\n";
        &askHelp();
        exit 1;
    }

    print STDERR "Output directory ($output) already exists. Do you want to continue (Y|N)?\n";
    my $answer = <STDIN>;

    # Only an answer that starts with "y"/"Y" continues. The answer is undef
    # when STDIN is closed, which is treated as "no".
    if ( !defined $answer || $answer !~ /^\s*y/i ) { exit; }
}

if ( !( -e $dir ) ) {
    print STDERR "Directory ($dir) does not exist\n";
    &askHelp();
    exit 1;
}

opendir( my $dir_handle, $dir ) or die "Could not open dir ($dir): $!\n";

# Keep plain files only and sort them so the order of the instances in the
# output files does not depend on the order readdir happens to return.
my @files = sort grep { $_ ne '.' and $_ ne '..' and -f "$dir/$_" } readdir $dir_handle;
closedir $dir_handle;

my $format = "opennlp";
if ( defined $opt_format ) {
    $format = $opt_format;
}

# Anchored so that e.g. "plainx" is rejected rather than silently accepted.
if ( $format !~ /^(?:opennlp|plain)$/ ) {
    print STDERR "Format option ($format) is not available\n";
    &askHelp();
    exit 1;
}

# Create the output directory once all arguments have been validated, so that
# writing the per-type files cannot fail merely because it is missing.
if ( !-d $output ) {
    mkdir($output) or die "Could not create output directory ($output): $!\n";
}
# Converted sentences gathered from every input file, keyed by normalized
# annotation type; each value is an array ref of sentence strings.
my %instances = ();

# For every Gate XML file:
#   1. read it line by line, collecting the <TextWithNodes> content, every
#      Sentence annotation and every annotation of the "Key" annotation set;
#   2. for each Key annotation, cut the enclosing sentence out of the text,
#      mark the annotated span in the requested format and clean the text up;
#   3. store the result in %instances under the annotation type.
foreach my $GateFile (@files) {

    if ( defined $opt_debug ) { print STDERR "FILE : $GateFile\n"; }

    # Set while the reader is between <TextWithNodes> and </TextWithNodes>,
    # respectively between the Key <AnnotationSet> tag and </AnnotationSet>.
    my $TextWithNodesFlag    = 0;
    my $AnnotationSetKeyFlag = 0;

    my $text = "";

    my %TypeHash      = ();    # annotation id -> normalized type
    my %StartNodeHash = ();    # annotation id -> start node id
    my %EndNodeHash   = ();    # annotation id -> end node id
    my %SentenceNodes = ();    # sentence start node -> end node -> annotation id

    open( my $gate_fh, '<', "$dir/$GateFile" )
        or die "Could not open file ($GateFile): $!\n";

    while ( my $line = <$gate_fh> ) {
        chomp $line;

        if ( $line =~ /<TextWithNodes>/ ) {
            $TextWithNodesFlag = 1;
        }
        if ( $line =~ /<AnnotationSet Name=\"[Kk]ey?\">/ ) {
            $AnnotationSetKeyFlag = 1;
        }

        # Lines are joined with a blank; runs of whitespace are collapsed below.
        if ( $TextWithNodesFlag == 1 ) {
            $text .= "$line ";
        }

        # Sentence annotations are collected from any annotation set.
        # (The original pattern began with the undefined variable $Annotation,
        # a typo for the literal "<Annotation".)
        if ( $line =~ /<Annotation Id=\"(.*?)\" Type=\"Sentence\" StartNode=\"([0-9]+)\" EndNode=\"([0-9]+)\"/ ) {
            $SentenceNodes{$2}{$3} = $1;
        }

        if ( $AnnotationSetKeyFlag == 1 ) {
            if ( $line =~ /<Annotation Id=\"(.*?)\" Type=\"(.*?)\" StartNode=\"(.*?)\" EndNode=\"(.*?)\">/ ) {
                my $id        = $1;
                my $type      = lc($2);
                my $startnode = $3;
                my $endnode   = $4;

                # The type becomes an output file name: drop whitespace and
                # parentheses, and turn "/" into "-".
                $type =~ s/\s+//g;
                $type =~ s/\(//g;
                $type =~ s/\)//g;
                $type =~ s/\//-/g;

                $TypeHash{$id}      = $type;
                $StartNodeHash{$id} = $startnode;
                $EndNodeHash{$id}   = $endnode;
            }
        }

        # The closing tags are tested last so that the line carrying them is
        # still processed by the code above.
        if ( $line =~ /<\/AnnotationSet>/ ) {
            $AnnotationSetKeyFlag = 0;
        }
        if ( $line =~ /<\/TextWithNodes>/ ) {
            $TextWithNodesFlag = 0;
        }
    }
    close $gate_fh;

    $text =~ s/\s+/ /g;

    foreach my $id ( sort keys %TypeHash ) {
        my $type      = $TypeHash{$id};
        my $startnode = $StartNodeHash{$id};
        my $endnode   = $EndNodeHash{$id};

        # Register the type even if no sentence can be extracted for it, so
        # that an output file is still created for every type seen.
        $instances{$type} ||= [];

        # Find a sentence whose node range encloses the annotation. Node ids
        # are numbers, hence the numeric sort. If several sentences qualify,
        # the one visited last (largest start node, then largest end node)
        # wins.
        my ( $sentencestart, $sentenceend );
        foreach my $sstart ( sort { $a <=> $b } keys %SentenceNodes ) {
            next if $sstart > $startnode;
            foreach my $send ( sort { $a <=> $b } keys %{ $SentenceNodes{$sstart} } ) {
                if ( $send >= $endnode ) {
                    $sentencestart = $sstart;
                    $sentenceend   = $send;
                }
            }
        }

        if ( !defined $sentencestart ) {
            if ( defined $opt_debug ) {
                print STDERR "  no enclosing sentence for annotation $id ($type)\n";
            }
            next;
        }

        # Capture the sentence including its two delimiting <Node/> tags (the
        # annotation may start or end exactly on them). The match result is
        # checked explicitly: reading $& after a failed match would return the
        # text of some unrelated earlier match.
        my ($sentence) =
            $text =~ /(<Node id=\"$sentencestart\"\/>.*?<Node id=\"$sentenceend\"\/>)/;
        if ( !defined $sentence ) {
            if ( defined $opt_debug ) {
                print STDERR "  sentence nodes $sentencestart-$sentenceend not found in text for annotation $id ($type)\n";
            }
            next;
        }

        # Mark the annotated span. \Q...\E keeps the node ids literal; they
        # were captured with ".*?" and are not guaranteed to be plain digits.
        if ( $format =~ /plain/ ) {
            $sentence =~ s/<Node id=\"\Q$startnode\E\"\/>/<head item=\"synthesis\" instance=\"$id\" sense=\"$type\">/g;
            $sentence =~ s/<Node id=\"\Q$endnode\E\"\/>/<\/head>/g;
        }
        else {
            $sentence =~ s/<Node id=\"\Q$startnode\E\"\/>/<START:$type> /g;
            $sentence =~ s/<Node id=\"\Q$endnode\E\"\/>/ <END>/g;
        }

        # Remove the remaining markup.
        $sentence =~ s/<Node id=\"[0-9]+\"\/>/ /g;
        $sentence =~ s/<(\/)?TextWithNodes>/ /g;

        # NOTE(review): the original pattern here was a backslash followed by
        # an invisible character, presumably a UTF-8 non-breaking space; it is
        # spelled out as bytes so that it stays visible - confirm.
        $sentence =~ s/\xC2\xA0/ /g;
        $sentence =~ s/\±/+-/g;
        $sentence =~ s/--+/ /g;
        $sentence =~ s/\®//g;
        $sentence =~ s/&#x[a-zA-Z0-9\;]+/ /g;

        # Blank out whatever non-ASCII is left. The original pattern lacked
        # the opening "[" ("^[:ascii:]]"), so it did not do this.
        $sentence =~ s/[^[:ascii:]]/ /g;

        # Re-attach tokens that the node removal left separated by blanks.
        $sentence =~ s/\( /(/g;
        $sentence =~ s/\. ([0-9]+) /.$1/g;
        $sentence =~ s/\. ([A-Z]\)?) /.$1/g;
        $sentence =~ s/ \/ /\//g;
        $sentence =~ s/ ([\.\:\;\?\'\%\$\#\@\,\)])/$1/g;

        # Normalize whitespace.
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^\s*//g;
        $sentence =~ s/\s*$//g;

        push @{ $instances{$type} }, $sentence;
    }

}

# Write one file per annotation type into the output directory. Each file is
# named after the type and receives one non-blank instance per line.
foreach my $type ( sort keys %instances ) {
    my $path = "$output/$type";

    # Three-argument open keeps characters in the path from being read as part
    # of the open mode; $! tells the user why the open failed.
    open( my $out_fh, '>', $path ) or die "Could not open file ($path): $!\n";

    foreach my $instance ( @{ $instances{$type} } ) {
        if ( $instance =~ /^\s*$/ ) { next; }
        print {$out_fh} "$instance\n";
    }

    # Buffered write errors (e.g. a full disk) only surface at close.
    close $out_fh or die "Could not close file ($path): $!\n";
}

##############################################################################                 

#  function to output help messages for this program                                         

##############################################################################                

# Prints the usage summary and the list of options to STDOUT.
# Takes no arguments. The former empty prototype was dropped: the script calls
# this sub with a leading "&", which bypasses prototypes anyway.
sub showHelp {

    # The first sentence used to read "gate formated NER data to opennlp
    # plain text format"; the script produces opennlp OR plain text output.
    print <<'END_HELP';
This is a utility that converts gate formatted NER data to opennlp or
plain text format.

Usage: gate2format.pl OUTPUT_DIR INPUT_DIR [OPTIONS]

OPTIONS:

--format [plain|opennlp] Specified format (Default: opennlp)

--debug                  Prints debug information to STDERR

--version                Prints the version number

--help                   Prints this help message.

END_HELP
}

##############################################################################            

#  function to output the version number                                                   

##############################################################################              

sub showVersion {

    # Single quotes keep the RCS keyword string from being interpolated.
    my $rcs_id    = '$Id: gate2format.pl,v 1.0 2015/10/02 11:17 btmcinnes Exp $';
    my $copyright = "Copyright (c) 2017- Bridget McInnes";

    # Version line first, then the copyright notice, each on its own line.
    print $rcs_id . "\n" . $copyright . "\n";
}

##############################################################################              

#  function to output "ask for help" message when user's goofed                              

##############################################################################               

sub askHelp {

    # One-line pointer to --help, sent to STDERR.
    my $hint = "Type gate2format.pl --help for help.\n";
    print STDERR $hint;
}


