#!/usr/bin/env perl

=head1 NAME

suricata_extract_submit - Submits files extracted by Suricata to mojo_cape_submit for detonation via CAPEv2

=head1 SYNOPSIS

suricata_extract_submit

*/5 * * * * /usr/local/bin/suricata_extract_submit 2> /dev/null > /dev/null

=head1 DESCRIPTION

What is printed to standard out is also sent to syslog.

This requires having the file-store output configured akin to below.

      - file-store:
          version: 2
          enabled: yes
          dir: /var/log/suricata/files
          write-fileinfo: yes
          stream-depth: 0
          force-hash: [sha1, md5]
          xff:
            enabled: no
            mode: extra-data
            deployment: reverse
            header: X-Forwarded-For

The force-hash values are optional as this script will automatically compute
those for inclusion as well SHA256.

This will use `/var/run/suricata_extract_submit.pid` as a PID file
to ensure only a single instance is running.

May be checked to see if it has hung like below. Below will alert if a PID
file with a M time of older than 5 minutes.

    /usr/local/libexec/nagios/check_file_age -i -f /var/run/suricata_extract_submit.pid

=head1 CONFIGURATION

The config file used is '/usr/local/etc/suricata_extract_submit.ini'.

    # the API key to use if needed
    #apikey=
    # URL to find mojo_cape_submit at
    url=http://192.168.14.15:8080/
    # the group/client/whathaveya slug
    slug=foo
    # where Suricata has the file store at
    filestore=/var/log/suricata/files
    # a file of IPs or subnets to ignore SRC or DEST IPs of
    #ignore=
    # a file of regex to use for checking host names go ignore
    #ignoreHosts=
    # if it should use HTTPS_PROXY and HTTP_PROXY from ENV or not
    env_proxy=0
    # stats file holding only the stats for the last run
    stats_file=/var/cache/suricata_extract_submit_stats.json
    # stats dir
    stats_dir=/var/cache/suricata_extract_submit_stats/

=cut

use File::Find::Rule;
use warnings;
use strict;
use Config::Tiny;
use File::Slurp qw(append_file write_file read_file);
use Time::Piece;
use File::MimeInfo::Magic;
use Crypt::Digest::MD5 qw( md5 md5_hex md5_b64 md5_b64u
	md5_file md5_file_hex md5_file_b64 md5_file_b64u );
use Crypt::Digest::SHA256 qw( sha256 sha256_hex sha256_b64 sha256_b64u
	sha256_file sha256_file_hex sha256_file_b64 sha256_file_b64u );
use Crypt::Digest::SHA1 qw( sha1 sha1_hex sha1_b64 sha1_b64u
	sha1_file sha1_file_hex sha1_file_b64 sha1_file_b64u );
use Net::Subnet qw(subnet_matcher);
use LWP::UserAgent;
use HTTP::Request::Common;
use File::Temp qw/ tempdir  /;
use File::Copy;
use POSIX 'strftime';
use Sys::Syslog;
use JSON;
use Sys::Hostname;
use Proc::PID::File;

my $t          = localtime;
my $today_name = $t->strftime('%F');

# sends stuff to syslog
sub log_something {
	my ( $level, $message ) = @_;

	if ( !defined($level) ) {
		$level = 'info';
	}

	openlog( 'suricata_extract_submit', 'cons,pid', 'daemon' );
	syslog( $level, '%s', $message );
	closelog();

	print $message. "\n";
} ## end sub log_something

my $alive;
eval { $alive = Proc::PID::File->running(); };
if ($@) {
	log_something( 'err', 'PID file check failed... ' . $@ );
	exit 1;
}
if ($alive) {
	log_something( 'err', 'Already running as ' . $alive );
	exit 1;
}

log_something( 'info', 'Suricata Malware Extractor starting up..' );

# sensor config file
my $config_file = '/usr/local/etc/suricata_extract_submit.ini';
my $config      = Config::Tiny->read($config_file);
log_something( 'info', 'Config ' . $config_file . ' read...' );

# information for submitting it...
my $apikey = $config->{_}->{apikey};
if ( !defined($apikey) ) {
	$apikey = '';
}
my $url = $config->{_}->{url};
if ( !defined($url) ) {
	my $message = 'No malware detonator specified via the setting url';
	log_something( 'err', $message );
}

# make sure we have a slug
my $slug = $config->{_}->{slug};
if ( !defined($url) ) {
	my $message = 'No value for slug set';
	log_something( 'err', $message );
}

# get the file store dir
my $filestore = $config->{_}->{filestore};
if ( !defined($filestore) ) {
	$filestore = '/var/log/suricata/files/';
}

# get the env proxy
my $use_env_proxy = $config->{_}->{env_proxy};
if ( !defined($use_env_proxy) ) {
	$use_env_proxy = 0;
}

# get the stats file
my $stats_file = $config->{_}->{stats_file};
if ( !defined($stats_file) ) {
	$stats_file = '/var/cache/suricata_extract_submit_stats.json';
}

# get the stats dir
my $stats_dir = $config->{_}->{stats_dir};
if ( !defined($stats_dir) ) {
	$stats_dir = '/var/cache/suricata_extract_submit_stats/';
}

my $stats_default = {
	sub                   => 0,
	sub_delta             => 0,
	sub_fail              => 0,
	sub_fail_delta        => 0,
	zero_sized            => 0,
	zero_sized_delta      => 0,
	sub_2xx               => 0,
	sub_2xx_delta         => 0,
	sub_3xx               => 0,
	sub_3xx_delta         => 0,
	sub_4xx               => 0,
	sub_4xx_delta         => 0,
	sub_5xx               => 0,
	sub_5xx_delta         => 0,
	last_errors           => [],
	errors                => 0,
	errors_delta          => 0,
	ignored_ip            => 0,
	ignored_ip_delta      => 0,
	ignored_ip_src        => 0,
	ignored_ip_src_delta  => 0,
	ignored_ip_dest       => 0,
	ignored_ip_dest_delta => 0,
	ignored_host          => 0,
	ignored_host_delta    => 0,
	truncated             => 0,
	truncated_delta       => 0,
	sub_size              => 0,
	sub_size_delta        => 0,
};
my $stats;

# compute the deltas for these items
my @to_delta = (
	'sub',     'sub_fail', 'zero_sized',   'sub_2xx',    'sub_3xx',        'sub_4xx',
	'sub_5xx', 'errors',   'ignored_host', 'ignored_ip', 'ignored_ip_src', 'ignored_ip_dest',
	'truncated', 'sub_size',
);

# handle the stats file if it exists and is readable
if ( -f $stats_file && -r $stats_file ) {
	eval { $stats = decode_json( read_file($stats_file) ); };
	if ($@) {
		log_something( 'err',
			'stats file, "' . $stats_file . '", reading or parsing failed, using default stats... ' . $@ );
		$stats = $stats_default;
	} else {
		# make sure each item we will need exists and if not, zero it
		foreach my $item (@to_delta) {
			if ( !defined( $stats->{$item} ) ) {
				$stats->{$item} = 0;
			}
		}
	}

}    # if it exists, but is not readable, error and use the default
elsif ( -f $stats_file && !-r $stats_file ) {
	my $error = 'stats file, "' . $stats_file . '", exists, but is not readable, using default stats';
	log_something( 'err', $error );
	$stats = $stats_default;
	$stats->{errors}++;
	push( @{ $stats->{last_errors} }, $error );
} else {
	$stats = $stats_default;
}
my %copy_stats = %{$stats};
my $new_stats  = \%copy_stats;

# reset last_errors instead of having it constantly grow
$new_stats->{last_errors} = [];

# read in the file of IPs to ignore
my $ignore_file = $config->{_}->{ignore};
my @to_ignore;
if ( defined($ignore_file) && -f $ignore_file ) {
	log_something( 'info', 'reading in ignore file... ' . $ignore_file );
	my $ignore_raw = read_file($ignore_file);

	my @to_ignore_raw = split( /\n/, $ignore_raw );
	@to_ignore_raw = grep( !/^\w*\#/, @to_ignore_raw );
	@to_ignore_raw = grep( !/^\w*$/,  @to_ignore_raw );

	# process the @to_ignore_raw adding subnets if needed
	my $ignored = 0;
	foreach my $item (@to_ignore_raw) {

		# remove any white space at the start or end
		$item =~ s/^[\ \t]+//;
		$item =~ s/[\ \t]+$//;
		if ( $item =~ /^[0-9\.]+$/ ) {
			$item = $item . '/32';
		}
		if ( $item =~ /^[0-9\:]+$/ ) {
			$item = $item . '/128';
		}
		push( @to_ignore, $item );

		$ignored++;
	} ## end foreach my $item (@to_ignore_raw)

	log_something( 'info', 'ignoring ' . $ignored . ' subnets or IPs' );
} ## end if ( defined($ignore_file) && -f $ignore_file)
my $ip_ignore = subnet_matcher @to_ignore;

# read in the file of IPs to ignore
my $ignoreHosts_file = $config->{_}->{ignoreHosts};
my @hosts_to_ignore;
if ( defined($ignoreHosts_file) && -f $ignoreHosts_file ) {
	log_something( 'info', 'reading in ignoreHosts file... ' . $ignoreHosts_file );
	my $ignoreHosts_raw = read_file($ignoreHosts_file);

	@hosts_to_ignore = split( /\n/, $ignoreHosts_raw );
	@hosts_to_ignore = grep( !/^\w*\#/, @hosts_to_ignore );
	@hosts_to_ignore = grep( !/^\w*$/,  @hosts_to_ignore );

	my $found_regex = $#hosts_to_ignore;
	log_something( 'info', $found_regex . ' regexs loaded to ignore for hosts' );
} ## end if ( defined($ignoreHosts_file) && -f $ignoreHosts_file)

# create a tmpdir and chdir to it
my $tempdir = tempdir();
chdir($tempdir);

# find any json files
log_something( 'info', 'Looking for JSON files under ' . $filestore );
my @files = File::Find::Rule->file()->name("*json")->in($filestore);

# exit if we don't find anything as there is no point to continue
if ( !defined( $files[0] ) ) {
	log_something( 'info', 'None found at this time' );

	foreach my $item (@to_delta) {
		my $old = $stats->{$item};
		my $new = $new_stats->{$item};
		my $delta;
		if ( !defined($old) ) {
			$delta = $new;
		} elsif ( $new > $old ) {

			$delta = $new - $old;

			# roll it over if it is over 2G
			if ( $new > 2000000000 ) {
				$new_stats->{$item} = $delta;
			}
		} elsif ( $old > $new ) {
			$delta = $new;
		} else {
			$delta = 0;
		}
		$new_stats->{ $item . '_delta' } = $delta;
	} ## end foreach my $item (@to_delta)

	# write out the stats file
	my $raw_stats = JSON->new->utf8->canonical(1)->encode($new_stats);
	log_something( 'info', 'Run Stats: ' . $raw_stats );
	eval { write_file( $stats_file, $raw_stats ); };
	if ($@) {
		log_something( 'err', 'Writing stats file, "' . $stats_file . '", failed... ' . $@ );
	}

	exit 0;
} ## end if ( !defined( $files[0] ) )

# process any found
foreach my $file (@files) {
	print "\n";

	log_something( 'info', "Processing... " . $file );
	my $name;
	eval {
		my $file_raw = read_file($file);
		log_something( 'info', 'Raw JSON... ' . $file_raw );

		my $file_json = decode_json($file_raw);
		my $data_file = $file;
		$data_file =~ s/\.[0-9\.]+\.json$//;
		log_something( 'info', 'Sample File: ' . $data_file );

		my $data      = read_file($data_file);
		my $data_size = length($data);
		log_something( 'info', 'Sample Size: ' . $data_size );
		$new_stats->{sub_size} += $data_size;

		# get the timestramp and transform it into for getting the epoch
		my $timestamp = $file_json->{timestamp};
		$timestamp =~ s/\..*$//;
		my $t = Time::Piece->strptime( $timestamp, '%Y-%m-%dT%H:%M:%S' );

		my $filename = $file_json->{fileinfo}{filename};

		my $md5    = lc( md5_hex($data) );
		my $sha1   = lc( sha1_hex($data) );
		my $sha256 = lc( sha256_hex($data) );
		my $extUID = lc( substr( $md5, 0, 18 ) );

		my $mime = mimetype($filename);
		if ( !defined($mime) ) {
			$mime = 'application_x-ms-dos-executable';
		} else {
			$mime =~ s/\//\_/g;
		}
		log_something( 'info', 'MIME type: ' . $mime );

		# check if it was truncated or not for stats purposes
		if (   defined( $file_json->{fileinfo} )
			&& defined( $file_json->{fileinfo}{state} )
			&& $file_json->{fileinfo}{state} =~ /^[Tt][Rr][Uu][Cc][Aa][Tt][Ee][Dd]$/ )
		{
			$new_stats->{truncated}++;
		}

		my $add_it = 1;
		if ( $ip_ignore->( $file_json->{src_ip} ) ) {
			my $file_hostname = '';
			if ( defined( $file_json->{http} ) && defined( $file_json->{http}{hostname} ) ) {
				$file_hostname = $file_json->{http}{hostname};
			}
			log_something( 'info',
					  'Ignore Listed Source IP: '
					. $file_json->{src_ip}
					. ' ... dest_ip='
					. $file_json->{dest_ip}
					. ', hostname='
					. $file_hostname );
			$new_stats->{ignored_ip}++;
			$new_stats->{ignored_ip_src}++;
			$add_it = 0;
		} ## end if ( $ip_ignore->( $file_json->{src_ip} ) )
		if ( $ip_ignore->( $file_json->{dest_ip} ) ) {
			my $file_hostname = '';
			if ( defined( $file_json->{http} ) && defined( $file_json->{http}{hostname} ) ) {
				$file_hostname = $file_json->{http}{hostname};
			}
			log_something( 'info',
					  'Ignore Listed Destination IP: '
					. $file_json->{dest_ip}
					. ' ... src_ip='
					. $file_json->{src_ip}
					. ', hostname='
					. $file_hostname );
			$new_stats->{ignored_ip}++;
			$new_stats->{ignored_ip_dest}++;
			$add_it = 0;
		} ## end if ( $ip_ignore->( $file_json->{dest_ip} ))
		foreach my $item (@hosts_to_ignore) {
			if ( defined( $file_json->{http} ) && defined( $file_json->{http}{hostname} ) ) {
				if ( $file_json->{http}{hostname} =~ /$item/ ) {
					log_something( 'info',
							  'Ignore Listed Host, "'
							. $item . '": '
							. $file_json->{http}{hostname}
							. ' ... src_ip='
							. $file_json->{dest_ip}
							. ', dest_ip='
							. $file_json->{dest_ip} );
					$add_it = 0;
					$new_stats->{ignored_host}++;
				} ## end if ( $file_json->{http}{hostname} =~ /$item/)
			} ## end if ( defined( $file_json->{http} ) && defined...)
		} ## end foreach my $item (@hosts_to_ignore)

		$name
			= $file_json->{src_ip} . '-'
			. $file_json->{src_port} . '-'
			. $file_json->{dest_ip} . '-'
			. $file_json->{dest_port} . '-'
			. $file_json->{proto} . '-'
			. $extUID . '-'
			. $slug . '-'
			. $t->epoch . '-'
			. $mime;

		# only add it if it is not white listed
		if ($add_it) {
			log_something( 'info', 'Name: ' . $name );

			# skip uploading it if it is zero sized
			if ( $data_size > 0 ) {

				# don't need to copy it if it is zero sized
				copy( $data_file, $name )
					or die 'Copy failed(' . $data_file . ' -> ' . $tempdir . '/' . $name . '): ' . $!;

				$file_json->{suricata_extract_submit} = {
					filename => $name,
					apikey   => $apikey,
					host     => hostname,
					to       => $config->{_}->{url},
					time     => time,
					md5      => $md5,
					sha256   => $sha256,
					sha1     => $sha1,
					slug     => $config->{_}->{slug},
				};
				my $res;
				eval {
					my $ua = LWP::UserAgent->new(
						ssl_opts => { verify_hostname => 0, SSL_verify_mode => 0 },
						timeout  => 30
					);
					# use HTTPS_PROXY/HTTP_PROXY from env if set to true
					if ($use_env_proxy) {
						$ua->env_proxy;
					}
					$res = $ua->request(
						POST $config->{_}->{url},
						Content_type => 'multipart/form-data',
						Content      => [
							apikey   => $apikey,
							filename => [$name],
							type     => 'suricata_extract',
							json     => encode_json($file_json),
						],
					);
				};

				# save error status for later
				my $sub_error = $@;

				# see if we can get a status line
				my $status_line;
				if ( defined($res) ) {
					eval { $status_line = $res->status_line; };

					# compute that status line stats
					if ( defined($status_line) ) {
						if ( $status_line =~ /^2\d\d/ ) {
							$new_stats->{sub_2xx}++;
						} elsif ( $status_line =~ /^3\d\d/ ) {
							$new_stats->{sub_3xx}++;
						} elsif ( $status_line =~ /^4\d\d/ ) {
							$new_stats->{sub_4xx}++;
						} elsif ( $status_line =~ /^5\d\d/ ) {
							$new_stats->{sub_5xx}++;
						}
					} ## end if ( defined($status_line) )
				} ## end if ( defined($res) )

				# handle submission errors
				if ($sub_error) {
					$new_stats->{sub_fail}++;

					# if this is defined, submission worked, but we got a sub error
					if ( defined($status_line) ) {
						my $error = "Failed to post... " . $res->status_line;
						push( @{ $new_stats->{last_errors} }, $error );
						die($error);
					}    # if we don't have a status line the submission eval never got that far
					else {
						$new_stats->{errors}++;
						my $error = "Failed to post... " . $sub_error;
						push( @{ $new_stats->{last_errors} }, $error );
						die($error);
					}
				} else {
					if ( $status_line =~ /^2\d\d/ ) {
						$new_stats->{sub}++;
						log_something( 'info', 'Uplodated Response Status: ' . $res->status_line );
					} else {
						$new_stats->{sub_fail}++;
						my $error = "Failed to post... " . $res->status_line;
						push( @{ $new_stats->{last_errors} }, $error );
						die($error);
					}
				} ## end else [ if ($sub_error) ]
			} else {
				$new_stats->{zero_sized}++;
				log_something( 'info', 'Not uploading as the sample is zero sized' );
			}
		} ## end if ($add_it)

		# rename the JSON file so we don't process it again
		move( $file, $file . '-processed' ) or die 'Appending "-processed" to the name of the JSON file...' . $!;
		if ($@) {
			my $error = 'Failed to rename ' . $file . ' ... ' . $@;
			$new_stats->{errors}++;
			push( @{ $new_stats->{last_errors} }, $error );
			log_something( 'err', $error );
		}
	};
	if ($@) {
		log_something( 'err', 'Processing failed... ' . $@ );
	}

	# only unlink the file if it exists... otherwise the copy failed
	if ( -f $name ) {

		# now that we are done with the tmp file, we can remove it
		unlink($name);
	}
} ## end foreach my $file (@files)

# update the stats
foreach my $item (@to_delta) {
	my $old = $stats->{$item};
	my $new = $new_stats->{$item};
	my $delta;
	# value was missing for some reason in the previous
	if ( !defined($old) ) {
		$delta = $new;
	}    # need to compute the delta as there was change
	elsif ( $new > $old ) {
		$delta = $new - $old;

		# roll it over if it is over 2G
		if ( $new > 2000000000 ) {
			$new_stats->{$item} = $delta;
		}
	}    # ran subsequently and rolled over? this should not happen in general usage... regard new as correct
	elsif ( $old > $new ) {
		$delta = $new;
	}    # none of the above, so zero
	else {
		$delta = 0;
	}
	$new_stats->{ $item . '_delta' } = $delta;
} ## end foreach my $item (@to_delta)

# set the timestamp for the when it was generated
$new_stats->{timestamp} = $t->epoch;

# write out the stats file
my $raw_stats = JSON->new->utf8->canonical(1)->encode($new_stats) . "\n";
log_something( 'info', 'Run Stats: ' . $raw_stats );
eval { write_file( $stats_file, $raw_stats ); };
if ($@) {
	log_something( 'err', 'Writing stats file, "' . $stats_file . '", failed... ' . $@ );
}

# append it to the daily stat cache
if ( !-d $stats_dir ) {
	eval { mkdir($stats_dir) };
	if ($@) {
		log_something( 'err', 'Creating stats cache dir, "' . $stats_dir . '", failed... ' . $@ );
		exit 1;
	}
}
eval {
	my $daily_cache = $stats_dir . '/' . $today_name . '.json';
	append_file( $daily_cache, $raw_stats );
};
