#!/usr/bin/env perl
use v5.14;

use Getopt::Long;
use HTTP::Tiny;
use Scalar::Util;

my $VERSION = '0.3.0';

binmode( STDIN,  ":encoding(UTF-8)" );
binmode( STDOUT, ":encoding(UTF-8)" );
binmode( STDERR, ":encoding(UTF-8)" );

my %opt;

sub cDefault {
    $opt{color} ? "\e[0;39m" . $_[0] . "\e[0m" : $_[0]    # default
}

sub cBold {
    $opt{color} ? "\e[1;39m" . $_[0] . "\e[0m" : $_[0]    # bold
}

sub cValue {
    $opt{color} ? "\e[0;32m" . $_[0] . "\e[0m" : $_[0]    # green
}

sub cName {
    $opt{color} ? "\e[0;34m" . $_[0] . "\e[0m" : $_[0]    # blue
}

sub cIdentifier {
    $opt{color} ? "\e[0;33m" . $_[0] . "\e[0m" : $_[0]    # yellow
}

sub cWarning {
    $opt{color} ? "\e[1;31m" . $_[0] . "\e[0m" : $_[0]    # bold red
}

sub achtung {
    say STDERR cWarning(@_);
}

my %namespaces = (

    # standard ontologies
    rdf    => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    xsd    => 'http://www.w3.org/2001/XMLSchema#',
    rdfs   => 'http://www.w3.org/2000/01/rdf-schema#',
    owl    => 'http://www.w3.org/2002/07/owl#',
    skos   => 'http://www.w3.org/2004/02/skos/core#',
    schema => 'http://schema.org/',
    geo    => 'http://www.opengis.net/ont/geosparql#',
    prov   => 'http://www.w3.org/ns/prov#',

    # Wikibase ontology
    wikibase => 'http://wikiba.se/ontology#',
    wd       => 'http://www.wikidata.org/entity/',
    wdt      => 'http://www.wikidata.org/prop/direct/',
    wds      => 'http://www.wikidata.org/entity/statement/',
    p        => 'http://www.wikidata.org/prop/',
    wdref    => 'http://www.wikidata.org/reference/',
    wdv      => 'http://www.wikidata.org/value/',
    ps       => 'http://www.wikidata.org/prop/statement/',
    psv      => 'http://www.wikidata.org/prop/statement/value/',
    pq       => 'http://www.wikidata.org/prop/qualifier/',
    pqv      => 'http://www.wikidata.org/prop/qualifier/value/',
    pr       => 'http://www.wikidata.org/prop/reference/',
    prv      => 'http://www.wikidata.org/prop/reference/value/',
    wdno     => 'http://www.wikidata.org/prop/novalue/',

    # blazegraph SPARQL extensions
    hint => 'http://www.bigdata.com/queryHints#',
    bd   => 'http://www.bigdata.com/rdf#',
    bds  => 'http://www.bigdata.com/rdf/search#',
    fts  => 'http://www.bigdata.com/rdf/fts#',

    # not used in Wikidata Query Service
    wdata => 'http://www.wikidata.org/wiki/Special:EntityData/',
    cc    => 'http://creativecommons.org/ns#',
);

# get command line options
Getopt::Long::Configure('bundling');
GetOptions(
    \%opt,
    'help|h|?', 'version|V', 'man', 'ontology', 'prefixes',
    'api=s',
    'format|f=s',
    'query|q=s',
    'ids|i!',
    'language|g=s',
    'color|C!',
    'no-execute|n!',
    'limit=i', '1!', '2!', '3!', '4!', '5!', '6!', '7!', '8!', '9!',
    'default-prefixes!',
    'response=s',    # not documented
    'export=s',
    'force!',
) or exit 1;

# use color by default if output is terminal
$opt{color} //= -t STDOUT ? 1 : 0;    ## no critic

if ( $opt{version} ) {
    say "wdq $VERSION";
    exit;
}
elsif ( $opt{help} ) {
    require Pod::Usage;
    my $help;
    open my $out, '>', \$help;
    Pod::Usage::pod2usage(
        -msg => cBold("wdq ")
          . cValue("[query] ")
          . cName("[OPTIONS]")
          . " < query\n"
          . cBold("wdq ")
          . cValue("lookup  ")
          . cName("[OPTIONS]")
          . " < ids\n",
        -sections => [qw(USAGE OPTIONS)],
        -exitval  => 'NOEXIT',
        -output   => $out,
        indent    => 2,
    );
    $help =~ s/\n\n  --/\n  --/gm;
    $help =~ s/^      /    /mg;
    if ( $opt{color} ) {
        $help =~ s/^([a-z]+:)/cBold($1)/mgei;          # headers
        $help =~ s/^(  --.*)$/cName($1)/mge;           # options
        $help =~ s/("[^"\n]+")/cValue($1)/mge;         # strings
        $help =~ s/(<[^>\n]+>)/cIdentifier($1)/mge;    # URLs
    }
    print $help;
    exit;
}
elsif ( $opt{man} ) {
    my $module = $opt{color} ? 'Pod::Text::Color' : 'Pod::Text';
    eval "require $module";     ## no critic
    eval "require App::wdq";    # may fail if pure script installed by hand
    $module->new->parse_from_file( $INC{'App/wdq.pm'} // $0 );
    exit;
}
elsif ( $opt{namespaces} ) {
    for my $prefix ( sort keys %namespaces ) {
        printf "%8s: %s\n", $prefix, $namespaces{$prefix};
    }
    exit;
}
elsif ( $opt{ontology} ) {
    my $help = <<'ONTOLOGY';
Entity (item/property)
 wd:Q_ <-- owl:sameAs --> wd:Q_ 
       --> rdfs:label, skos:altLabel, schema:description "_"@_
       --> schema:dateModified, schema:version
       --> wdt:P_ "_", URI, _:blank
       --> p:P_ Statement

Item
 wd:Q_ <-- schema:about <http://_.wikipedia.org/wiki/_>
                        --> schema:inLanguage, wikibase:badge 

Property
 wd:P_ --> wikibase:propertyType PropertyType
       --> wkibase:directClaim        wdt:P_
       --> wikibase:claim             p:P_
       --> wikibase:statementProperty ps:P_
       --> wikibase:statementValue    psv:P_
       --> wikibase:qualifier         pq:P_
       --> wikibase:qualifierValue    pqv:P_
       --> wikibase:reference         pr:P_
       --> wikibase:referenceValue    prv:P_
       --> wikibase:novalue           wdno:P_

PropertyType
 wikibase: String, Url, WikibaseItem, WikibaseProperty, CommonsMedia,
           Monolingualtext, GlobeCoordinate, Quantity, Time


Statement
 wds:_ --> wikibase:rank Rank
       --> a wdno:P_
       --> ps:P_ "_", URI, _:blank
       --> psv:P_ Value
       --> pq:P_ "_", URI, _:blank
       --> pqv:P_ Value
       --> prov:wasDerivedFrom Reference

Reference
 wdref:_ --> pr:P_ "_", URI
         --> prv:P_ Value

Rank
 wikibase: NormalRank, PreferredRank, DeprecatedRank, BestRank

Value (GlobecoordinateValue/QuantityValue/TimeValue)
 wdv:_ --> wikibase: geoLatitude, geoLongitude, geoPrecision, geoGlobe URI
       --> wikibase: timeValue, timePrecision, timeTimezone, timeCalendarModel
       --> wikibase: quantityAmount, quantityUpperBound, quantityLowerBound,
                     quantityUnit URI
ONTOLOGY
    $help =~ s/^([a-z]+)/cBold($1)/mgei;
    $help =~ s/ ([A-Z][A-Za-z]+)/" ".cBold($1)/mge;
    $help =~ s/(<--|-->|--)/cDefault($1)/mge;
    $help =~ s/ ([a-z]+:([a-zA-Z_]+)?|[a-z][a-zA-Z]+)/" ".cName($1)/mge;
    $help =~ s/(<[^ >]+>|@[a-z_]+)/cIdentifier($1)/mge;
    $help =~ s/("[^"\n]+")/cValue($1)/mge;
    say $help;
    exit;
}

# require only if actually needed
require RDF::Query;

# default SPARQL endpoint
$opt{api} //= 'https://query.wikidata.org/bigdata/namespace/wdq/sparql';

# default output format
$opt{format} = lc( $opt{format} // 'simple' );

# read query from STDIN by default
$opt{query} //= '-';

# add default prefixes by default
$opt{'default-prefixes'} //= 1;

# limit as single digit
foreach ( grep { $opt{$_} } 1 .. 9 ) {
    $opt{limit} = $_ if !$opt{limit} or $opt{limit} > $_;
}

my $action = @ARGV && $ARGV[0] =~ /^(query|lookup)$/ ? shift @ARGV : 'query';

# TODO: validate language tag
my $language = $opt{language} // do { my $l = $ENV{LANG}; $l =~ s/_.*//; $l };

my $exporter;
if ( $opt{export} || $opt{format} eq 'export' ) {
    $exporter = eval { 
        require Catmandu;
        Catmandu->exporter( $opt{export} // 'JSON' ); 
    };
    if ($@) {
        say STDERR "Option export requires Perl module " .
            "Catmandu::Exporter::$opt{export}!";
        exit 1;
    }
    elsif ( $opt{format} !~ /^(ldjson|simple)$/ ) {
        say STDERR "Option export overrides option format!";
        Catmandu->load();
    }
    $opt{format} = 'export';
}

sub simple_node {
    if ( !Scalar::Util::blessed( $_[0] ) ) {
        '';
    }
    elsif ( $_[0]->isa('RDF::Trine::Node::Resource') ) {
        $_[0]->uri_value;
    }
    elsif ( $_[0]->isa('RDF::Trine::Node::Literal') ) {
        $_[0]->literal_value;
    }
    else {
        $_[0]->sse;
    }
}

package App::wdq::Format {

    sub new {
        my $class = shift;
        bless { @_ }, $class;
        #bless { @_[ 1 .. -1 ] }, $_[0];
    }

    sub emit {
        my ( $format, $iterator, $vars, $out ) = @_;
        my $data = { out => $out, vars => $vars };
        $format->{pre}->($data) if $format->{pre};
        $iterator->each(
            sub {
                $format->{row}->( $data, $_[0] );
            }
        ) if $format->{row};
        $format->{post}->($data) if $format->{post};
    }
}

my %FORMATS = (

    # SPARQL JSON / XML
    json => 'json',
    xml  => 'xml',

    # SPARQL TSV
    tsv => App::wdq::Format->new(
        pre => sub {
            my $o = shift;
            say { $o->{out} } join( "\t", map { "?$_" } @{ $o->{vars} } );
        },
        row => sub {
            my ( $o, $row ) = @_;
            say { $o->{out} } join "\t",
              map { Scalar::Util::blessed($_) ? $_->as_ntriples : '' }
              map { $row->{$_} } @{ $o->{vars} };
        }
    ),

    # SPARQL CSV
    csv => App::wdq::Format->new(
        pre => sub {
            my $o = shift;
            say { $o->{out} } join cDefault(','),
              map { cName($_) } @{ $o->{vars} };
        },
        row => sub {
            my ( $o, $row ) = @_;
            say { $o->{out} } join cDefault(','), map {
                my $s = simple_node( $row->{$_} );
                if ( $s =~ /[",\x0A\x0D]/ ) {
                    $s =~ s/"/""/g;
                    $s = "\"$s\"";
                }
                cValue($s)
            } @{ $o->{vars} };
        }
    ),

    # simple JSON key-value structure
    simple => App::wdq::Format->new(
        pre => sub {
            my $o = shift;
            $o->{data} = [];
        },
        row => sub {
            my ( $o, $row ) = @_;
            push @{ $o->{data} },
              { map { $_ => simple_node( $row->{$_} ) } @{ $o->{vars} } };
        },
        post => sub {
            require JSON;
            my $o = shift;
            print { $o->{out} }
              JSON->new->pretty->canonical->encode( $o->{data} );
        }
    ),

    # simple line-delimited JSON key-value structure
    ldjson => App::wdq::Format->new(
        row => sub {
            my ( $o, $row ) = @_;
            require JSON;
            say { $o->{out} }
              JSON->new->canonical->encode(
                { map { $_ => simple_node( $row->{$_} ) } @{ $o->{vars} } } );
        }
    ),

    # pipe to Catmandu exporter
    export => App::wdq::Format->new(
        pre => sub {
            my $o = shift;
            $o->{exporter} = $exporter;
        },
        row => sub {
            my ( $o, $row ) = @_;
            $o->{exporter}->add(
                { map { $_ => simple_node( $row->{$_} ) } @{ $o->{vars} } } );
        },
        post => sub {
            $_[0]->{exporter}->commit;
        }
    ),
);

my $format = $FORMATS{ $opt{format} } // do {
    say STDERR "Unknown format: $opt{format}";
    exit 1;
};

if ($action eq 'lookup' and $format =~ /^(json|xml)$/) {
    achtung("Format $format not supported with method lookup yet!");
    exit 1;
}

sub expand_query {
    my $query = shift;

    if ( $query =~ /^\s*{/m ) {
        $query = "SELECT * WHERE $query";
    }
    elsif ( $query !~ /^[^{]*SELECT/ ) {
        $query = "SELECT * WHERE { $query }";
    }

    if ( $opt{'default-prefixes'} ) {

        # Add PREFIX for actually used and known prefixes
        my %ns;
        my $ps = join '|', keys %namespaces;
        $ns{$_} = $namespaces{$_} for $query =~ /($ps):[^\/]/mg;
        my @prefixes = map { "PREFIX $_: <$ns{$_}>" } sort keys %ns;
        $query = join "\n", @prefixes, $query;
    }

    $query;
}

sub execute_query {
    my ( $query, $format ) = @_;

    my $http =
      HTTP::Tiny->new( default_headers => { agent => "wdq/$VERSION" } );

    # TODO: HTTP POST
    my $url =
        $opt{api} . '?'
      . $http->www_form_urlencode( { format => $format, query => $query } );
    $http->get($url);
}

sub parse_response {
    my ( $json, @map ) = @_;

    require RDF::Trine::Iterator::JSONHandler;
    my $iter = RDF::Trine::Iterator::JSONHandler->new->parse($json);

    unshift @map, sub {
        my $r = $_;
        foreach my $name ( keys %$r ) {
            my $v = $r->{$name};
            next unless $v->isa('RDF::Trine::Node::Resource');
            next
              if $v->uri_value !~
qr{^http://www.wikidata.org/(entity|prop)(/[^/]+)?/([QP]\d+|[a-z0-9]+)$};
            $r->{$name} = RDF::Trine::Node::Literal->new($3);
        }
        $r;
      }
      if $opt{ids};

    $iter = RDF::Trine::Iterator::smap( $_, $iter ) for @map;

    $iter;
}

my $rEntity = qr{^
    (https?://www\.wikidata\.org/
     (wiki|entity|wiki/Special:EntityData)/)?
    (?<id>(q|(Property:)?p)\d+)
$}ix;

my $rSitelink = qr{^https?://(
    [^.]+.(
        wikipedia | wiktionary | wikibooks | wikiquote | wikisource |
        wikinews | wikiversity | wikivoyage
    ) | (commons|species)\.wikimedia
)\.org}x;

# TODO: support --limit
if ( $action eq 'lookup' ) {
    require URI;
    my $result  = [];
    my $select  = 'SELECT ?id ?idLabel ?idDescription WHERE';
    my $service = <<SPARQL;
    SERVICE wikibase:label {
        bd:serviceParam wikibase:language "$language" .
    }
SPARQL
    my $query;
    my $varmap = sub {
        $_->{label}       = delete $_->{idLabel};
        $_->{description} = delete $_->{idDescription};
        $_;
    };
    my $output = {
        out => \*STDOUT,
        vars => [qw(id label description)]
    };
    while (<>) {
        chomp;
        s/^\s+|\s+$//g;
        next if /^$/;    # skip empty lines
        if ( $_ =~ $rEntity ) {
            my $uri = "http://www.wikidata.org/entity/" . uc( $+{id} );
            $query = "$select { BIND(<$uri> AS ?id) $service }";
        }
        elsif ( $_ =~ $rSitelink ) {
            my $url = URI->new($_);
            $url->scheme('https');
            $url   = $url->canonical->as_string;
            $query = "$select { <$url> schema:about ?id. $service }";
        }
        else {
            achtung("unknown identifier: $_");
            next;
        }
        $query = RDF::Query->new( expand_query($query) )->as_sparql;
        if ( $opt{'no-execute'} ) {
            say $query;
        }
        else {
            my $res = execute_query( $query, 'json' );
            if ( $res->{success} ) {
                my $iter = parse_response( $res->{content}, $varmap );
                unless ($output->{head}) {
                    $output->{head} = 1;
                    $format->{pre}->($output) if $format->{pre};
                }
                $format->{row}->( $output, $iter->next );
            }
        }
    }
    if ( !$opt{'no-execute'} and $format->{post} ) {
        $format->{post}->($output);
    }
    exit;
}

my ( $query, $sparql, $variables );
{
    local $/ = undef;
    if ( $opt{query} eq '-' ) {
        $query = <STDIN>;
    }
    elsif ( -f $opt{query} or $opt{query} !~ /[$\?\{]/ ) {
        open my $fh, '<', $opt{query}
          or die "failed to open file " . $opt{query};
        $query = <$fh>;
        open my $fh;
    }
    else {
        $query = $opt{query};
    }

    $query  = expand_query($query);
    $sparql = do {
        my $q = RDF::Query->new($query);
        unless ($q) {
            if ( $opt{'force'} ) {
                warn "SPARQL query seems invalid!\n";
                undef;
            }
            else {
                say STDERR "Invalid SPARQL query!";
                exit 1;
            }
        }
    };
}

if ($sparql) {
    $variables = [ map { $_->name } @{ $sparql->parsed->{variables} } ];
    $query = $sparql->as_sparql;
}

# LIMIT and OFFSET are always last so we can safely use regexp here
if ( $opt{limit} ) {
    $query =~ s/\n*LIMIT\s+\d+(\s+OFFSET\s+\d+)?\s*$/$1/sm;
    $query .= "\nLIMIT $opt{limit}";
}

if ( $opt{'no-execute'} ) {
    chomp $query;
    say $query;
    exit;
}

my $res =
  $opt{response}
  ? {
    content => do { local ( @ARGV, $/ ) = $opt{response}; <> },
    success => 1,
  }
  : execute_query( $query, $format eq 'xml' ? $format : 'json' );

if ( !$res->{success} ) {
    say STDERR $res->{content};
    exit 2;
}
my %FORMATS = (
    json   => ['json'],
    xml    => ['xml'],
    tsv    => [ json => 'tsv' ],
    csv    => [ json => 'csv' ],
    simple => [ json => 'simple' ],
    ldjson => [ json => 'ldjson' ],
    export => [ json => 'export' ],
);


if ( $format =~ /^(json|xml)$/ ) {
    if ( $opt{ids} ) {
        achtung("Option ids not supported for format $format yet!");
    } else {
        say $res->{content};
        exit;
    }
}

# convert SPARQL JSON response
my $iterator = parse_response( $res->{content} );

# convert result
my $vars = $variables // sort keys %{ $iterator->peek };
$format->emit( $iterator, $vars, \*STDOUT );

__END__

=head1 NAME

wdq - command line access to Wikidata Query Service

=head1 USAGE

Access L<Wikidata Query Service|https://query.wikidata.org/> via command line.
In C<query> mode (default), a SPARQL query is read from STDIN or option
C<--query>.  Default namespaces are and C<SELECT> clause are added if missing.
In C<lookup> mode, Wikidata entity ids, URLs, or Wikimedia project URLs are
read from STDIN to look up label and description.

=head1 EXAMPLES

  # get all parts of the solar system
  wdq -q '?c wdt:P361 wd:Q544' 
  wdq -q '{ ?c wdt:P361 wd:Q544 }'                # equivalent
  wdq -q 'SELECT * WHERE { ?c wdt:P361 wd:Q544 }' # equivalent

  # look up label and description 
  echo Q1 | wdq lookup

=head1 OPTIONS

=over

=item --query|-q QUERY

Query or query file (C<-> for STDIN as default)

=item --format|-f FORMAT

Output format. Supported formats include C<json>, C<xml>, C<tsv>, and C<csv>
SPARQL result format, C<simple> for flat JSON without language tags (default),
and C<ldjson> for line delimited flat json. For more flexible output options 
pipe to another tool such as L<jq|http://stedolan.github.io/jq/>,
L<miller|http://johnkerl.org/miller/>, and 
L<catmandu|https://github.com/LibreCat/Catmandu>. If L<Catmandu> is installed,
its exporters can directly be used with option C<export>.

=item --limit INTEGER

Add or override a LIMIT clause to limitate the number of results. Single-digit
options such as C<-1> can also be used to also set a limit.

=item --export EXPORTER

Use a L<Catmandu> exporter as output format, for instance C<XLS> (Excel) and
Markdown tables (C<Table>). The following produce same output:

  wdq --export Foo 
  wdq --format ldjson | catmandu convert to Foo

Use Catmandu config file (C<catmandu.yml>) to further configure export.

=item --ids|-i

Return Wikidata identifiers as strings instead of URIs (except for output
format C<xml> and C<json>).

=item --language|-g

Language to query labels and descriptions in. Set to the locale by default.
This option is currentl only used on lookup mode. 

=item --color|-C

By default output is colored if writing to a terminal. Disable this with
C<--no-color> or force color with C<--color> or C<-C>.

=item --api URL

SPARQL endpoint. Default value:
C<https://query.wikidata.org/bigdata/namespace/wdq/sparql>

=item --no-execute|-n

Don't execute query but show it in expanded form. Useful to validate and
pretty-print queries.

=item --help|-h|-?

Show usage help

=item --ontology

Show information about the RDF ontology used in Wikibase

=item --prefixes

Show the default prefixes

=item --no-default-prefixes

Don't add default namespace prefixes to the SPARQL query

=item --man

Show detailled manual

=item --version|-V

Show version if this script

=back

=head1 COPYRIGHT AND LICENSE

Copyright by Jakob Voss C<voss@gbv.de>

Based on a PHP script by Marius Hoch C<hoo@online.de>
at L<https://github.com/mariushoch/asparagus>.

Licensed under GPL 2.0+

=cut
