#! /usr/bin/perl -w
#
#######################################################################
# Copyright (C) 2004-2008 by Carnegie Mellon University.
#
# @OPENSOURCE_HEADER_START@
#
# Use of the SILK system and related source code is subject to the terms
# of the following licenses:
#
# GNU Public License (GPL) Rights pursuant to Version 2, June 1991
# Government Purpose License Rights (GPLR) pursuant to DFARS 252.225-7013
#
# NO WARRANTY
#
# ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER
# PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY
# PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN
# "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY
# KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT
# LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE
# OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT,
# SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY
# TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF
# WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES.
# LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF
# CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON
# CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE
# DELIVERABLES UNDER THIS LICENSE.
#
# Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie
# Mellon University, its trustees, officers, employees, and agents from
# all claims or demands made against them (and any related losses,
# expenses, or attorney's fees) arising out of, or relating to Licensee's
# and/or its sub licensees' negligent use or willful misuse of or
# negligent conduct or willful misconduct regarding the Software,
# facilities, or other rights or assistance granted by Carnegie Mellon
# University under this License, including, but not limited to, any
# claims of product liability, personal injury, death, damage to
# property, or violation of any laws or regulations.
#
# Carnegie Mellon University Software Engineering Institute authored
# documents are sponsored by the U.S. Department of Defense under
# Contract F19628-00-C-0003. Carnegie Mellon University retains
# copyrights in all material produced under this contract. The U.S.
# Government retains a non-exclusive, royalty-free license to publish or
# reproduce these documents, or allow others to do so, for U.S.
# Government purposes only pursuant to the copyright license under the
# contract clause at 252.227.7013.
#
# @OPENSOURCE_HEADER_END@
#######################################################################
#
#  rwccmapbuild
#
#    Generate a SiLK-style prefix map (pmap) file from either an
#    encoded GeoIP data file or a CSV GeoIP data file.
#
#    John McClary Prevost
#    December 2004 / June 2005
#
#    Mark Thomas
#    April 2007
#
#######################################################################
#  RCSIDENT("$SiLK: rwgeoip2ccmap 13074 2008-11-20 20:28:45Z mthomas $")
#######################################################################

$^W = 1;  # enable warnings
use strict;
use Getopt::Long;
use Pod::Usage;

use vars qw(@nodes $csv_input $unknown_leaf);
@nodes = ();


parse_options();
if ($csv_input) {
    handle_csv_input();
} else {
    handle_encoded_input();
}


output_tree();

exit 0;


#######################################################################


sub output_tree
{
    # Output is binary
    binmode STDOUT;

    print(pack('C8', 0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x25, 0x01, 0x00));
    print(pack('V', scalar @nodes));
    for my $i ( 0 .. $#nodes ) {
        print(pack('VV', @{$nodes[$i]}));
    }
    close STDOUT
        or die "Cannot close STDOUT: $!";
}


sub handle_encoded_input
{
    # input is binary
    binmode STDIN;

    my $COUNTRY_BEGIN = 0x00FFFF00;
    my @countries = qw(
                       -- ap eu ad ae af ag ai al am an ao aq ar as at
                       au aw az ba bb bd be bf bg bh bi bj bm bn bo br
                       bs bt bv bw by bz ca cc cd cf cg ch ci ck cl cm
                       cn co cr cu cv cx cy cz de dj dk dm do dz ec ee
                       eg eh er es et fi fj fk fm fo fr fx ga gb gd ge
                       gf gh gi gl gm gn gp gq gr gs gt gu gw gy hk hm
                       hn hr ht hu id ie il in io iq ir is it jm jo jp
                       ke kg kh ki km kn kp kr kw ky kz la lb lc li lk
                       lr ls lt lu lv ly ma mc md mg mh mk ml mm mn mo
                       mp mq mr ms mt mu mv mw mx my mz na nc ne nf ng
                       ni nl no np nr nu nz om pa pe pf pg ph pk pl pm
                       pn pr ps pt pw py qa re ro ru rw sa sb sc sd se
                       sg sh si sj sk sl sm sn so sr st sv sy sz tc td
                       tf tg th tj tk tm tn to tp tr tt tv tw tz ua ug
                       um us uy uz va vc ve vg vi vn vu wf ws ye yt yu
                       za zm zr zw a1 a2 o1 );

    my $in;
    while ( read(STDIN, $in, 6) == 6 ) {
        my ($l0, $l1, $l2, $r0, $r1, $r2) = unpack 'C6', $in;
        if ( (defined $l0) && (defined $l1) && (defined $l2) ||
             (defined $r0) && (defined $r1) && (defined $r2) )
        {
            my $left = ($l2 << 16) + ($l1 << 8) + $l0;
            my $right = ($r2 << 16) + ($r1 << 8) + $r0;
            if ( $left >= $COUNTRY_BEGIN ) {
                $left -= $COUNTRY_BEGIN;
                $left = leaf($countries[$left]);
            }
            if ( $right >= $COUNTRY_BEGIN ) {
                $right -= $COUNTRY_BEGIN;
                $right = leaf($countries[$right]);
            }
            push @nodes, [$left, $right];
        }
    }
}


sub handle_csv_input
{
    $unknown_leaf = leaf('--');

    $nodes[0] = [$unknown_leaf, $unknown_leaf];

# Note that this can be done with linear allocation in an array:
# Allocated nodes are never destroyed, only modified and walked.
# It is possible for some nodes to be orphaned during the creation
# process, but only if the input data is poorly formed (overlapping
# regions.)

    while (<STDIN>) {
        chomp;
        s/\"//g;                # Remove '"' characters surrounding values
        my ($low_ip, $high_ip, $low_num, $high_num, $c_code, $c_name)
            = split /,/, $_, 6;
        my $low_bits = unpack('B*', (pack 'N', $low_num));
        my $high_bits = unpack('B*', (pack 'N', $high_num));
        add_to_tree($low_bits, $high_bits, lc $c_code, 0, 0, '');
    }
}


sub add_to_tree
{
    # In order for us to be here:
    #   This node's range must overlap with the range of [$low_bits,$high_bits]
    #   This node's range must *not* be contained within [$low_bits,$high_bits]
    my ($low_bits, $high_bits, $c_code, $node, $bit, $bits) = @_;

    # Does the left subtree overlap [$low_bits,$high_bits]?
    if ( substr($low_bits, $bit, 1) eq '0' ) {

        # Is the left subtree completely contained in [$low_bits,$high_bits]?
        #   (Is the low end the low end of the left subtree?  And, is the
        #   high end either the high end of the left subtree or somewhere in
        #   the right subtree?)
        if ( (substr($low_bits, $bit+1, 31-$bit) eq ('0' x (31-$bit))) &&
            ((substr($high_bits, $bit, 1) eq '1') ||
             (substr($high_bits, $bit+1, 31-$bit) eq ('1' x (31-$bit)))) )
        {
            # Left subtree is completely contained, set left subtree to country

            # Check first to warn of overlap:
            if ( $nodes[$node][0] ne $unknown_leaf ) {
                warn "Overlapping range: [$low_bits, $high_bits] x $bits\n";
            }

            # We're done--the left side is mapped entirely to this value
            $nodes[$node][0] = leaf($c_code);

        } else {
            # Left subtree overlaps but is not completely contained

            # Is the left subtree a leaf?
            if ( isleaf($nodes[$node][0]) ) {
                # It is: create a new node.  Initialize that node with both
                # branches pointing at the old leaf value.
                # Note: This should be $unknown_leaf with well-formed data.
                if ( $nodes[$node][0] ne $unknown_leaf ) {
                    warn("Overlapping range: ",
                         "[$low_bits, $high_bits] x ${bits}0\n");
                }
                push @nodes, [$nodes[$node][0], $nodes[$node][0]];
                $nodes[$node][0] = $#nodes;
            }

            # Either way, there's a node there now, recurse down it.
            # If $high_bits is outside the left tree, chop it to be at
            # the right edge
            if ( substr($high_bits, $bit, 1) eq '1' ) {
                my $chopped = $low_bits;
                substr($chopped, $bit+1, 31-$bit, '1' x (31-$bit));
                add_to_tree($low_bits, $chopped, $c_code,
                            $nodes[$node][0], $bit+1, $bits."0");
            } else {
                add_to_tree($low_bits, $high_bits, $c_code,
                            $nodes[$node][0], $bit+1, $bits."0");
            }
        }
    }

    # Mirror image on right side, really

    # Does the right subtree overlap [$low_bits,$high_bits]
    if ( substr($high_bits, $bit, 1) eq '1' ) {

        # Is the right subtree completely contained in [$low_bits,$high_bits]?
        #   (Is the high end the high end of the right subtree?  And, is
        #   the low end either the low end of the right subtree or
        #   somewhere in the left subtree?)
        if ( (substr($high_bits, $bit+1, 31-$bit) eq ('1' x (31-$bit))) &&
            ((substr($low_bits, $bit, 1) eq '0') ||
             (substr($low_bits, $bit+1, 31-$bit) eq ('0' x (31-$bit)))) )
        {
            # Right subtree is completely contained, set right subtree
            # to country.

            # Check first to warn of overlap
            if ( $nodes[$node][1] ne $unknown_leaf ) {
                warn("Overlapping range: ",
                     "[$low_bits, $high_bits] x $bits\n");
            }

            # We're done--the right side is mapped entirely to this value
            $nodes[$node][1] = leaf($c_code);

        } else {
            # Right subtree overlaps but is not completely contained

            # Is the right subtree a leaf?
            if ( isleaf($nodes[$node][1]) ) {
                # It is: create a new node.  Initialize that node with both
                # branches pointing at the old leaf value.
                # Note: this should be $unknown_leaf with well-formed data.
                if ( $nodes[$node][1] ne $unknown_leaf ) {
                    warn("Overlapping range: ",
                         "[$low_bits, $high_bits] x ${bits}1\n");
                }
                push @nodes, [$nodes[$node][1], $nodes[$node][1]];
                $nodes[$node][1] = $#nodes;
            }

            # Either way, there's a node there now, recurse down it.
            # If $low_bits is outside the right tree, chop it to be at
            # the left edge
            if ( substr($low_bits, $bit, 1) eq '0' ) {
                my $chopped = $high_bits;
                substr($chopped, $bit+1, 31-$bit, '0' x (31-$bit));
                add_to_tree($chopped, $high_bits, $c_code,
                            $nodes[$node][1], $bit+1, $bits."1");
            } else {
                add_to_tree($low_bits, $high_bits, $c_code,
                            $nodes[$node][1], $bit+1, $bits."1");
            }
        }
    }
}


sub leaf
{
    my $cc = shift;
    return 0x80000000 + ord(substr($cc,0,1)) * 256 + ord(substr($cc,1,1));
}


sub isleaf
{
    my $val = shift;
    return $val & 0x80000000;
}


sub parse_options
{
    # get basename of script
    my $name = $0;
    $name =~ s{.*/}{};

    # local vars
    my ($help, $man, $version);

    # process options.  see "man Getopt::Long"
    GetOptions('help|?' => \$help,
               'man' => \$man,
               'version' => \$version,

               'csv-input' => sub { $csv_input = 1 },
               'encoded-input' => sub { $csv_input = 0; },
               )
        or pod2usage(2);

    # help?
    if ($help) {
        pod2usage(-exitval => 0);
    }
    if ($man) {
        pod2usage(-exitval => 0, -verbose => 2);
    }

    if ($version) {
        dump_version();
        exit 0;
    }

    unless (defined $csv_input) {
        die "$name: Must specify the type of input: --csv or --encoded\n";
    }

    # check for arguments and verify redirection
    if (@ARGV) {
        die "$name takes no arguments.\n";
    }
    if (-t STDOUT) {
        die "$name: stdout must not be connected to a terminal\n";
    }
    if (-t STDIN) {
        if ($csv_input) {
            warn "$name: reading CSV data from the terminal\n";
        } else {
            die "$name: stdin must not be connected to a terminal\n";
        }
    }
}
# parse_options


sub dump_version
{
    # get basename of script
    my $name = $0;
    $name =~ s{.*/}{};

    my $vers = ((split " ", q$SiLK: rwgeoip2ccmap 13074 2008-11-20 20:28:45Z mthomas $)[-4]) || "UNKNOWN";

    print <<EOF;
$name: Part of SiLK.  Build r$vers.
Copyright (C) 2001-2007 by Carnegie Mellon University
GNU Public License (GPL) Rights pursuant to Version 2, June 1991
Government Purpose License Rights (GPLR) pursuant to DFARS 252.225-7013
Send bug reports, feature requests, and comments to silk-help\@cert.org
EOF
}


1;
__END__

=head1 NAME

B<rwgeoip2ccmap> - Create a country code prefixmap from a GeoIP data file

=head1 SYNOPSIS

  unzip -p GeoIPCountryCSV.zip | \
      rwgeoip2ccmap --csv-input > country_codes.pmap

  gzip -d -c GeoIP.dat.gz | \
      rwgeoip2ccmap --encoded-input > country_codes.pmap



=head1 DESCRIPTION

Prefixmaps (pmaps) provide a way to map field values to string labels
based on a user-defined map file.  The country code prefixmap,
typically named F<country_codes.pmap>, is a special prefixmap that
maps an IP address to a two-letter country code.  It uses the country
codes defined by the Internet Assigned Numbers Authority
(L<http://www.iana.org/root-whois/index.html>).

The country code prefixmap is used by the B<ccfilter(3)> plug-in to
partition by, count by, sort by, and display the country code in SiLK
Flow files.  The B<rwip2cc(1)> command can use the map file to display
the country code for textual IP addresses.

The country code prefixmap is based on the GeoIP Country(R) or free
GeoLite database created by MaxMind(R) and available from
L<http://www.maxmind.com/>.  The GeoLite database is a free evaluation
copy that is C<98% accurate> which is updated monthly.  MaxMind sells
the GeoIP Country database which has over C<99% accuracy> and is
updated weekly.

The database comes in two forms:

=over 4

=item F<GeoIPCountryCSV.zip>

as a compressed (B<zip>) textual file containing the IP range, country
name, and county code in a comma separated value (CSV) form

=item F<GeoIP.dat.gz>

as a compressed (B<gzip>) binary file containing an encoded form of the
IP address range and country code

=back

=head1 OPTIONS

Option names may be abbreviated if the abbreviation is unique or is an
exact match for an option.  A parameter to an option may be specified
as B<--arg>=I<param> or B<--arg> I<param>, though the first form is
required for options that take optional parameters.

One of the following switches is required:

=over 4

=item B<--csv-input>

Treat the standard input as a textual stream containing the CSV (comma
separated value) GeoIP country code data.

=item B<--encoded-input>

Treat the standard input as a binary stream the encoded GeoIP country
code data.

=back

=head1 EXAMPLES

Obtain your copy of the MaxMind GeoIP Country database, either the
comma separated value version or the binary version
(F<GeoIP.dat.gz>).  To create the F<country_codes.pmap> data file,
run

=over 4

=item *

For the CSV version:

  $ unzip -p GeoIPCountryCSV.zip | \
        rwgeoip2ccmap --csv-input > country_codes.pmap

=item *

For the binary data format:

  $ gzip -d -c GeoIP.dat.gz | \
        rwgeoip2ccmap --encoded-input > country_codes.pmap

=back

Once you have created the F<country_codes.pmap> file, you will need to
copy it to F<$SILK_PATH/share/silk/country_codes.pmap> so that the
B<ccfilter> plug-in will use it.

=head1 SEE ALSO

B<ccfilter(3)>, B<rwip2cc(1)>

=cut

Local Variables:
indent-tabs-mode:nil
End:
