/*
** Copyright (C) 2006-2012 by Carnegie Mellon University.
**
** @OPENSOURCE_HEADER_START@
**
** Use of the SILK system and related source code is subject to the terms
** of the following licenses:
**
** GNU Public License (GPL) Rights pursuant to Version 2, June 1991
** Government Purpose License Rights (GPLR) pursuant to DFARS 252.227.7013
**
** NO WARRANTY
**
** ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER
** PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY
** PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN
** "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY
** KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT
** LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE,
** MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE
** OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT,
** SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY
** TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF
** WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES.
** LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF
** CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON
** CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE
** DELIVERABLES UNDER THIS LICENSE.
**
** Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie
** Mellon University, its trustees, officers, employees, and agents from
** all claims or demands made against them (and any related losses,
** expenses, or attorney's fees) arising out of, or relating to Licensee's
** and/or its sub licensees' negligent use or willful misuse of or
** negligent conduct or willful misconduct regarding the Software,
** facilities, or other rights or assistance granted by Carnegie Mellon
** University under this License, including, but not limited to, any
** claims of product liability, personal injury, death, damage to
** property, or violation of any laws or regulations.
**
** Carnegie Mellon University Software Engineering Institute authored
** documents are sponsored by the U.S. Department of Defense under
** Contract FA8721-05-C-0003. Carnegie Mellon University retains
** copyrights in all material produced under this contract. The U.S.
** Government retains a non-exclusive, royalty-free license to publish or
** reproduce these documents, or allow others to do so, for U.S.
** Government purposes only pursuant to the copyright license under the
** contract clause at 252.227.7013.
**
** @OPENSOURCE_HEADER_END@
*/

/*
 * rwsplit takes a sequence of input files and generates a set
 * of sample files from them.  Each file is a single sample.
 *
 * The sampling criteria currently have the following parameters:
 *
 * --basename: The name of the stub file to write to
 * --ip-limit: how many addresses to contain in a sample
 * --flow-limit: how many flows to contain in a sample
 * --packet-limit: how many packets to contain in a sample
 * --byte-limit: how many bytes
 * --sample-ratio: specifies that 1/n flows should be taken for the
 *                 sample file.
 * --file-ratio: specifies that 1/n possible sample files will be used.
 * sample is going to progress through the data linearly, so if you're
 * going to use time, make sure you sort on time.
 */

#include <silk/silk.h>

RCSIDENT("$SiLK: rwsplit.c 372a8bc31d8a 2012-02-10 21:55:28Z mthomas $");

#include <silk/skstream.h>
#include <silk/utils.h>
#include <silk/skipset.h>
#include <silk/sksite.h>


/* LOCAL DEFINES AND TYPEDEFS */

/* stream that appUsageLong() writes the --help output to */
#define USAGE_FH stdout

/* maximum number of output files, and the default for --max-outputs.
 * The numeric suffix of each output file name is zero-padded to eight
 * digits, so eight 9's is the largest suffix that fits that width. */
#define MAX_OUTPUT_FILES 99999999

/* The quantity that is tallied to decide when a sample file is full.
 * The first four values must stay in the same order as the --*-limit
 * entries at the start of appOptionsEnum, since one is converted to
 * the other by subtracting an offset. */
typedef enum aggmode {
    AGGMODE_IPS,
    AGGMODE_FLOWS,
    AGGMODE_PKTS,
    AGGMODE_BYTES,
    /* no criterion chosen; this must remain the final enumerator */
    AGGMODE_NONE
} aggmode_t;


/* LOCAL VARIABLES */

/* index into argv of the next input file name; set from the return
 * value of skOptionsParse() and advanced by appNextInput() */
static int arg_index;

/* basename of output files; points at the argument to --basename */
static char *out_basename = NULL;

/* current output file; tested against NULL to see whether an output
 * file is open */
static skstream_t *rwios_out = NULL;

/* IPset in which to store unique IPs; only created when splitting by
 * --ip-limit */
static skipset_t *ips = NULL;

/* the index of the next output file; used as the numeric suffix of
 * the file name.  It is also incremented for files that --file-ratio
 * causes to be skipped. */
static uint32_t output_ctr = 0;

/* number of output files that may still be created; starts at the
 * --max-outputs value and is decremented for each file opened */
static uint32_t max_outputs = MAX_OUTPUT_FILES;

/* max ip/flow/packet/byte per file */
static uint64_t tag_limit = 0;

/* current count of ip/flow/packet/byte for the sample in progress */
static uint64_t tag_current = 0;

/* how many records we need to read before we write one */
static uint32_t sample_ratio = 1;

/* how many records we've read on the way to reading 'sample_ratio'
 * records */
static uint32_t current_sample_count = 0;

/* instead of writing each file, write one of every 'file_ratio'
 * files */
static uint32_t file_ratio = 1;

/* the thing we are aggregating; AGGMODE_NONE until one of the
 * --*-limit switches is seen */
static aggmode_t aggmode = AGGMODE_NONE;

/* whether the user specified the seed; when 0, main() seeds the
 * random number generator itself */
static int seed_specified = 0;

/* the compression method to use when writing the file.
 * sksiteCompmethodOptionsRegister() will set this to the default or
 * to the value the user specifies. */
static sk_compmethod_t comp_method;

/* handle to argc and argv used to write invocation into header of
 * output files. */
static int pargc;
static char **pargv;


/* OPTIONS SETUP */

typedef enum {
    /* the splitting criteria; keep these in the same order as the
     * values of aggmode_t */
    OPT_IP_LIMIT,
    OPT_FLOW_LIMIT,
    OPT_PACKET_LIMIT,
    OPT_BYTE_LIMIT,
    /* the remaining switches */
    OPT_BASENAME,
    OPT_SEED,
    OPT_SAMPLE_RATIO,
    OPT_FILE_RATIO,
    OPT_MAX_OUTPUTS
} appOptionsEnum;

/* subtract this from one of the --*-limit appOptionsEnum values to
 * get the corresponding aggmode_t */
static int opt2agg_offset = OPT_IP_LIMIT;

/* The switches this application registers.  The last field of each
 * entry is the appOptionsEnum value handed to appOptionsHandler() as
 * 'opt_index'.  Entries are listed in enum order, so that value is
 * also used as an index into appOptions[] and appHelp[]. */
static struct option appOptions[] = {
    {"ip-limit",     REQUIRED_ARG, 0, OPT_IP_LIMIT},
    {"flow-limit",   REQUIRED_ARG, 0, OPT_FLOW_LIMIT},
    {"packet-limit", REQUIRED_ARG, 0, OPT_PACKET_LIMIT},
    {"byte-limit",   REQUIRED_ARG, 0, OPT_BYTE_LIMIT},
    {"basename",     REQUIRED_ARG, 0, OPT_BASENAME},
    {"seed",         REQUIRED_ARG, 0, OPT_SEED},
    {"sample-ratio", REQUIRED_ARG, 0, OPT_SAMPLE_RATIO},
    {"file-ratio",   REQUIRED_ARG, 0, OPT_FILE_RATIO},
    {"max-outputs",  REQUIRED_ARG, 0, OPT_MAX_OUTPUTS},
    {0,0,0,0}           /* sentinel entry */
};

/* Help text for each switch.  Entry N describes appOptions[N], so the
 * two arrays must list the switches in the same order and have the
 * same number of entries, including the terminating NULL. */
static const char *appHelp[] = {
    "IP address count at which to begin a new sample file",
    "Flow count at which to begin a new sample file",
    "Packet count at which to begin a new sample file",
    "Bytes count at which to begin a new sample file",
    "Basename to use for output sample files",
    "Value to use to seed the random number generator",
    ("Ratio of records read to number written in sample\n"
     "\tfile (e.g., 100 means to write 1 out of 100 records). Def. 1"),
    ("Ratio of sample file names generated to total number\n"
     "\twritten (e.g., 10 means 1 of every 10 files will be saved). Def. 1"),
    /* the default printed here must agree with MAX_OUTPUT_FILES, which
     * is eight 9's */
    ("Maximum number of files to write to disk. Def. 99999999"),
    (char *)NULL
};



/* LOCAL FUNCTION PROTOTYPES */

/* application setup, teardown, usage, and switch handling */
static void appUsageLong(void);
static void appTeardown(void);
static void appSetup(int argc, char **argv);
static int  appOptionsHandler(clientData cData, int opt_index, char *opt_arg);
/* handle one input record: sample it, write it, update the tally */
static int  processRec(const rwRec *input_rec);
/* open the next output file / close the current output file */
static void newOutput(void);
static int  closeOutput(void);


/* FUNCTION DEFINITIONS */

/*
 *  appUsageLong();
 *
 *    Write the complete usage (--help) text to USAGE_FH.  The
 *    --*-limit switches are listed first under their own heading;
 *    every other switch follows under SWITCHES, together with the
 *    usage for the notes, compression-method, and site options.
 *
 *    This function is registered with skOptionsSetUsageCallback() in
 *    appSetup().
 */
static void appUsageLong(void)
{
#define USAGE_MSG                                                            \
    ("--basename=F --{ip|flow|packet|byte}-limit=N [SWITCHES] [FILES]\n"     \
     "\tSplit a stream of SiLK Flow records into a set of flow files that\n" \
     "\teach contain a subset of the records.\n")

    /* index one past the final --*-limit entry in appOptions[] */
    const int limit_end = opt2agg_offset + AGGMODE_NONE;
    FILE *out = USAGE_FH;
    int idx;

    fprintf(out, "%s %s", skAppName(), USAGE_MSG);

    /* the splitting criteria get a section to themselves */
    fprintf(out, "\nSPLITTING CRITERION:\n");
    idx = opt2agg_offset;
    while (appOptions[idx].name && idx < limit_end) {
        fprintf(out, "--%s %s. %s\n", appOptions[idx].name,
                SK_OPTION_HAS_ARG(appOptions[idx]), appHelp[idx]);
        ++idx;
    }

    /* every switch that is not a splitting criterion */
    fprintf(out, "\nSWITCHES:\n");
    skOptionsDefaultUsage(out);
    for (idx = 0; appOptions[idx].name; ++idx) {
        if (idx < opt2agg_offset || idx >= limit_end) {
            fprintf(out, "--%s %s. %s\n", appOptions[idx].name,
                    SK_OPTION_HAS_ARG(appOptions[idx]), appHelp[idx]);
        }
    }
    skOptionsNotesUsage(out);
    sksiteCompmethodOptionsUsage(out);
    sksiteOptionsUsage(out);

    fprintf(out, ("\nNote: The --basename and one of the --*-limit"
                  " switches are required.\n"));
}


/*
 *  appTeardown()
 *
 *    Close the current output file, destroy the IPset if one was
 *    created, tear down the notes options, and unregister the
 *    application.
 *
 *    A static flag makes every call after the first a no-op, so the
 *    function may safely be invoked both directly and via atexit().
 */
static void appTeardown(void)
{
    static int already_done = 0;

    if (!already_done) {
        already_done = 1;

        closeOutput();

        if (NULL != ips) {
            skIPSetDestroy(&ips);
        }

        skOptionsNotesTeardown();
        skAppUnregister();
    }
}


/*
 *  appSetup(argc, argv);
 *
 *    Perform all the setup for this application: register the
 *    application and its options, parse the command line, load the
 *    site configuration, verify that the required switches were
 *    given, register appTeardown() with atexit(), and initialize the
 *    sampling state.  This function should be passed the same
 *    arguments that were passed into main().
 *
 *    Returns to the caller if all setup succeeds.  If anything fails,
 *    this function will cause the application to exit with a FAILURE
 *    exit status.
 */
static void appSetup(int argc, char **argv)
{
    /* verify same number of options and help strings */
    assert((sizeof(appHelp)/sizeof(char *)) ==
           (sizeof(appOptions)/sizeof(struct option)));

    /* register the application */
    skAppRegister(argv[0]);
    skOptionsSetUsageCallback(&appUsageLong);

    /* hang onto argc and argv; newOutput() records the invocation in
     * the header of each output file */
    pargc = argc;
    pargv = argv;

    /* register the options */
    if (skOptionsRegister(appOptions, &appOptionsHandler, NULL)
        || skOptionsNotesRegister(NULL)
        || sksiteCompmethodOptionsRegister(&comp_method)
        || sksiteOptionsRegister(SK_SITE_FLAG_CONFIG_FILE))
    {
        skAppPrintErr("Unable to register options");
        exit(EXIT_FAILURE);
    }

    /* parse the options */
    arg_index = skOptionsParse(argc, argv);
    if (arg_index < 0) {
        /* options parsing should print error */
        skAppUsage();           /* never returns */
    }

    /* try to load site config file; if it fails, we will not be able
     * to resolve flowtype and sensor from input file names */
    sksiteConfigure(0);

    /* arg_index is looking at first file name to process; with no
     * file names the input is read from the standard input, which
     * must not be a terminal */
    if (arg_index == argc && FILEIsATty(stdin)) {
        skAppPrintErr("No input files on command line and"
                      " stdin is connected to a terminal");
        skAppUsage();       /* never returns */
    }

    /*
     * We now check for correctness.  This implies:
     * An aggregation mode has been chosen.
     * An output stub name has been specified.
     */
    if (aggmode == AGGMODE_NONE) {
        skAppPrintErr("No aggregation mode chosen; you must specify one");
        exit(EXIT_FAILURE);
    }
    if (out_basename == NULL) {
        skAppPrintErr("You must specify the output files' basename");
        exit(EXIT_FAILURE);
    }

    /* atexit() reports failure with any non-zero value, not only a
     * negative one */
    if (atexit(appTeardown) != 0) {
        skAppPrintErr("Unable to register appTeardown() with atexit()");
        appTeardown();
        exit(EXIT_FAILURE);
    }

    /* need to initialize the state; starting the count at
     * 'sample_ratio' makes processRec() choose a record to grab when
     * it sees the first record */
    current_sample_count = sample_ratio;

    /* create IPset if required.  processRec() uses 'ips' without
     * checking it, so do not continue when creation fails. */
    if (aggmode == AGGMODE_IPS) {
        if (skIPSetCreate(&ips, 0)) {
            skAppPrintErr("Unable to create IPset");
            exit(EXIT_FAILURE);
        }
    }

    return;  /* OK */
}


/*
 *  status = appOptionsHandler(cData, opt_index, opt_arg);
 *
 *    Callback given to skOptionsRegister(); skOptionsParse() invokes
 *    it once for each switch the user specified.  'opt_index' is the
 *    appOptionsEnum value stored in the switch's appOptions[] entry
 *    and 'opt_arg' is the text of the switch's argument; 'cData' is
 *    not used.
 *
 *    The handler stores the parsed value in the matching global
 *    variable.  It returns 0 on success.  It prints an error and
 *    returns 1 when the argument cannot be parsed, when --basename is
 *    repeated, or when more than one --*-limit switch is given;
 *    returning non-zero from the handler causes skOptionsParse() to
 *    return a negative value.
 */
static int appOptionsHandler(
    clientData  UNUSED(cData),
    int         opt_index,
    char       *opt_arg)
{
    uint32_t seed = 0;
    aggmode_t wanted;
    /* result of parsing opt_arg; any non-zero value is reported as an
     * error after the switch */
    int parse_rv = 0;

    switch ((appOptionsEnum)opt_index) {
      case OPT_IP_LIMIT:
      case OPT_FLOW_LIMIT:
      case OPT_PACKET_LIMIT:
      case OPT_BYTE_LIMIT:
        wanted = (aggmode_t)(opt_index - opt2agg_offset);
        if (AGGMODE_NONE != aggmode) {
            /* a splitting criterion has already been seen */
            if (wanted == aggmode) {
                skAppPrintErr("The --%s switch was given multiple times",
                              appOptions[opt_index].name);
            } else {
                skAppPrintErr(("Can only give one aggregation strategy\n"
                               "\tBoth %s and %s specified"),
                              appOptions[aggmode+opt2agg_offset].name,
                              appOptions[opt_index].name);
            }
            return 1;
        }
        aggmode = wanted;
        parse_rv = skStringParseUint64(&tag_limit, opt_arg, 1, 0);
        break;

      case OPT_BASENAME:
        if (NULL != out_basename) {
            skAppPrintErr("The --%s switch was given multiple times",
                          appOptions[opt_index].name);
            return 1;
        }
        out_basename = opt_arg;
        break;

      case OPT_SEED:
        parse_rv = skStringParseUint32(&seed, opt_arg, 0, 0);
        if (0 == parse_rv) {
            /* seed the generator now; main() will not seed it again */
            srandom((unsigned int)seed);
            seed_specified = 1;
        }
        break;

      case OPT_SAMPLE_RATIO:
        parse_rv = skStringParseUint32(&sample_ratio, opt_arg,
                                       1, (UINT32_MAX / sizeof(rwRec)));
        break;

      case OPT_FILE_RATIO:
        parse_rv = skStringParseUint32(&file_ratio, opt_arg, 1, 0);
        break;

      case OPT_MAX_OUTPUTS:
        parse_rv = skStringParseUint32(&max_outputs, opt_arg,
                                       1, MAX_OUTPUT_FILES);
        break;
    }

    if (parse_rv) {
        skAppPrintErr("Invalid %s '%s': %s",
                      appOptions[opt_index].name, opt_arg,
                      skStringParseStrerror(parse_rv));
        return 1;
    }

    return 0;  /* OK */
}


/*
 *  status = closeOutput();
 *
 *    Close and destroy the current output stream, if there is one.
 *    When closing fails, print the stream's error.  Return the result
 *    of skStreamClose(), or 0 when no output stream was open.
 */
static int closeOutput(void)
{
    int status;

    if (NULL == rwios_out) {
        /* nothing is open */
        return 0;
    }

    status = skStreamClose(rwios_out);
    if (0 != status) {
        skStreamPrintLastErr(rwios_out, status, &skAppPrintErr);
    }
    skStreamDestroy(&rwios_out);

    return status;
}


/*
 *  newOutput();
 *
 *    Begin the next sample.  Unless --file-ratio causes this sample
 *    to be skipped, create a new data file named
 *    "<basename>.<8-digit index>.rwf", write its header, and make it
 *    the current output stream 'rwios_out'.  The output index
 *    'output_ctr' is incremented whether or not a file is created.
 *
 *    When a sample is skipped, the function returns without opening
 *    a file, so 'rwios_out' is not set.
 *
 *    Exits the program with a SUCCESS status once the maximum number
 *    of output files has been written, and with a FAILURE status when
 *    the file name does not fit in the buffer or the file cannot be
 *    created.
 */
static void newOutput(void)
{
    /* which file of the current group of 'file_ratio' files to keep */
    static uint32_t sample_die_roll = 0;
    char datafn[PATH_MAX];
    int len;
    int rv;

    if (file_ratio != 1) {
        /* at the start of each group of 'file_ratio' files, randomly
         * pick the one member of the group to write */
        if (0 == (output_ctr % file_ratio)) {
            sample_die_roll = (uint32_t)(random() % file_ratio);
        }
        if ((output_ctr % file_ratio) != sample_die_roll) {
            ++output_ctr;
            return;
        }
    }

    /* have we written the maximum number of output files? */
    if (max_outputs == 0) {
        exit(EXIT_SUCCESS);
    }
    --max_outputs;

    /* create new file name; refuse to continue with a name that was
     * truncated to fit the buffer */
    len = snprintf(datafn, sizeof(datafn),  ("%s.%08" PRIu32 ".rwf"),
                   out_basename, output_ctr);
    if (len < 0 || (size_t)len >= sizeof(datafn)) {
        skAppPrintErr(("Name of output file %" PRIu32 " is too long"),
                      output_ctr);
        exit(EXIT_FAILURE);
    }

    /* open the file and write the headers */
    if ((rv = skStreamCreate(&rwios_out, SK_IO_WRITE, SK_CONTENT_SILK_FLOW))
        || (rv = skStreamBind(rwios_out, datafn))
        || (rv = skStreamSetCompressionMethod(rwios_out, comp_method))
        || (rv = skOptionsNotesAddToStream(rwios_out))
        || (rv = skHeaderAddInvocation(skStreamGetSilkHeader(rwios_out),
                                       1, pargc, pargv))
        || (rv = skStreamOpen(rwios_out))
        || (rv = skStreamWriteSilkHeader(rwios_out)))
    {
        skStreamPrintLastErr(rwios_out, rv, &skAppPrintErr);
        skStreamDestroy(&rwios_out);
        exit(EXIT_FAILURE);
    }

    ++output_ctr;
}


/*
 *  int processRec(rwRec *rwrec)
 *
 *    Given a single record, it updates its count and states and
 *    determines whether or not it is time to move onto the next value
 *    in the dataset.
 */
static int processRec(const rwRec *rwrec)
{
    static uint32_t grab_index = 0;
    skipaddr_t ipaddr;
    int reset_status;
    int rv;

    reset_status = 0;

    /* if we are not processing every record, decide whether to
     * process the current record. */
    if (sample_ratio != 1) {
        if (current_sample_count == sample_ratio) {
            current_sample_count = 0;
            /* figure out which record of the next sample_ratio
             * records to process */
            grab_index = 1 + random() % sample_ratio;
        }
        ++current_sample_count;
        if (grab_index != current_sample_count) {
            return 0;
        }
    }

    /* open the output file if this is the first record.  this ensures
     * we only open output files when we have data to write to
     * them. */
    if (0 == tag_current) {
        newOutput();
    }

    if (rwios_out) {
        rv = skStreamWriteRecord(rwios_out, rwrec);
        if (SKSTREAM_ERROR_IS_FATAL(rv)) {
            skStreamPrintLastErr(rwios_out, rv, &skAppPrintErr);
            skStreamDestroy(&rwios_out);
            exit(EXIT_FAILURE);
        }
    }

    /*
     * What's going on here.  This routine actually determine when an
     * element of the partition is complete and we can safely go on to
     * the next element.  To do so, we update an internal count
     * (tag_current) with whatever values we got from the update.
     * The increase is determined by the record and the aggregation
     * mode.  Once we have determined that the updated value exceeds our
     * per-partition limit (tag_limit), we close the file and move
     * onto the next one.
     */
    switch (aggmode) {
      case AGGMODE_IPS:
        rwRecMemGetSIP(rwrec, &ipaddr);
        if (!skIPSetCheckAddress(ips, &ipaddr)) {
            skIPSetInsertAddress(ips, &ipaddr, 0);
            tag_current++;
        }
        rwRecMemGetDIP(rwrec, &ipaddr);
        if (!skIPSetCheckAddress(ips, &ipaddr)) {
            skIPSetInsertAddress(ips, &ipaddr, 0);
            tag_current++;
        }
        if (tag_current >= tag_limit) {
            reset_status = 1;
            /* reset tree */
            skIPSetRemoveAll(ips);
        }
        break;

      case AGGMODE_FLOWS:
        ++tag_current;
        if (tag_current >= tag_limit) {
            reset_status = 1;
        }
        break;

      case AGGMODE_PKTS:
        tag_current += rwRecGetPkts(rwrec);
        if (tag_current >= tag_limit) {
            reset_status = 1;
        }
        break;

      case AGGMODE_BYTES:
        tag_current += rwRecGetBytes(rwrec);
        if (tag_current >= tag_limit) {
            reset_status = 1;
        }
        break;

      case AGGMODE_NONE:
        skAbortBadCase(aggmode);
    }

    if (reset_status) {
        /* close current file */
        if (closeOutput()) {
            exit(EXIT_FAILURE);
        }
        tag_current = 0;
    }

    return 0;
}


/*
 *  ok = appNextInput(argc, argv, &rwios);
 *
 *    Open the next input and store the stream in 'rwios'.  Inputs are
 *    the file names remaining on the command line; when the command
 *    line holds no file names, the first call opens "stdin" instead.
 *
 *    Return 0 if a stream was opened, 1 if every input has already
 *    been handed out, or -1 if the input could not be opened (the
 *    error is printed and the stream destroyed).
 */
static int appNextInput(int argc, char **argv, skstream_t **rwios)
{
    static int called_before = 0;
    const int args_exhausted = (arg_index == argc);
    const char *path;
    int status;

    /* only the very first call may fall back to the standard input */
    if (called_before && args_exhausted) {
        /* no more input */
        return 1;
    }
    called_before = 1;

    if (args_exhausted) {
        path = "stdin";
    } else {
        /* take the current file name and step to the next one */
        path = argv[arg_index];
        ++arg_index;
    }

    /* create the stream and open the file */
    status = skStreamOpenSilkFlow(rwios, path, SK_IO_READ);
    if (0 != status) {
        skStreamPrintLastErr(*rwios, status, &skAppPrintErr);
        skStreamDestroy(rwios);
        return -1;
    }

    /* When aggregating by IPs, we only handle IPv4 flows */
    if (AGGMODE_IPS == aggmode) {
        skStreamSetIPv6Policy(*rwios, SK_IPV6POLICY_ASV4);
    }

    return 0;
}


/*
 *  Set up the application, seed the random number generator unless
 *  --seed was given, and pass every record of every input to
 *  processRec().  Reading stops at the first input that cannot be
 *  opened.
 *
 *  Returns 0 on success, or 1 if an input could not be opened or a
 *  read ended with something other than end-of-file.  Exits with a
 *  FAILURE status if the final output file cannot be closed.
 */
int main(int argc, char **argv)
{
    skstream_t *input;
    struct timeval now;
    rwRec rec;
    int exit_status = 0;
    int rv;

    appSetup(argc, argv);                       /* never returns on error */

    if (!seed_specified) {
        /* derive a seed from the current time and the process ID */
        gettimeofday(&now, NULL);
        srandom((unsigned int) ((now.tv_sec + now.tv_usec) / getpid()));
    }

    /* for all inputs, read all records */
    for (;;) {
        rv = appNextInput(argc, argv, &input);
        if (0 != rv) {
            break;
        }

        for (;;) {
            rv = skStreamReadRecord(input, &rec);
            if (SKSTREAM_OK != rv) {
                break;
            }
            processRec(&rec);
        }
        if (SKSTREAM_ERR_EOF != rv) {
            /* reading stopped for a reason other than end-of-file */
            skStreamPrintLastErr(input, rv, &skAppPrintErr);
            exit_status = 1;
        }
        skStreamDestroy(&input);
    }

    /* here 'rv' is the result of the last appNextInput() call */
    if (-1 == rv) {
        exit_status = 1;
    }

    if (closeOutput()) {
        exit(EXIT_FAILURE);
    }

    return exit_status;
}


/*
** Local Variables:
** mode:c
** indent-tabs-mode:nil
** c-basic-offset:4
** End:
*/
