/*
 *  Copyright 2007-2025 Carnegie Mellon University
 *  See license information in LICENSE.txt.
 */
/*
 *  payloadScanner.c
 *
 *  These functions read the payload scanning rules and provide a
 *  function that is called to apply those rules to a payload.
 *
 *  ------------------------------------------------------------------------
 *  Authors: Chris Inacio
 *  ------------------------------------------------------------------------
 *  @DISTRIBUTION_STATEMENT_BEGIN@
 *  YAF 2.18
 *
 *  Copyright 2025 Carnegie Mellon University.
 *
 *  NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
 *  INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
 *  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
 *  AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF FITNESS FOR
 *  PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF
 *  THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF
 *  ANY KIND WITH RESPECT TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT
 *  INFRINGEMENT.
 *
 *  Licensed under a GNU GPL 2.0-style license, please see LICENSE.txt or
 *  contact permission@sei.cmu.edu for full terms.
 *
 *  [DISTRIBUTION STATEMENT A] This material has been approved for public
 *  release and unlimited distribution.  Please see Copyright notice for
 *  non-US Government use and distribution.
 *
 *  This Software includes and/or makes use of Third-Party Software each
 *  subject to its own license.
 *
 *  DM25-1281
 *  @DISTRIBUTION_STATEMENT_END@
 *  ------------------------------------------------------------------------
 */



#define _YAF_SOURCE_
#include <yaf/autoinc.h>

#if YAF_ENABLE_APPLABEL

#include <ctype.h>
#include <ltdl.h>
#include <search.h>
#include <stdlib.h>
#include "portHash.h"
#include "payloadScanner.h"

#ifdef YAF_ENABLE_HOOKS
#include <yaf/yafhooks.h>
#endif

#ifndef YFDEBUG_APPLABEL
#define YFDEBUG_APPLABEL 0
#endif
#ifndef YFDEBUG_APPRULES_PARSER
#define YFDEBUG_APPRULES_PARSER 0
#endif

#define YAF_SEARCH_PATH "/usr/local/lib/yaf"
#define ALT_SEARCH_PATH "/usr/lib/yaf"
#define ALT_SEARCH_PATH64 "/usr/lib64/yaf"

/*
 *  Function-pointer type for a scanner plugin's entry point; this is the
 *  type of the 'func' member of payloadScanRule_t and the type that the
 *  result of lt_dlsym() is cast to when a plugin rule is loaded.
 *
 *  The signature is produced by the YC_SCANNER_PROTOTYPE macro.  The next
 *  statement defines the following:
 *
 *  typedef uint16_t (*ycScannerPlugin_fn)(
 *      int            argc,
 *      char          *argv[],
 *      const uint8_t *payload,
 *      unsigned int   payloadSize,
 *      yfFlow_t      *flow,
 *      yfFlowVal_t   *val);
 */
typedef YC_SCANNER_PROTOTYPE( (*ycScannerPlugin_fn) );

/*
 *  One entry of ruleTable[] or sigTable[]: the label reported when the rule
 *  matches, the kind of rule, and the data needed to evaluate it.
 */
typedef struct payloadScanRule_st {
    /* label value returned by ycScanPayload() when this rule matches */
    uint16_t   payloadLabelValue;
    /* how the rule is evaluated: REGEX and SIGNATURE rules use
     * ruleArgs.regexFields, PLUGIN rules use ruleArgs.pluginArgs, and EMPTY
     * marks a ruleTable[] slot that holds no rule */
    enum { REGEX, PLUGIN, EMPTY, SIGNATURE } ruleType;
    union {
        struct {
            /* expression compiled by pcre_compile() */
            pcre        *scannerExpression;
            /* result of pcre_study() on scannerExpression */
            pcre_extra  *scannerExtra;
        } regexFields;
        struct {
            /* ala argc, argv */
            int                  numArgs;
            char               **pluginArgs;
            /* libltdl handle of the module that provides 'func' */
            lt_dlhandle          handle;
            /* entry point found in the module with lt_dlsym() */
            ycScannerPlugin_fn   func;
        } pluginArgs;
    } ruleArgs;
} payloadScanRule_t;



/**
 *
 * file globals
 *
 */
/* regex and plugin rules in the order they were accepted from the rule
 * file; filled by ycInitializeScanRules(), read by ycScanPayload() */
static payloadScanRule_t ruleTable[MAX_PAYLOAD_RULES];
/* number of ruleTable[] entries in use */
static unsigned int      numPayloadRules = 0;
/* signature rules in the order they were accepted from the rule file;
 * ycScanPayload() checks these before ruleTable[] */
static payloadScanRule_t sigTable[MAX_PAYLOAD_RULES];
/* number of sigTable[] entries in use */
static unsigned int      numSigRules = 0;


/*    Formats an error message when pcre_compile() fails.  The caller in this
 *    file releases the returned GString with g_string_free().  Full
 *    description below */
static GString *
ycDisplayScannerRuleError(
    const char    *descrip,
    const char    *errorMsg,
    const char    *regex,
    int            errorPos);

/*    Splits 'sampleString' into an array of words and reports the word count
 *    through 'argNum'.  The caller in this file releases each word and the
 *    array itself with g_free().  Full description below */
static char **
ycChunkString(
    pcre        *wordSplitter,
    const char  *sampleString,
    int         *argNum);

#if YFDEBUG_APPLABEL
/*    Prints the payload; only built when YFDEBUG_APPLABEL is enabled.  Full
 *    description below */
static void
ycPayloadPrinter(
    const uint8_t *payloadData,
    unsigned int   payloadSize,
    unsigned int   numPrint,
    GString       *gstr);
#endif /* if YFDEBUG_APPLABEL */


/**
 * ycInitializeScanRules
 *
 * Reads the rule definition file used to identify payloads.  Every line of
 * the file is expected to be a comment, a regex rule, a plugin rule, or a
 * signature rule.  Regular expressions are compiled and studied with PCRE
 * and plugin modules are loaded with libltdl; accepted rules are stored in
 * the file-level ruleTable[] / sigTable[] for later use by ycScanPayload().
 *
 * A rule that cannot be used (bad regex, missing library or function, too
 * few plugin arguments) is logged and skipped; it does not fail the call.
 *
 * @param scriptFile a file pointer to the rule definition file
 * @param err        set when FALSE is returned
 *
 * @return TRUE when the file was processed.  FALSE when the loader or the
 *         internal scanners could not be set up, the file could not be
 *         read, memory ran out, or a regex or signature rule filled its
 *         table.  NOTE(review): a plugin rule that fills ruleTable[] stops
 *         processing but returns TRUE; that difference is kept as is.
 *
 */
gboolean
ycInitializeScanRules(
    FILE    *scriptFile,
    GError **err)
{
    /*
     * for every rule that is "imagined" can be returned on a single call to
     * pcre_exec, you need to multiply that number by 6 for the correct number
     * of "vector" entries (and because of pcre limitation should be a
     * multiple of 3)
     */
#define NUM_SUBSTRING_VECTS 60
    const char  *errorString;
    int          errorPos;
    GString     *eString;
    pcre        *ruleScanner;
    pcre        *pluginScanner;
    pcre        *commentScanner;
    pcre        *signatureScanner;
    pcre        *wordScanner;
#if YFDEBUG_APPRULES_PARSER
    GString     *str = NULL;
#endif

    /* A comment must be on its own line and goes to the end of the line */
    const char   commentScannerExp[] = "^\\s*#[^\\n]*\\n";

    /* Load a plugin: 'label' NUMBER 'plugin' FUNCTION_NAME */
    const char   pluginScannerExp[] =
        "^[[:space:]]*label[[:space:]]+([[:digit:]]+)"
        "[[:space:]]+plugin[[:space:]]*([^[:space:]\\n].*)\\n";

    /* Define a regex rule: 'label' NUMBER 'regex' REGEX */
    const char   ruleScannerExp[] =
        "^[[:space:]]*label[[:space:]]+([[:digit:]]+)"
        "[[:space:]]+regex[[:space:]]*([^\\n].*)\\n";

    /* Define a signature rule: 'label' NUMBER 'signature' REGEX */
    const char   signatureScannerExp[] =
        "^[[:space:]]*label[[:space:]]+([[:digit:]]+)"
        "[[:space:]]+signature[[:space:]]*([^\\n].*)\\n";

    /* The resulting regex is passed to ycChunkString() to split a plugin's
     * argument list into tokens */
    const char   wordScannerExp[] = "[^ \t\n]+";

    int          rc;
    int          substringVects[NUM_SUBSTRING_VECTS];
    char         lineBuffer[LINE_BUF_SIZE];
    int          readLength;
    char        *captString;
    unsigned int bufferOffset = 0;
    int          currentStartPos = 0;
    int          loop;
    char        *ltdl_lib_path = NULL;

    /* first mark all plugin entries as empty, just in case */
    for (loop = 0; loop < MAX_PAYLOAD_RULES; loop++) {
        ruleTable[loop].ruleType = EMPTY;
    }

    /* initialize the hash table */
    ycPortHashInitialize();

    /* initialize the dynamic loader library */
    rc = lt_dlinit();
    if (0 != rc) {
        g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_IMPL,
                    "Error initializing the dynamic loader library: \"%s\"",
                    lt_dlerror());
        return FALSE;
    }

    /* if LTDL_LIBRARY_PATH is set - add this one first */
    ltdl_lib_path = getenv("LTDL_LIBRARY_PATH");
    if (ltdl_lib_path) {
        lt_dladdsearchdir(ltdl_lib_path);
    }

#ifdef YAF_APPLABEL_PATH
    /* add the applabel path based on libdir at build time */
    lt_dladdsearchdir(YAF_APPLABEL_PATH);
#else
    /* add /usr/local/lib/yaf to path since libtool can never find it */

    lt_dladdsearchdir(YAF_SEARCH_PATH);
    lt_dladdsearchdir(ALT_SEARCH_PATH);
    lt_dladdsearchdir(ALT_SEARCH_PATH64);
#endif /* ifdef YAF_APPLABEL_PATH */

    /* create the hash table for library modules to library handle names */
    if (!hcreate((MAX_PAYLOAD_RULES * 20) / 100)) {
        g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_IMPL,
                    "Could not create load module hash table (%d)",
                    errno);
        return FALSE;
    }

    /* take all of the rules needed to parse the rule file and compile them
     * into a form that the regular expression engine can deal with */
    if (!ycPcreCompile(&ruleScanner, ruleScannerExp, PCRE_MULTILINE, err)) {
        g_prefix_error(err, "Could not build the rule scanner: ");
        return FALSE;
    }

    if (!ycPcreCompile(&pluginScanner, pluginScannerExp, PCRE_MULTILINE, err)) {
        g_prefix_error(err, "Could not build the plugin scanner: ");
        return FALSE;
    }

    if (!ycPcreCompile(&commentScanner, commentScannerExp,
                       PCRE_MULTILINE, err))
    {
        g_prefix_error(err, "Could not build the comment scanner: ");
        return FALSE;
    }

    if (!ycPcreCompile(&signatureScanner, signatureScannerExp,
                       PCRE_MULTILINE, err))
    {
        g_prefix_error(err, "Could not build the signature scanner: ");
        return FALSE;
    }

    if (!ycPcreCompile(&wordScanner, wordScannerExp, 0, err)) {
        g_prefix_error(err, "Failed to compile the word scanner: ");
        return FALSE;
    }

#if YFDEBUG_APPRULES_PARSER
    g_debug("\n============  DEBUGGING THE APPRULES PARSER  ============\n");
    str = g_string_sized_new(4096);
#endif

    /*
     * This is the loop that does the lion's share of the rule file
     * processing.  First read a hunk of the rule file, (this may include
     * multiple lines of stuff).
     *
     * This gets a little bit ugly, there are a number of issues that have to
     * handled: first, because there may be multiple lines (which is in fact
     * likely) it has to be able to work its way through the buffer, a single
     * pass of the buffer through the pcre engine simply won't cut it.  At the
     * end, it is possible to have part of a line, when this happens, it needs
     * to copy the leftover part of the read into the front of the buffer, and
     * then read again to fill in the rest of line.  (This detail limits a
     * single line to LINE_BUF_SIZE size.)
     */
    do {
        readLength =
            fread(lineBuffer + bufferOffset, 1, LINE_BUF_SIZE - 1 -
                  bufferOffset,
                  scriptFile);
        if (0 == readLength) {
            if (ferror(scriptFile)) {
                g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_IO,
                            "Could not read the rule file: %s",
                            strerror(errno));
#if YFDEBUG_APPRULES_PARSER
                g_string_free(str, TRUE);
#endif
                return FALSE;
            }
            /* nothing more could be read; any text carried over from the
             * previous pass can never be completed, so report it instead of
             * dropping it silently */
            if (bufferOffset >= (unsigned int)(LINE_BUF_SIZE - 1)) {
                /* the leftover fills the buffer, so fread() was asked for
                 * zero bytes: a single line is too long to parse */
                g_critical("A line in the application labeler rule file"
                           " is longer than %d characters; ignoring the"
                           " remainder of the file",
                           (int)(LINE_BUF_SIZE - 1));
            } else if (bufferOffset > 0) {
                g_critical("Unparsed text at the end of the application labeler"
                           " rule file!\n");
            }
            break;
        }

        /* fread only returns how much it read from the file - need to add
         * extra we put in the buffer from last read, if any */
        readLength += bufferOffset;

        /* the carried-over text is now accounted for; without this reset a
         * pass that consumes the whole buffer would leave a stale offset
         * behind and the next read would re-parse old bytes */
        bufferOffset = 0;

        /*
         * substringVects[] is used by the pcre library to indicate where the
         * matched substrings are in the input string, where [1] points to the
         * very end of the total match.  we use this to iterate through the
         * readBuffer, always reset it after a read
         */
        substringVects[0] = 0;
        substringVects[1] = 0;

        /* parse as much of the input buffer as possible */
        while (substringVects[1] < readLength) {
#if YFDEBUG_APPRULES_PARSER
            g_string_printf(str,
                            "readLength %d, startPosition %d, initialText ",
                            readLength, substringVects[1]);
            for (loop = 0; loop < 10; loop++) {
                /* stop at the end of the valid data, not one byte past it */
                if (loop + substringVects[1] >= readLength) {
                    break;
                }
                char curChar = *(lineBuffer + substringVects[1] + loop);
                /* <ctype.h> functions need an unsigned char value */
                if (isprint((unsigned char)curChar)
                    && !iscntrl((unsigned char)curChar))
                {
                    g_string_append_c(str, curChar);
                } else {
                    g_string_append_c(str, '.');
                }
            }
            g_debug("%s", str->str);
#endif /* if YFDEBUG_APPRULES_PARSER */

            /* get rid of CR's and LF's at the beginning, use the simple
             * manual method, they gum up the regex works */
            if ('\n' == *(lineBuffer + substringVects[1])
                || '\r' == *(lineBuffer + substringVects[1]))
            {
                do {
                    ++substringVects[1];
                } while ((substringVects[1] < readLength)
                         && ('\n' == *(lineBuffer + substringVects[1])
                             || '\r' == *(lineBuffer + substringVects[1])));
                continue;
            }

            /* first check for comments, and eliminate them */
            currentStartPos = substringVects[1];
            /* need to store the current offset, if we fail to match, we
             * get -1 in [1] */
            rc = pcre_exec(commentScanner, NULL, lineBuffer, readLength,
                           substringVects[1], PCRE_ANCHORED, substringVects,
                           NUM_SUBSTRING_VECTS);
            if (rc > 0) {
#if YFDEBUG_APPRULES_PARSER
                g_string_printf(str, "comment: start %d, end %d",
                                substringVects[0], substringVects[1]);
                pcre_get_substring(lineBuffer, substringVects, rc, 0,
                                   (const char **)&captString);
                g_string_append_printf(str, ", text \"%s\"", captString);
                pcre_free(captString);
                g_debug("%s", str->str);
#endif /* if YFDEBUG_APPRULES_PARSER */
                continue;
            }
            substringVects[1] = currentStartPos;

            /* scan the line to see if it is a regex statement, and get the
             * arguments if it is
             *
             * label APPLABEL regex REGEX */
            rc = pcre_exec(ruleScanner, NULL, lineBuffer, readLength,
                           substringVects[1], PCRE_ANCHORED, substringVects,
                           NUM_SUBSTRING_VECTS);
            if (rc > 0) {
                payloadScanRule_t *rule;
                pcre              *newRule;
                pcre_extra        *newExtra;

                rule = &ruleTable[numPayloadRules];

                /* get the first matched field from the regex rule expression
                 * (the label value) */
                pcre_get_substring(lineBuffer, substringVects, rc, 1,
                                   (const char **)&captString);
                rule->payloadLabelValue = strtoul(captString, NULL, 10);
#if YFDEBUG_APPRULES_PARSER
                g_string_printf(str, "regex: rule # %u, label value %lu ",
                                numPayloadRules,
                                strtoul(captString, NULL, 10));
#endif
                pcre_free(captString);

                /* get the second matched field from the regex rule expression
                 * (should be the regex); use pcre_compile() here since we are
                 * compiling the user's regex. */
                pcre_get_substring(lineBuffer, substringVects, rc, 2,
                                   (const char **)&captString);
#if YFDEBUG_APPRULES_PARSER
                g_string_append_printf(str, " regex \"%s\"", captString);
                g_debug("%s", str->str);
#endif
                newRule = pcre_compile(captString, 0, &errorString, &errorPos,
                                       NULL);
                if (NULL == newRule) {
                    eString = ycDisplayScannerRuleError(
                        "error in application regex",
                        errorString, captString, errorPos);
                    g_warning("Ignoring appLabel %u; %s",
                              rule->payloadLabelValue, eString->str);
                    g_string_free(eString, TRUE);
                } else {
                    newExtra = pcre_study(newRule, 0, &errorString);
                    rule->ruleArgs.regexFields.scannerExpression = newRule;
                    rule->ruleArgs.regexFields.scannerExtra = newExtra;
                    rule->ruleType = REGEX;
                    ycPortHashInsert(rule->payloadLabelValue, numPayloadRules);
                    numPayloadRules++;
                }
                pcre_free(captString);

                if (MAX_PAYLOAD_RULES == numPayloadRules) {
                    g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_LIMIT,
                                "maximum number of application labeler"
                                " rules has been reached");
#if YFDEBUG_APPRULES_PARSER
                    g_string_free(str, TRUE);
#endif
                    return FALSE;
                }

                continue;
            }
            substringVects[1] = currentStartPos;

            /* scan the line to see if it is a plugin statement, and handle
             * the arguments if it is
             *
             * label APPLABEL plugin PLUGIN_NAME FUNCTION [ARGS] */
            rc = pcre_exec(pluginScanner, NULL, lineBuffer, readLength,
                           substringVects[1], PCRE_ANCHORED, substringVects,
                           NUM_SUBSTRING_VECTS);
            if (rc > 0) {
                payloadScanRule_t  *rule;
                int                 numArgs;
                char              **argStrings;

                rule = &ruleTable[numPayloadRules];

                /* get the first matched field from the regex rule expression
                 * (the label value) */
                pcre_get_substring(lineBuffer, substringVects, rc, 1,
                                   (const char **)&captString);
                rule->payloadLabelValue = strtoul(captString, NULL, 10);
#if YFDEBUG_APPRULES_PARSER
                g_string_printf(str, "plugin: rule # %u, label value %lu",
                                numPayloadRules, strtoul(captString, NULL, 10));
#endif
                pcre_free(captString);

                /*
                 * get the second matched field, which should be the plugin
                 * name and all of its arguments, now we need to chunk that
                 * into an array of strings, ala argc, argv
                 */
                pcre_get_substring(lineBuffer, substringVects, rc, 2,
                                   (const char **)&captString);
                argStrings = ycChunkString(wordScanner, captString, &numArgs);

                if (numArgs < 2) {
                    g_critical("Error: not enough arguments to load and call "
                               "a plugin, at least a library name and function"
                               " name are needed\n");
                    pcre_free(captString);
                    pcre_get_substring(lineBuffer, substringVects, rc, 0,
                                       (const char **)&captString);
                    g_critical("Input line: \"%s\"\n", captString);

                    /* the rule is dropped, so the word list is not needed */
                    for (loop = 0; loop < numArgs; loop++) {
                        g_free((char *)(argStrings[loop]));
                    }
                    g_free(argStrings);
                } else {
                    ENTRY       newItem;
                    ENTRY      *foundItem;
                    lt_dlhandle modHandle;
                    lt_ptr      funcPtr;

#if YFDEBUG_APPRULES_PARSER
                    g_string_append(str, ", plugin args:");
                    for (loop = 0; loop < numArgs; loop++) {
                        g_string_append_printf(str, " \"%s\"",
                                               argStrings[loop]);
                    }
                    g_debug("%s", str->str);
#endif /* if YFDEBUG_APPRULES_PARSER */

                    /* the hash table keeps the key pointer it is given, so
                     * the key needs its own copy of the library name */
                    newItem.key = strdup(argStrings[0]);
                    if (NULL == newItem.key) {
                        /* report through 'err' so the cleanup below runs;
                         * g_error() would abort before reaching it */
                        g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_IMPL,
                                    "out of memory error");
                        for (loop = 0; loop < numArgs; loop++) {
                            g_free((char *)(argStrings[loop]));
                        }
                        g_free(argStrings);
                        pcre_free(captString);
#if YFDEBUG_APPRULES_PARSER
                        g_string_free(str, TRUE);
#endif
                        return FALSE;
                    }
                    newItem.data = NULL;
                    foundItem = hsearch(newItem, FIND);
                    if (NULL == foundItem) {
                        modHandle = lt_dlopenext(newItem.key);
                        if (NULL == modHandle) {
                            g_critical("Could not open library \"%s\": %s",
                                       argStrings[0], lt_dlerror());
                            g_critical("Search path set to %s",
                                       lt_dlgetsearchpath());
                            g_critical("Set LTDL_LIBRARY_PATH to correct"
                                       " location.");
                            /* the key was never entered into the table */
                            free(newItem.key);
                            for (loop = 0; loop < numArgs; loop++) {
                                g_free((char *)(argStrings[loop]));
                            }
                            g_free(argStrings);
                            pcre_free(captString);
                            continue;
                        } else {
#if YFDEBUG_APPRULES_PARSER
                            const lt_dlinfo *info = lt_dlgetinfo(modHandle);
                            g_debug("Loading %s plugin from %s",
                                    info->name, info->filename);
#endif
                        }
                        newItem.data = (void *)modHandle;
                        if (NULL == hsearch(newItem, ENTER)) {
                            /* the table is full; the module is still usable
                             * for this rule, it is just not remembered */
                            free(newItem.key);
                        }
                    } else {
                        modHandle = (lt_dlhandle)foundItem->data;
                        /* the table already holds a key for this library */
                        free(newItem.key);
                    }

                    funcPtr = lt_dlsym(modHandle, argStrings[1]);
                    if (NULL == funcPtr) {
                        g_critical("Could not find function \"%s\" in library"
                                   " \"%s\"\n", argStrings[1], argStrings[0]);
                        for (loop = 0; loop < numArgs; loop++) {
                            g_free((char *)(argStrings[loop]));
                        }
                        g_free(argStrings);
                        pcre_free(captString);
                        continue;
                    }

                    /* fill in the rule only once everything has succeeded,
                     * so a rejected rule never leaves the slot marked as a
                     * PLUGIN that points at freed arguments */
                    rule->ruleType = PLUGIN;
                    rule->ruleArgs.pluginArgs.numArgs = numArgs;
                    rule->ruleArgs.pluginArgs.pluginArgs = argStrings;
                    rule->ruleArgs.pluginArgs.handle = modHandle;
                    rule->ruleArgs.pluginArgs.func =
                        (ycScannerPlugin_fn)funcPtr;

                    ycPortHashInsert(rule->payloadLabelValue, numPayloadRules);
                    numPayloadRules++;
                }
                pcre_free(captString);

                if (MAX_PAYLOAD_RULES == numPayloadRules) {
                    g_warning("maximum number of rules has been reached\n");
#if YFDEBUG_APPRULES_PARSER
                    g_string_free(str, TRUE);
#endif
                    return TRUE;
                }
                continue;
            }
            substringVects[1] = currentStartPos;

            /* scan the line to see if it is a signature, and get the
             * arguments if it is
             *
             * label APPLABEL signature REGEX */
            rc = pcre_exec(signatureScanner, NULL, lineBuffer, readLength,
                           substringVects[1], PCRE_ANCHORED, substringVects,
                           NUM_SUBSTRING_VECTS);
            if (rc > 0) {
                pcre       *newRule;
                pcre_extra *newExtra;

                /* get the first matched field from the regex rule expression
                 * (the label value) */
                pcre_get_substring(lineBuffer, substringVects, rc, 1,
                                   (const char **)&captString);

                sigTable[numSigRules].payloadLabelValue =
                    strtoul(captString, NULL, 10);
#if YFDEBUG_APPRULES_PARSER
                g_string_printf(str, "signature: rule # %u, label value %lu",
                                numSigRules, strtoul(captString, NULL, 10));
#endif
                pcre_free(captString);

                /* get the second matched field from the regex rule expression
                 * (should be the regex); use pcre_compile() here since we are
                 * compiling the user's regex. */
                pcre_get_substring(lineBuffer, substringVects, rc, 2,
                                   (const char **)&captString);
#if YFDEBUG_APPRULES_PARSER
                g_string_append_printf(str, ", signature \"%s\"", captString);
                g_debug("%s", str->str);
#endif
                newRule = pcre_compile(captString, 0, &errorString, &errorPos,
                                       NULL);
                if (NULL == newRule) {
                    eString = ycDisplayScannerRuleError(
                        "error in signature regex",
                        errorString, captString, errorPos);
                    /* the signature is skipped and parsing goes on, so this
                     * is a warning; setting 'err' here would leave it set
                     * on a TRUE return */
                    g_warning("Ignoring signature %u: %s",
                              sigTable[numSigRules].payloadLabelValue,
                              eString->str);
                    g_string_free(eString, TRUE);
                } else {
                    newExtra = pcre_study(newRule, 0, &errorString);
                    sigTable[numSigRules].ruleArgs.regexFields.
                    scannerExpression = newRule;
                    sigTable[numSigRules].ruleArgs.regexFields.
                    scannerExtra = newExtra;
                    sigTable[numSigRules].ruleType = SIGNATURE;
                    numSigRules++;
                }

                pcre_free(captString);

                if (MAX_PAYLOAD_RULES == numSigRules) {
                    g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_LIMIT,
                                "maximum number of signature rules has "
                                "been reached");
#if YFDEBUG_APPRULES_PARSER
                    g_string_free(str, TRUE);
#endif
                    return FALSE;
                }
                continue;
            }
            substringVects[1] = currentStartPos;

            /*
             * check to see if we have partial text left over at the end of
             * the read buffer, if we copy it to the front of the read buffer,
             * and on the next read, read a little less to compensate for the
             * left over amount
             */
            if ((PCRE_ERROR_NOMATCH == rc) && (substringVects[1] < readLength)
                && !feof(scriptFile))
            {
                memmove(lineBuffer, lineBuffer + substringVects[1],
                        readLength - substringVects[1]);
                bufferOffset = readLength - substringVects[1];
                break;
            } else if (PCRE_ERROR_NOMATCH == rc && feof(scriptFile)) {
                /* this is an error, we have crap left over at the end of the
                 * file that we can't parse! */
                g_critical("Unparsed text at the end of the application labeler"
                           " rule file!\n");
                break;
            }
        }
    } while (!ferror(scriptFile) && !feof(scriptFile));

#if YFDEBUG_APPRULES_PARSER
    g_debug("\n============  FINISHED PARSING APPRULES FILE  ============\n");
    g_string_free(str, TRUE);
#endif

    /*
     * get rid of the module handle lookup hash; this creates a mem leak of
     * the module handles, they can't be freed any longer (although this is a
     * crappy hash, and iterating the hash is not possible....) */
    hdestroy();

    g_debug("Application Labeler accepted %u rules.", numPayloadRules);
    g_debug("Application Labeler accepted %u signatures.", numSigRules);
    pcre_free(ruleScanner);
    pcre_free(pluginScanner);
    pcre_free(commentScanner);
    pcre_free(signatureScanner);
    pcre_free(wordScanner);

    return TRUE;
}


/**
 * ycScanPayload
 *
 * Determines the label for a payload by trying the configured rules until
 * one matches; the first match wins, so rule order matters.
 *
 * Signature rules are tried first, and only when 'val' is the flow's
 * forward value; each signature is run against 'payloadData' and then, if
 * the flow has reverse payload, against that as well.  After the
 * signatures, the rule registered for the flow's source port is tried,
 * then the one for the destination port, then every remaining rule in
 * definition order.
 *
 * @param payloadData a pointer into the payload body
 * @param payloadSize the size of the payloadData in octets (aka bytes)
 * @param flow        the flow being labeled; supplies the ports and the
 *                    reverse payload
 * @param val         the flow value being labeled; handed to plugins
 *
 * @return a 16-bit int, usually mapped to a well known port, identifying
 *         the protocol; a plugin that returns a value other than 1
 *         supplies the label itself.  0 if no rule matched.
 */
uint16_t
ycScanPayload(
    const uint8_t  *payloadData,
    unsigned int    payloadSize,
    yfFlow_t       *flow,
    yfFlowVal_t    *val)
{
#define NUM_CAPT_VECTS 18
#if YFDEBUG_APPLABEL
    GString                 *gstr = g_string_sized_new(4096);
#endif
    const payloadScanRule_t *rule;
    int                      ovector[NUM_CAPT_VECTS];
    int                      srcRule = MAX_PAYLOAD_RULES + 1;
    int                      dstRule = MAX_PAYLOAD_RULES + 1;
    int                      result = 0;
    int                      idx = 0;

#if 0 && YFDEBUG_APPLABEL
    g_string_printf(gstr, "%s payload (paylen = %u)",
                    ((val == &flow->val) ? "fwd" : "rev"), payloadSize);
    ycPayloadPrinter(payloadData, payloadSize, 500, gstr);
#endif  /* 0 && YFDEBUG_APPLABEL */

    /* Signatures take precedence over the regular rules.  They are checked
     * once per flow (on the forward value) and cover both directions. */
    if ((val == &(flow->val)) && numSigRules > 0) {
        for (idx = 0; idx < (int)numSigRules; ++idx) {
            rule = &sigTable[idx];

            /* forward direction */
            result = pcre_exec(rule->ruleArgs.regexFields.scannerExpression,
                               rule->ruleArgs.regexFields.scannerExtra,
                               (char *)payloadData, payloadSize,
                               0, 0, ovector, NUM_CAPT_VECTS);

            /* reverse direction, only when forward did not match */
            if (result <= 0 && flow->rval.paylen) {
                result = pcre_exec(
                    rule->ruleArgs.regexFields.scannerExpression,
                    rule->ruleArgs.regexFields.scannerExtra,
                    (char *)flow->rval.payload, flow->rval.paylen,
                    0, 0, ovector, NUM_CAPT_VECTS);
            }

            if (result > 0) {
#if YFDEBUG_APPLABEL
                g_string_free(gstr, TRUE);
#endif
                return rule->payloadLabelValue;
            }
        }
    }

    /*
     *  Pass -2 tries the rule registered for the source port and pass -1
     *  the one for the destination port; passes 0 and up walk ruleTable[]
     *  in definition order, leaving out the entries already tried for the
     *  ports.
     */
    for (idx = -2; idx < (int)numPayloadRules; ++idx) {
        switch (idx) {
          case -2:
            /* ycPortHashSearch() returns MAX_PAYLOAD_RULES+1 when the port
             * is not found */
            srcRule = ycPortHashSearch(flow->key.sp);
            if (srcRule > MAX_PAYLOAD_RULES) {
                continue;
            }
            rule = &ruleTable[srcRule];
            break;
          case -1:
            dstRule = ycPortHashSearch(flow->key.dp);
            if (dstRule > MAX_PAYLOAD_RULES) {
                continue;
            }
            rule = &ruleTable[dstRule];
            break;
          default:
            if (idx == srcRule || idx == dstRule) {
                /* this rule was tried during the port passes */
                continue;
            }
            rule = &ruleTable[idx];
            break;
        }

        if (REGEX == rule->ruleType) {
            result = pcre_exec(rule->ruleArgs.regexFields.scannerExpression,
                               rule->ruleArgs.regexFields.scannerExtra,
                               (char *)payloadData, payloadSize,
                               0, 0, ovector, NUM_CAPT_VECTS);
            if (result <= 0) {
                continue;
            }
#if YFDEBUG_APPLABEL
            g_string_printf(
                gstr, "protocol match (%u, %u, regex [%d, %d])",
                rule->payloadLabelValue, result, ovector[0], ovector[1]);
            ycPayloadPrinter(payloadData, payloadSize, 20, gstr);
            g_string_free(gstr, TRUE);
#endif
            return rule->payloadLabelValue;
        }

        if (PLUGIN != rule->ruleType) {
            /* neither a regex nor a plugin; nothing to evaluate */
            continue;
        }

        result = (rule->ruleArgs.pluginArgs.func(
                      rule->ruleArgs.pluginArgs.numArgs,
                      rule->ruleArgs.pluginArgs.pluginArgs,
                      payloadData, payloadSize, flow, val));
        if (result <= 0) {
            continue;
        }
#if YFDEBUG_APPLABEL
        g_string_printf(gstr, "protocol match (%u, %u):",
                        rule->payloadLabelValue, result);
        ycPayloadPrinter(payloadData, payloadSize, 20, gstr);
        g_string_free(gstr, TRUE);
#endif
        /* A plugin can identify more than one protocol: a result of 1
         * means "use the label from the conf file", any other positive
         * result is the label itself. */
        if (1 == result) {
            return rule->payloadLabelValue;
        }
        return result;
    }

#if YFDEBUG_APPLABEL
    if (NULL != payloadData) {
        g_string_assign(gstr, "non-matching payload data is");
        ycPayloadPrinter(payloadData, payloadSize, 40, gstr);
    } else {
        g_debug("no payload present");
    }
    g_string_free(gstr, TRUE);
#endif /* if YFDEBUG_APPLABEL */

    return 0;
}


/**
 * ycGetRuleType
 *
 * Looks up the rule registered for a port in the port hash and reports
 * what kind of rule it is.
 *
 * @param port the port number to look up
 *
 * @return the ruleType member of the matching ruleTable entry, or EMPTY
 *         when ycPortHashSearch() reports that the port has no entry
 *         (signalled by a result of MAX_PAYLOAD_RULES + 1)
 */
int
ycGetRuleType(
    uint16_t   port)
{
    const int ruleIndex = ycPortHashSearch(port);

    if ((MAX_PAYLOAD_RULES + 1) == ruleIndex) {
        /* the port hash has no entry for this port */
        return EMPTY;
    }

    return ruleTable[ruleIndex].ruleType;
}


#if YFDEBUG_APPLABEL
/**
 * ycPayloadPrinter
 *
 * Debugging aid: appends a quoted dump of the first bytes of the payload
 * to a caller-supplied string and emits the result with g_debug().  Bytes
 * that are not printable are shown as '.'.  Useful for checking whether
 * the app labeler sees what is expected when adding new protocols.
 *
 * @param payloadData a pointer to the payload array; when NULL an empty
 *        pair of quotes is printed
 * @param payloadSize the size of the payloadData array
 * @param numPrint maximum number of payload bytes to print; limited to
 *        payloadSize
 * @param gstr string that already holds the text to print before the
 *        dump; the dump is appended to it.  The string is not freed here.
 *
 */
static void
ycPayloadPrinter(
    const uint8_t *payloadData,
    unsigned int   payloadSize,
    unsigned int   numPrint,
    GString       *gstr)
{
    unsigned int count;
    unsigned int pos;
    uint8_t      octet;

    g_string_append(gstr, ": \"");

    if (NULL != payloadData) {
        /* never read past the end of the payload */
        count = MIN(numPrint, payloadSize);
        for (pos = 0; pos < count; ++pos) {
            octet = payloadData[pos];
            if (!isprint(octet) || iscntrl(octet)) {
                g_string_append_c(gstr, '.');
            } else {
                g_string_append_c(gstr, octet);
            }
        }
    }
    g_string_append_c(gstr, '"');

    g_debug("%s", gstr->str);
}
#endif /* if YFDEBUG_APPLABEL */


/*
 *    Compiles `regex` with `options` using pcre_compile() and stores the
 *    result in `compiled`.
 *
 *    Returns TRUE on success.  On failure, `compiled` is set to NULL, `err`
 *    is filled with a message that names `regex_var_name`, shows the PCRE
 *    error text and offset, and marks the failing position within the
 *    regex; FALSE is returned.
 *
 *    Function called by the macro ycPcreCompile().
 */
gboolean
ycPcreCompile2(
    pcre       **compiled,
    const char  *regex_var_name,
    const char  *regex,
    int          options,
    GError     **err)
{
    const char *pcreMessage;
    int         pcrePosition;
    pcre       *result;

    result = pcre_compile(regex, options, &pcreMessage, &pcrePosition, NULL);
    *compiled = result;
    if (NULL != result) {
        return TRUE;
    }

    /* the "%*s" pads with pcrePosition blanks so that the caret lines up
     * under the offending character of the regex printed one line above */
    g_set_error(err, YAF_ERROR_DOMAIN, YAF_ERROR_INTERNAL,
                ("Error compiling regex %s:"
                 "\n  error:  %s at offset %d"
                 "\n  regex:  %s"
                 "\n  offset: %*s^--HERE"),
                regex_var_name, pcreMessage, pcrePosition,
                regex, pcrePosition, "");
    return FALSE;
}


/**
 * ycDisplayScannerRuleError
 *
 * Formats an error report for a scanner rule (also used for the built-in
 * rules) that failed to compile with the PCRE library.  The report holds
 * the description, the PCRE message and position, the regex itself, and a
 * caret marking the failing position.
 *
 * @param descrip a brief description prefixed before the error output
 * @param errorMsg the error message returned from the PCRE library
 * @param regex the regular expression passed into PCRE compile
 * @param errorPos the position where the expression failed (returned from
 *        pcre_compile)
 *
 * @return a newly allocated GString holding the formatted report
 *
 */
static GString *
ycDisplayScannerRuleError(
    const char    *descrip,
    const char    *errorMsg,
    const char    *regex,
    int            errorPos)
{
    GString *report = g_string_sized_new(512);

    /* "%*s" emits errorPos blanks so the caret sits under the bad spot */
    g_string_printf(report,
                    "%s\n   \tmessage:  %s at position %d\n"
                    "\tregex:    %s\n"
                    "\tposition: %*s^--HERE\n",
                    descrip, errorMsg, errorPos,
                    regex,
                    errorPos, "");

    return report;
}


/**
 * ycChunkString
 *
 *  Turns a single string buffer (char *) into a set of separate words, so
 *  that an argument list given as one parameter can be handed on as
 *  something that looks like the standard C argc, argv pair.
 *
 *  The string is scanned once: every non-empty match of wordScanner
 *  becomes one word.  Zero-length matches are skipped so that a pattern
 *  able to match the empty string cannot stall the scan.
 *
 *  @param  wordScanner the regex used to split the string
 *  @param  sampleString the input string, with multiple words as a single
 *          input buffer
 *  @param  argNum on output the number of arguments split out from
 *          sampleString; 0 on error or no arguments
 *  @return a dynamically allocated array holding *argNum strings, each
 *          one a g_strdup() copy of a matched word; NULL when no word was
 *          found.  The array may have room for more than *argNum entries;
 *          only the first *argNum are set.
 *
 */
static char **
ycChunkString(
    pcre        *wordScanner,
    const char  *sampleString,
    int         *argNum)
{
    char      **argStrings = NULL;
    int         substringVects[NUM_SUBSTRING_VECTS];
    const char *captString;
    const int   sampleLen = (int)strlen(sampleString);
    int         capacity = 0;
    int         offset = 0;
    int         rc;

    /* FIXME: Replace with g_strsplit_set() */

    *argNum = 0;

    while (offset <= sampleLen) {
        rc = pcre_exec(wordScanner, NULL, sampleString, sampleLen,
                       offset, 0, substringVects, NUM_SUBSTRING_VECTS);
        if (rc <= 0) {
            /* no further match (or a matching error): done */
            break;
        }

        if (substringVects[1] <= substringVects[0]) {
            /* zero-length match: step past it, otherwise the next search
             * would start at the same place and find it again forever */
            offset = substringVects[0] + 1;
            continue;
        }
        /* resume the search right after this word */
        offset = substringVects[1];

        if (pcre_get_substring(sampleString, substringVects, rc, 0,
                               &captString) < 0)
        {
            /* extraction failed; captString is not valid, so stop with
             * the words collected so far */
            break;
        }

        if (*argNum == capacity) {
            /* grow geometrically so the string is only scanned once */
            capacity = (capacity > 0) ? (2 * capacity) : 8;
            argStrings = g_renew(char *, argStrings, capacity);
        }
        argStrings[*argNum] = g_strdup(captString);
        ++(*argNum);
        pcre_free_substring(captString);
    }

    return argStrings;
}


/**
 * ycDnsScanRebuildHeader
 *
 * This function handles the endianness of the received message and
 * deals with machine alignment issues by not mapping a network
 * octet stream directly into the DNS structure.
 *
 * The first sizeof(ycDnsScanMessageHeader_t) bytes of the payload are
 * copied into the header and every 16-bit word is converted from network
 * to host byte order.  The individual flag members are then set from the
 * 16-bit word found at payload offset 2.
 *
 * @param payload a network stream capture; at least
 *        sizeof(ycDnsScanMessageHeader_t) bytes are read from it
 * @param header a pointer to a client allocated dns message
 *        header structure
 *
 *
 */
void
ycDnsScanRebuildHeader(
    const uint8_t             *payload,
    ycDnsScanMessageHeader_t  *header)
{
    /* NOTE(review): treating the header as an array of 16-bit words
     * assumes the structure consists solely of 16-bit wide units --
     * confirm against the structure definition */
    uint16_t    *tempArray = (uint16_t *)header;
    uint16_t     bitmasks;
    unsigned int loop;

    /* build the flags word (network order: high octet first) from single
     * octets; the payload pointer carries no alignment guarantee, so it
     * must not be dereferenced as a uint16_t */
    bitmasks = (uint16_t)(((unsigned int)payload[2] << 8) | payload[3]);

    memcpy(header, payload, sizeof(ycDnsScanMessageHeader_t));
    for (loop = 0; loop < sizeof(ycDnsScanMessageHeader_t) / sizeof(uint16_t);
         loop++)
    {
        tempArray[loop] = ntohs(tempArray[loop]);
    }

    /* the flag members are assigned explicitly, replacing whatever the
     * word-wise conversion above left in that part of the header */
    header->qr = bitmasks & 0x8000 ? 1 : 0;
    header->opcode = (bitmasks & 0x7800) >> 11;
    header->aa = bitmasks & 0x0400 ? 1 : 0;
    header->tc = bitmasks & 0x0200 ? 1 : 0;
    header->rd = bitmasks & 0x0100 ? 1 : 0;
    header->ra = bitmasks & 0x0080 ? 1 : 0;
    header->z = bitmasks & 0x0040 ? 1 : 0;
    /* don't think we care about these
     * header->ad = bitmasks & 0x0020 ? 1 : 0;
     * header->cd = bitmasks & 0x0010 ? 1 : 0; */
    header->rcode = bitmasks & 0x000f;
}


/**
 *
 * yfRemoveCRC
 *
 *
 * This function removes the Cyclic Redundancy Check codes
 * from a payload, in order to do DPI.
 *
 * The input is treated as a sequence of block_size data bytes, each
 * followed by crc_length CRC bytes.  Data bytes are copied to dst and the
 * CRC bytes are skipped.  A final shorter remainder is copied without its
 * last crc_length bytes.  Copying stops early when the next piece would
 * not leave at least one free byte in dst.
 *
 * @param start start of payload that contains CRCs
 * @param length length of payload that contains CRCs
 * @param dst destination buffer to copy payload without CRCs
 * @param dst_length on input the length of the destination buffer; on
 *        output the number of bytes written to dst
 * @param block_size size of blocks of data
 * @param crc_length size of crc codes
 *
 * When block_size or crc_length is negative, or both are zero, nothing is
 * copied and *dst_length is set to 0.
 *
 */
void
yfRemoveCRC(
    const uint8_t  *start,
    size_t          length,
    uint8_t        *dst,
    size_t         *dst_length,
    int             block_size,
    int             crc_length)
{
    size_t blockLen;
    size_t crcLen;
    size_t stride;
    /* read position in start; must be as wide as length, a 16-bit
     * counter would wrap around on inputs beyond 64 KiB */
    size_t offset = 0;
    size_t curlen = 0;

    if (block_size < 0 || crc_length < 0
        || (0 == block_size && 0 == crc_length))
    {
        /* negative sizes are meaningless and a zero stride would never
         * consume any input */
        *dst_length = 0;
        return;
    }
    blockLen = (size_t)block_size;
    crcLen = (size_t)crc_length;
    stride = blockLen + crcLen;

    /* copy every block that is followed by more input */
    while ((length > stride) && (curlen + blockLen < *dst_length)) {
        memcpy(dst + curlen, start + offset, blockLen);
        curlen += blockLen;
        offset += stride;
        length -= stride;
    }

    /* copy what is left, dropping its trailing CRC */
    if ((length > crcLen) && (curlen + length < *dst_length)) {
        memcpy(dst + curlen, start + offset, length - crcLen);
        curlen += length - crcLen;
    }

    *dst_length = curlen;
}


#endif /* if YAF_ENABLE_APPLABEL */
