#ifndef PROCESS
#define PROCESS

/*
    Declarations for reading PRO/CON text messages, turning them into
    fixed-size term vectors, splitting them into train/test sets and
    writing them out (weka arff or csv).

    NOTE(review): struct message has a field named `class`, which is a
    C++ keyword, so this header can only be included from C code.
*/

/* Class labels for a message. */
#define PRO 1
#define CON 0
/* Number of entries in message.prediction_probability. */
#define PRO_CON_OUTPUT 2
/* NOTE(review): presumably the label for an unclassified message;
   it shares the value 2 with PRO_CON_OUTPUT -- confirm intended use. */
#define UNKNOWN 2
/* Number of entries in every term vector and document-count array. */
#define VECTOR_SIZE 1000
/* NOTE(review): not used in this header; presumably the maximum length
   of a single vector term -- confirm against the implementation. */
#define MAX_TERM_LENGTH 128

/*
    A struct that contains a text message and a PRO or CON classification
*/
typedef struct message {
    char * text;                                    /* the message text */
    double text_vector[VECTOR_SIZE];                /* vector representation of text */
    int class;                                      /* PRO or CON (presumably UNKNOWN when unlabeled) */
    int prediction;                                 /* presumably the predicted class -- confirm */
    double prediction_probability[PRO_CON_OUTPUT];  /* presumably one probability per class -- confirm */
} message;

/*
    A struct that contains all of the messages used for training or testing
*/
typedef struct data {
    message ** instances;                     /* array of pointers to messages */
    char ** vector_terms;                     /* terms that make up the vector representation */
    int vector_document_counts[VECTOR_SIZE];  /* presumably per-term document counts -- confirm */
    int count;                                /* presumably the number of entries in instances -- confirm */
} data;

/*
    A function that will free all the memory allocated for a data struct
    @param to_free The data struct that should be free'd
    @return 0 if it is free'd successfully
*/
int free_data(data * to_free);

/*
    A function that takes a file name and returns a data struct that contains
    the messages (and classifications if provided)
    @param file: The name of a file of data to be read into a data structure.
    @return A pointer to a struct containing an array of message structs and
            their classifications
*/
data * read_data(char * file);

/*
    Output data into a weka format
    @param print_data: The data struct to be printed
    @param out_file: The file where the weka arff should be written
    @return 0 if successfully output
*/
int weka_output(data * print_data, char * out_file);

/*
    Output data into a csv file with 1 instance per line
    @param print_data: The data struct to be printed
    @param out_file: The file where the csv should be written
    @return 0 if successfully output
*/
int csv_output(data * print_data, char * out_file);

/*
    A function for escaping single quotes in a string.
    Based off generic code found http://creativeandcritical.net/str-replace-c/
    Modified to only escape 's
    @param str The string to escape
    @return An escaped string.
            NOTE(review): presumably newly allocated and owned by the
            caller -- confirm against the implementation.
*/
char * escape_single_quote(const char *str);

/*
    A function that reads in a collection of stop words from a file with
    1 word per line.
    @param filename The name of the file to be parsed
    @param word_count A pointer where the number of stop words should be stored
    @return the array of words
*/
char ** load_stop_words(char * filename, int * word_count);

/*
    A function that takes a dataset and a percentage that should be reserved
    for testing. The function requires 2 data pointers for storing the
    resulting train and test sets
    @param dataset The data to be split
    @param percent The percent of the data to be used for TESTING
    @param train The data that will be used for training
    @param test The data that will be used for testing
    @return 0 if the new datasets are created without issue. non zero otherwise.
*/
int train_test_split(const data * dataset, const int percent, data * train, data * test);

/*
    A function that parses through all the supplied messages in a data struct
    and determines the most relevant terms in the training data. These will be
    used in the construction of a vector representation of any specific message
    @param dataset The collection of messages to be considered
    @param stop_words an array of stopwords to ignore
    @param stop_word_count the number of stop words.
    @param size NOTE(review): undocumented in the original; presumably the
                number of vector terms to use (at most VECTOR_SIZE) -- confirm
    @return 0 if the vector representation is found and the vectors created
            for all messages
*/
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);

/*
    A simple function that compares word counts based on an array of words
    and a second array of the counts.
    NOTE(review): the name and the (const void *, const void *) signature
    suggest a qsort-style string comparator, while the description above
    talks about word counts -- check the implementation.
    @param a word 1
    @param b word 2
    @return -1, 0, 1 if a is less than, equal to, or greater than b
*/
int compare_strings(const void * a, const void * b);

/*
    A function that parses through all the supplied messages in a data struct
    and determines the vector representation based on the supplied vector terms
    @param dataset The collection of messages to be considered
    @param vector_terms The array of terms that make up the vector
    @param vector_document_counts NOTE(review): undocumented in the original;
                presumably the per-term document counts matching
                vector_terms -- confirm
    @param size NOTE(review): undocumented in the original; presumably the
                number of entries in vector_terms -- confirm
    @return 0 if the vectors created for all messages
*/
int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);

#endif