/*
  Declarations for reading PRO/CON classified text messages from a file,
  splitting them into train and test sets, building term-vector
  representations of the messages, and writing the result out in weka or
  csv form.
*/
#ifndef PROCESS
#define PROCESS

/* Classification labels for a message. */
#define PRO 1
#define CON 0

/* Number of entries in message.prediction_probability. */
#define PRO_CON_OUTPUT 2

/*
  NOTE(review): presumably the class value of a message that has no
  PRO/CON label - confirm against the implementation. Shares the value 2
  with PRO_CON_OUTPUT, but the two constants are unrelated.
*/
#define UNKNOWN 2

/*
  Length of message.text_vector and of data.vector_document_counts.
*/
#define VECTOR_SIZE 1000

/*
  NOTE(review): not referenced in this header; presumably the maximum
  length of a single vector term - confirm against the implementation.
*/
#define MAX_TERM_LENGTH 128

/*
|
|
A struct that contains a text message and a PRO or CON classification
|
|
*/
|
|
typedef struct message {
|
|
char * text;
|
|
double text_vector[VECTOR_SIZE];
|
|
int class;
|
|
int prediction;
|
|
double prediction_probability[PRO_CON_OUTPUT];
|
|
} message;
|
|
|
|
/*
|
|
A struct that contains all of the messages used for training for testing
|
|
*/
|
|
typedef struct data {
|
|
message ** instances;
|
|
char ** vector_terms;
|
|
int vector_document_counts[VECTOR_SIZE];
|
|
int count;
|
|
} data;
|
|
|
|
/*
|
|
A function that will free all the memory allocated for a data struct
|
|
@param to_free The data struct that should be free'd
|
|
@return 0 if it is free'd successfully
|
|
*/
|
|
int free_data(data * to_free);
|
|
|
|
/*
|
|
A function that takes a file name return a data struct that contains the messages
|
|
(and classifications if provided)
|
|
@param file: The name of a file of data to be read into a data structure.
|
|
@return A pointer to a struct containing an array of message structs and their
|
|
classifications
|
|
*/
|
|
data * read_data(char * file);
|
|
|
|
/*
|
|
Output data into a weka format
|
|
@param print_data: The data struct to be printed
|
|
@param out_file: The file where the weka arff should be written
|
|
@return 0 if successfully output
|
|
*/
|
|
int weka_output(data * print_data, char * out_file);
|
|
|
|
/*
|
|
Output data into a csv file with 1 instance per line
|
|
@param print_data: The data struct to be printed
|
|
@param out_file: The file where the weka arff should be written
|
|
@return 0 if successfully output
|
|
*/
|
|
int csv_output(data * print_data, char * out_file);
|
|
/*
  A function for escaping single quotes in a string.
  Based off generic code found http://creativeandcritical.net/str-replace-c/
  Modified to only escape single quotes (')
  @param str The string to escape
  @return An escaped string.
  NOTE(review): presumably a newly allocated string that the caller must
  free - confirm in the implementation.
*/
char *escape_single_quote(const char *str);

/*
  A function that reads in a collection of stop words from a file with
  1 word per line.
  @param filename The name of the file to be parsed
  @param word_count A pointer where the number of stop words should be stored
  @return the array of words
  NOTE(review): who frees the returned array and its strings is not stated
  here - confirm in the implementation.
*/
char **load_stop_words(char *filename, int *word_count);

/*
|
|
A function that takes a dataset, a percentage that should be reserved for testing.
|
|
The function requires 2 data pointers for storing the resulting train and test sets
|
|
@param dataset The data to be split
|
|
@param percent The percent of the data to be used for TESTING
|
|
@param train The data that will be used for training
|
|
@param test The data that will be used for testing
|
|
@return 0 if the new datasets are created without issue. non zero otherwise.
|
|
*/
|
|
int train_test_split(const data * dataset, const int percent, data * train, data * test);
|
|
|
|
/*
|
|
A function that parses through all the supplied messages in a data struct
|
|
and determines the most relevant terms in the training data. These will be
|
|
used in the construction of a vector representation of any specfic message
|
|
@param dataset The collection of messages to be considered
|
|
@param stop_words an array of stopwords to ignore
|
|
@param stop_word_count the number of stop words.
|
|
@return 0 if the vector representation is found and the vectors created for
|
|
all messages
|
|
*/
|
|
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);
|
|
|
|
/*
  A simple function that compares word counts based on an array of words
  and a second array of the counts.
  @param a word 1
  @param b word 2
  @return -1, 0, 1 if a is less than, equal to, or greater than b
  NOTE(review): the signature matches the comparator shape used by qsort and
  bsearch. Whether a and b point at char * elements or at the characters
  themselves, and where the "array of counts" mentioned above comes from, is
  not visible in this header - confirm in the implementation.
*/
int compare_strings(const void *a, const void *b);

/*
|
|
A function that parses through all the supplied messages in a data struct
|
|
and determines the vector representation based on the supplied vector terms
|
|
@param dataset The collection of messages to be considered
|
|
@param vector_terms The array of terms that make up the vector
|
|
@return 0 if the vectors created for all messages
|
|
*/
|
|
int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);
|
|
|
|
#endif /* PROCESS */