backprop/process.h

/* Include guard: PROCESS is defined the first time this header is included. */
#ifndef PROCESS
#define PROCESS

/* Class labels used for the PRO or CON classification of a message. */
#define PRO 1
#define CON 0
/* Length of message.prediction_probability. */
#define PRO_CON_OUTPUT 2
/* NOTE(review): presumably the label for a message with no known class -- confirm.
   It has the same numeric value as PRO_CON_OUTPUT. */
#define UNKNOWN 2
/* Length of message.text_vector and data.vector_document_counts. */
#define VECTOR_SIZE 1000
/* NOTE(review): not referenced in this header; presumably the maximum length of a
   single vector term -- confirm against the implementation. */
#define MAX_TERM_LENGTH 128

/*
	A struct that contains a text message, its PRO or CON classification and
	the values a classifier produced for it.
*/
typedef struct message {
	/* The text of the message. */
	char * text;
	/* Vector representation of the message; VECTOR_SIZE entries. */
	double text_vector[VECTOR_SIZE];
	/* The PRO or CON classification.
	   NOTE(review): presumably UNKNOWN when no classification was provided -- confirm. */
	int class;
	/* NOTE(review): presumably the predicted class (PRO or CON) -- confirm. */
	int prediction;
	/* PRO_CON_OUTPUT entries.
	   NOTE(review): presumably one probability per class, indexed by CON/PRO -- confirm. */
	double prediction_probability[PRO_CON_OUTPUT];
} message;

/*
	A struct that contains all of the messages used for training or for testing
*/
typedef struct data {
	/* Array of pointers to the message structs in this data set. */
	message ** instances;
	/* Array of the terms that make up the vector representation. */
	char ** vector_terms;
	/* VECTOR_SIZE entries.
	   NOTE(review): presumably the number of documents containing each vector term -- confirm. */
	int vector_document_counts[VECTOR_SIZE];
	/* NOTE(review): presumably the number of entries in instances -- confirm. */
	int count;
} data;

/*
	A function that frees all the memory allocated for a data struct.
	NOTE(review): whether the struct itself, its messages and its vector terms
	are all released is not visible from this header -- confirm in the implementation.
	@param to_free The data struct that should be freed
	@return 0 if it is freed successfully
*/
int free_data(data * to_free);

/*
	A function that takes a file name and returns a data struct that contains the
	messages read from that file (and their classifications, if provided).
	NOTE(review): the expected file format and the behaviour on failure are not
	visible from this header -- confirm in the implementation.
	@param file: The name of a file of data to be read into a data structure.
	@return A pointer to a struct containing an array of message structs and their
		classifications
*/
data * read_data(char * file);

/*
	Output data in the weka arff format
	@param print_data: The data struct to be printed
	@param out_file: The file where the weka arff should be written
	@return 0 if successfully output
*/
int weka_output(data * print_data, char * out_file);

/*
	Output data into a csv file with 1 instance per line
	@param print_data: The data struct to be printed
	@param out_file: The file where the csv should be written
	@return 0 if successfully output
*/
int csv_output(data * print_data, char * out_file);
/*
	A function for escaping single quotes in a string.
	Based off generic code found at http://creativeandcritical.net/str-replace-c/
	Modified to only escape single quotes (').
	NOTE(review): the result is presumably a newly allocated string that the
	caller must free -- confirm in the implementation.
	@param str The string to escape
	@return An escaped string.
*/
char * escape_single_quote(const char *str);

/*
	A function that reads in a collection of stop words from a file with 1 word per line.
	NOTE(review): ownership of the returned array and the behaviour when the file
	cannot be read are not visible from this header -- confirm in the implementation.
	@param filename The name of the file to be parsed
	@param word_count A pointer where the number of stop words should be stored
	@return the array of words
*/
char ** load_stop_words(char * filename, int * word_count);

/*
	A function that takes a dataset and the percentage of it that should be
	reserved for testing, and splits it into a training set and a test set.
	The caller supplies 2 data pointers for storing the resulting train and test sets.
	NOTE(review): percent is presumably in the range 0-100, and whether train/test
	share message pointers with dataset or copy them is not visible here -- confirm.
	@param dataset The data to be split
	@param percent The percent of the data to be used for TESTING
	@param train The data that will be used for training
	@param test The data that will be used for testing
	@return 0 if the new datasets are created without issue. non zero otherwise.
*/
int train_test_split(const data * dataset, const int percent, data * train, data * test);

/*
	A function that parses through all the supplied messages in a data struct
	and determines the most relevant terms in the training data. These will be
	used in the construction of a vector representation of any specific message
	@param dataset The collection of messages to be considered
	@param stop_words an array of stopwords to ignore
	@param stop_word_count the number of stop words.
	@param size NOTE(review): undocumented in the original; presumably the number
		of terms in the vector (at most VECTOR_SIZE) -- confirm.
	@return 0 if the vector representation is found and the vectors are created
		for all messages
*/
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);

/*
	A simple comparison function. The original description says it compares
	word counts based on an array of words and a second array of the counts.
	NOTE(review): the signature matches the comparator expected by qsort/bsearch,
	and it only receives the two elements being compared; what a and b actually
	point to (strings, pointers to strings, or indices) is not visible here -- confirm.
	@param a word 1
	@param b word 2
	@return -1, 0, 1 if a is less than, equal to, or greater than b
*/
int compare_strings(const void * a, const void * b);

/*
	A function that parses through all the supplied messages in a data struct
	and determines the vector representation based on the supplied vector terms
	@param dataset The collection of messages to be considered
	@param vector_terms The array of terms that make up the vector
	@param vector_document_counts NOTE(review): undocumented in the original;
		presumably the per-term document counts, parallel to vector_terms -- confirm.
	@param size NOTE(review): undocumented in the original; presumably the number
		of entries in vector_terms and vector_document_counts -- confirm.
	@return 0 if the vectors are created for all messages
*/
int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);

#endif /* PROCESS */