initial github commit

Implementation of backprop in C using Grand Central Dispatch and Blocks
This commit is contained in:
James Griffin
2014-08-06 15:12:09 -03:00
commit 6f23634e32
6 changed files with 1926 additions and 0 deletions

121
process.h Normal file
View File

@@ -0,0 +1,121 @@
#ifndef PROCESS
#define PROCESS
/*
Class labels. The message struct in this header is documented as holding
a PRO or CON classification.
*/
#define PRO 1
#define CON 0
/*
Length of the prediction_probability array stored in every message.
*/
#define PRO_CON_OUTPUT 2
/*
NOTE(review): presumably the label for a message whose class is not known.
It has the same value (2) as PRO_CON_OUTPUT -- confirm that is intentional.
*/
#define UNKNOWN 2
/*
Length of each message's text_vector and of the vector_document_counts
array in the data struct.
*/
#define VECTOR_SIZE 1000
/*
NOTE(review): presumably the maximum length of a single vector term. It is
not referenced anywhere in this header -- confirm against the implementation.
*/
#define MAX_TERM_LENGTH 128
/*
A struct that contains a text message and a PRO or CON classification.
NOTE(review): the member name "class" is a keyword in C++, so this header
can only be included from C translation units as written.
*/
typedef struct message {
/* The text of the message */
char * text;
/* Vector representation of the message; one entry per vector term (VECTOR_SIZE entries) */
double text_vector[VECTOR_SIZE];
/* The PRO or CON classification (presumably UNKNOWN when none was provided -- confirm) */
int class;
/* NOTE(review): presumably the class predicted by the classifier -- confirm */
int prediction;
/* NOTE(review): presumably one probability per output class (PRO_CON_OUTPUT entries) -- confirm */
double prediction_probability[PRO_CON_OUTPUT];
} message;
/*
A struct that contains all of the messages used for training or for testing
*/
typedef struct data {
/* Array of pointers to the message structs in this dataset */
message ** instances;
/* The array of terms that make up the vector representation */
char ** vector_terms;
/* NOTE(review): presumably the number of documents containing each vector term (VECTOR_SIZE entries) -- confirm */
int vector_document_counts[VECTOR_SIZE];
/* NOTE(review): presumably the number of entries in instances -- confirm */
int count;
} data;
/*
A function that frees all the memory allocated for a data struct
@param to_free The data struct that should be freed
@return 0 if it is freed successfully
*/
int free_data(data * to_free);
/*
A function that takes a file name and returns a data struct that contains the
messages (and classifications if provided)
@param file: The name of a file of data to be read into a data structure.
@return A pointer to a struct containing an array of message structs and their
classifications
NOTE(review): presumably the result should be released with free_data -- confirm.
*/
data * read_data(char * file);
/*
Output data to a file in the weka arff format
@param print_data: The data struct to be printed
@param out_file: The file where the weka arff should be written
@return 0 if successfully output
*/
int weka_output(data * print_data, char * out_file);
/*
Output data into a csv file with 1 instance per line
@param print_data: The data struct to be printed
@param out_file: The file where the csv should be written
@return 0 if successfully output
*/
int csv_output(data * print_data, char * out_file);
/*
A function for escaping single quotes in a string.
Based off generic code found at http://creativeandcritical.net/str-replace-c/
Modified to only escape single quotes (')
@param str The string to escape
@return An escaped string.
NOTE(review): presumably the returned string is newly allocated and owned by
the caller -- confirm against the implementation.
*/
char * escape_single_quote(const char *str);
/*
A function that reads in a collection of stop words from a file with 1 word per line.
@param filename The name of the file to be parsed
@param word_count A pointer to where the number of stop words should be stored
@return The array of stop words
*/
char ** load_stop_words(char * filename, int * word_count);
/*
A function that takes a dataset and a percentage that should be reserved for testing.
The function requires 2 data pointers for storing the resulting train and test sets
@param dataset The data to be split
@param percent The percent of the data to be used for TESTING
@param train The data that will be used for training
@param test The data that will be used for testing
@return 0 if the new datasets are created without issue, non-zero otherwise.
*/
int train_test_split(const data * dataset, const int percent, data * train, data * test);
/*
A function that parses through all the supplied messages in a data struct
and determines the most relevant terms in the training data. These will be
used in the construction of a vector representation of any specific message.
Note: the function name is misspelled ("represntation"); the spelling is part
of the public interface.
@param dataset The collection of messages to be considered
@param stop_words An array of stop words to ignore
@param stop_word_count The number of stop words.
@param size NOTE(review): presumably the number of terms in the vector
(at most VECTOR_SIZE) -- confirm against the implementation
@return 0 if the vector representation is found and the vectors created for
all messages
*/
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);
/*
A simple function that compares word counts based on an array of words
and a second array of the counts.
NOTE(review): the signature is the one expected of a qsort comparator; what
a and b actually point to (words or counts) cannot be determined from this
header -- confirm against the implementation.
@param a word 1
@param b word 2
@return -1, 0, 1 if a is less than, equal to, or greater than b
*/
int compare_strings(const void * a, const void * b);
/*
A function that parses through all the supplied messages in a data struct
and determines the vector representation based on the supplied vector terms
@param dataset The collection of messages to be considered
@param vector_terms The array of terms that make up the vector
@param vector_document_counts NOTE(review): presumably the per-term document
counts matching vector_terms -- confirm against the implementation
@param size NOTE(review): presumably the number of entries in vector_terms
and vector_document_counts -- confirm against the implementation
@return 0 if the vectors are created for all messages
*/
int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);
#endif /* PROCESS */