initial github commit

Implementation of backprop in C using Grand Central Dispatch and Blocks
2014-08-06 15:12:09 -03:00
commit 6f23634e32
6 changed files with 1926 additions and 0 deletions
--- a/process.h
+++ b/process.h
@@ -0,0 +1,121 @@
+#ifndef PROCESS
+#define PROCESS
+
+#define PRO 1 /* classification value for a PRO message */
+#define CON 0 /* classification value for a CON message */
+#define PRO_CON_OUTPUT 2 /* length of message.prediction_probability */
+#define UNKNOWN 2 /* NOTE(review): presumably the classification of an unlabelled message - confirm */
+#define VECTOR_SIZE 1000 /* length of message.text_vector and data.vector_document_counts */
+#define MAX_TERM_LENGTH 128 /* NOTE(review): presumably the longest vector term handled - confirm in process.c */
+
+/*
+	A struct that contains a single text message and its PRO or CON classification
+*/
+typedef struct message {
+	char * text; /* the message text */
+	double text_vector[VECTOR_SIZE]; /* vector representation of the message (VECTOR_SIZE entries) */
+	int class; /* PRO or CON classification */
+	int prediction; /* NOTE(review): presumably the predicted classification - confirm */
+	double prediction_probability[PRO_CON_OUTPUT]; /* NOTE(review): presumably one value per output class - confirm */
+} message;
+
+/*
+	A struct that contains all of the messages used for training or testing
+*/
+typedef struct data {
+	message ** instances; /* the messages, held as an array of message pointers */
+	char ** vector_terms; /* the terms that make up the vector representation */
+	int vector_document_counts[VECTOR_SIZE]; /* NOTE(review): presumably a per-term document count - confirm */
+	int count; /* NOTE(review): presumably the number of entries in instances - confirm */
+} data;
+
+/*
+	A function that frees all the memory allocated for a data struct
+	@param to_free The data struct that should be freed
+	@return 0 if it is freed successfully
+*/
+int free_data(data * to_free);
+
+/*
+	A function that takes a file name and returns a data struct that contains the messages
+	(and classifications if provided)
+	@param file: The name of a file of data to be read into a data structure.
+	@return A pointer to a struct containing an array of message structs and their
+		classifications
+*/
+data * read_data(char * file);
+
+/*
+	Output data in the weka arff format
+	@param print_data: The data struct to be printed
+	@param out_file: The file where the weka arff should be written
+	@return 0 if successfully output
+*/
+int weka_output(data * print_data, char * out_file);
+
+/*
+	Output data into a csv file with 1 instance per line
+	@param print_data: The data struct to be printed
+	@param out_file: The file where the csv should be written
+	@return 0 if successfully output
+*/
+int csv_output(data * print_data, char * out_file);
+/*
+	A function for escaping single quotes in a string.
+	Based off generic code found at http://creativeandcritical.net/str-replace-c/
+	Modified to only escape single quotes (')
+	@param str The string to escape
+	@return An escaped string. NOTE(review): presumably newly allocated and owned by the caller - confirm
+*/
+char * escape_single_quote(const char *str);
+
+/*
+	A function that reads in a collection of stop words from a file with 1 word per line.
+	@param filename The name of the file to be parsed
+	@param word_count A pointer where the number of stop words should be stored
+	@return The array of words. NOTE(review): presumably owned by the caller - confirm
+*/
+char ** load_stop_words(char * filename, int * word_count);
+
+/*
+	A function that takes a dataset and the percentage of it that should be reserved for testing.
+	The function requires 2 data pointers for storing the resulting train and test sets
+	@param dataset The data to be split
+	@param percent The percent of the data to be used for TESTING
+	@param train The data that will be used for training
+	@param test The data that will be used for testing
+	@return 0 if the new datasets are created without issue. non zero otherwise.
+*/
+int train_test_split(const data * dataset, const int percent, data * train, data * test);
+
+/*
+	A function that parses through all the supplied messages in a data struct
+	and determines the most relevant terms in the training data. These will be
+	used in the construction of a vector representation of any specific message
+	@param dataset The collection of messages to be considered
+	@param stop_words an array of stopwords to ignore
+	@param stop_word_count the number of stop words.
+	@param size NOTE(review): undocumented; presumably the number of vector terms to select - confirm
+	@return 0 if the vector representation is found and the vectors created for all messages
+*/
+int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);
+
+/*
+	A simple function that compares word counts based on an array of words
+	and a second array of the counts. NOTE(review): name and signature suggest a qsort-style string comparator - confirm
+	@param a word 1
+	@param b word 2
+	@return -1, 0, 1 if a is less than, equal to, or greater than b
+*/
+int compare_strings(const void * a, const void * b);
+
+/*
+	Parses all the supplied messages in a data struct and determines their vector representation from the supplied vector terms
+	@param dataset The collection of messages to be considered
+	@param vector_terms The array of terms that make up the vector
+	@param vector_document_counts, size NOTE(review): undocumented; presumably per-term document counts and the number of terms - confirm
+	@return 0 if the vectors are created for all messages
+*/
+int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);
+
+#endif /* PROCESS */