From 6f23634e3222274ad487f16f2cc0d44e3022b725 Mon Sep 17 00:00:00 2001 From: James Griffin Date: Wed, 6 Aug 2014 15:12:09 -0300 Subject: [PATCH] initial github commit Implementation of backprop in C using Grand Central Dispatch and Blocks --- Makefile | 16 ++ main.c | 358 ++++++++++++++++++++++++++ nn.c | 753 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ nn.h | 171 +++++++++++++ process.c | 507 ++++++++++++++++++++++++++++++++++++ process.h | 121 +++++++++ 6 files changed, 1926 insertions(+) create mode 100644 Makefile create mode 100644 main.c create mode 100644 nn.c create mode 100644 nn.h create mode 100644 process.c create mode 100644 process.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..78896bc --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +CC=clang +CFLAGS= -Wall + +default: main.o process.o nn.o + $(CC) process.o nn.o main.o -o procon + +main.o: main.c + $(CC) $(CFLAGS) -c main.c + +process.o: process.c + $(CC) $(CFLAGS) -c process.c + +nn.o: nn.c + $(CC) $(CFLAGS) -c nn.c +clean: + rm *.o procon \ No newline at end of file diff --git a/main.c b/main.c new file mode 100644 index 0000000..70b2448 --- /dev/null +++ b/main.c @@ -0,0 +1,358 @@ +/* + Author: James Griffin-Allwood + Date: March 4 2014 + + Description: +*/ + +#include +#include +#include +#include +#include "process.h" +#include "nn.h" + +#define MAX_PARAM_SIZE 255 +#define MAX_COMMAND_SIZE 10 +#define DEFAULT_TRAIN_TEST_SPLIT 25 + +/* + A function that prints usage information for this application +*/ +void usage(char * app); + +/* + A function that prints command information +*/ +void print_help_text(); + +int main(int argc, char * argv[]) { + char * train; + char * classify; + char * stop; + char ** stop_word_array; + + int prompt = 0, train_file = 0, classify_file = 0, stop_words = 0; + int stop_word_count = 0; + int train_test_split_percent = DEFAULT_TRAIN_TEST_SPLIT; + int network_layers = DEFAULT_LAYERS; + int input_vector_size = VECTOR_SIZE; + double learning_rate = DEFAULT_LEARNING_RATE; + double desired_test_error_min = DEFAULT_TEST_ERROR_MIN; + int epochs = DEFAULT_MAX_EPOCHS_SINCE_MIN; + int hidden_nodes_per_layer = DEFAULT_HIDDEN_NODES_PER_LAYER; + data * to_train = NULL, * to_classify = NULL, * train_set = NULL, * test_set = NULL; + nn_model * classifier = NULL; + + // Process cli arguments and determine if there were flags. 
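+	// Accepted invocations, as implemented below (file names hypothetical):
+	//   procon train.txt                              write procon_train.arff and exit
+	//   procon train.txt classify.txt                 write both .arff files and exit
+	//   procon -tcs stop.txt train.txt classify.txt   read the files, then prompt
+	//   procon                                        prompt only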
+ // If no flags and just file names load data and write weka files + if (argc == 1) { + prompt = 1; + } else if (argc > 1) { + if (argv[1][0] == '-') { + int params = strlen(argv[1]); + prompt = 1; + + for (int i = 1; i < params; i++) { + switch (argv[1][i]) { + case 't': + train_file = 1; + break; + case 'c': + classify_file = 1; + break; + case 's': + stop_words = 1; + break; + default: + break; + } + } + } else { + if (argc == 3) { + classify = argv[2]; + to_classify = read_data(classify); + if (weka_output(to_classify, "procon_classify.arff") != 0) { + fprintf(stderr, "Unable to write weka_formatted file procon_classify.arff\n"); + } + free_data(to_classify); + } + train = argv[1]; + to_train = read_data(train); + if (weka_output(to_train, "procon_train.arff") != 0) { + fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); + } + free_data(to_train); + + return EXIT_SUCCESS; + } + } + + if ((train_set = calloc(1, sizeof(data))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + + if ((test_set = calloc(1, sizeof(data))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + + // If flags are set for commandline training or test data get file names and read data + if (train_file) { + if (!classify_file) { + train = argv[argc - 1]; + } else { + train = argv[argc - 2]; + } + to_train = read_data(train); + fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count); + } + + if (classify_file) { + classify = argv[argc - 1]; + to_classify = read_data(classify); + fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count); + } + + if (stop_words) { + if (!classify_file) { + if (!train_file) { + stop = argv[argc - 1]; + } else { + stop = argv[argc - 2]; + } + } else { + if (!train_file) { + stop = argv[argc - 2]; + } else { + stop = argv[argc - 3]; + } + } + + stop_word_array = load_stop_words(stop, &stop_word_count); + fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count); + } + + /* + Begin terminal interface. Exit on "exit". This interface allows you to load, + specify learning parameters, learn, classify, write output data. + */ + fprintf(stdout, "Pro/Con Learner.\nEnter commands below. 
(type 'help' for commands)\n"); + char * command, * p1, * p2; + + if ((command = malloc(sizeof(char) * MAX_COMMAND_SIZE)) == NULL) { + fprintf(stderr, "Unable to allocate memory for command"); + } + + if ((p1 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) { + fprintf(stderr, "Unable to allocate memory for parameters"); + } + + if ((p2 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) { + fprintf(stderr, "Unable to allocate memory for parameters"); + } + + while (prompt) { + fprintf(stdout, "> "); + + fscanf(stdin, "%s", command); + + if (strncmp(command, "exit", MAX_COMMAND_SIZE) == 0) { + fprintf(stdout, "Quitting...\n"); + prompt = 0; + } else if (strncmp(command, "help", MAX_COMMAND_SIZE) == 0) { + print_help_text(); + } else if (strncmp(command, "weka", MAX_COMMAND_SIZE) == 0) { + if (train_file) { + if (weka_output(to_train, "procon_train.arff") != 0) { + fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); + } else { + fprintf(stdout, "Wrote training data to weka format.\n"); + } + } + if (classify_file) { + if (weka_output(to_classify, "procon_test.arff") != 0) { + fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); + } else { + fprintf(stdout, "Wrote test data to weka format.\n"); + } + } + } else if (strncmp(command, "load", MAX_COMMAND_SIZE) == 0) { + if (fscanf(stdin, "%s %s", p1, p2) == 2) { + if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) { + free_data(to_train); + to_train = read_data(p2); + fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count); + train_file = 1; + } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) { + free_data(to_classify); + to_classify = read_data(p2); + fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count); + classify_file = 1; + } else if (strncmp(p1, "stop", MAX_COMMAND_SIZE) == 0) { + free(stop_word_array); + stop_word_array = load_stop_words(p2, &stop_word_count); + fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count); + stop_words = 1; + } else if (strncmp(p1, "model", MAX_COMMAND_SIZE) == 0) { + if (classifier != NULL) { + free_model(classifier); + } + classifier = load_model(p2); + if (classifier != NULL) { + fprintf(stdout, "Loaded model from %s\n", p2); + } + } + } else { + fprintf(stderr, "load must specify data type and a file name.\n"); + } + } else if (strncmp(command, "set", MAX_COMMAND_SIZE) == 0) { + if (fscanf(stdin, "%s %s", p1, p2) == 2) { + double p2_value = 0; + if (sscanf(p2, "%lf", &p2_value) == 1) { + if (strncmp(p1, "split", MAX_COMMAND_SIZE) == 0) { + train_test_split_percent = (int)p2_value; + } else if (strncmp(p1, "lrate", MAX_COMMAND_SIZE) == 0) { + learning_rate = p2_value; + fprintf(stdout,"The Learning Rate is now set to %lf.\n", learning_rate); + } else if (strncmp(p1, "hnodes", MAX_COMMAND_SIZE) == 0) { + hidden_nodes_per_layer = p2_value; + fprintf(stdout,"There will be %d nodes in each hidden layer.\n", hidden_nodes_per_layer); + } else if (strncmp(p1, "vector", MAX_COMMAND_SIZE) == 0) { + input_vector_size = p2_value; + fprintf(stdout,"The vector representation is now %d.\n", input_vector_size); + } else if (strncmp(p1, "hlayers", MAX_COMMAND_SIZE) == 0) { + network_layers = p2_value + 1; + fprintf(stdout,"There will be %d hidden layers.\n", (network_layers - 1)); + } else if (strncmp(p1, "epochs", MAX_COMMAND_SIZE) == 0) { + epochs = p2_value; + fprintf(stdout,"The maximum number of epochs is now %d.\n", epochs); + } + } else { + fprintf(stderr, "You must provide a valid integer value 
for set.\n"); + } + } else { + fprintf(stderr, "set must specify paramter and a value.\n"); + } + } else if (strncmp(command, "learn", MAX_COMMAND_SIZE) == 0) { + if (classifier != NULL) { + free_model(classifier); + } + if (!stop_words) { + stop_word_array = NULL; + } + + // Using the selected Train Data, determine the terms to be used for the vector + create_vector_represntation(to_train, stop_word_array, stop_word_count, input_vector_size); + + // Create Random Split + if (train_test_split(to_train, train_test_split_percent, train_set, test_set) != 0) { + fprintf(stderr, "Unable to create training and test sets.\n"); + return 1; + } + + // Create the vector representations of the training set, and test set + vector_representation(train_set, to_train->vector_terms, + to_train->vector_document_counts, input_vector_size); + + vector_representation(test_set, to_train->vector_terms, + to_train->vector_document_counts, input_vector_size); + + int nodes[network_layers]; + + for (int i = 0; i < network_layers; i++) { + if (i == 0) { + nodes[i] = input_vector_size; + } else { + nodes[i] = hidden_nodes_per_layer; + } + } + + classifier = create_model(learning_rate, network_layers, nodes, PRO_CON_OUTPUT); + + classifier = train_model(classifier, train_set, test_set, desired_test_error_min, epochs); + + classify_dataset(classifier, train_set); + classify_dataset(classifier, test_set); + + fprintf(stdout, "\nModel Performances on the training set\n"); + print_confusion_matrix(train_set); + fprintf(stdout, "\nModel Performances on the test set\n"); + print_confusion_matrix(test_set); + } else if (strncmp(command, "classify", MAX_COMMAND_SIZE) == 0) { + if (classify_file == 0) { + fprintf(stdout, "Please load a the dataset that should be classified\n"); + } else { + if (classifier == NULL) { + fprintf(stdout, "Please train the model to classify first\n"); + } else { + vector_representation(to_classify, to_train->vector_terms, + to_train->vector_document_counts, input_vector_size); + classify_dataset(classifier, to_classify); + } + } + + } else if (strncmp(command, "csv", MAX_COMMAND_SIZE) == 0) { + if (fscanf(stdin, "%s", p1) == 1) { + if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) { + char * train_csv = "training.csv"; + csv_output(to_train, train_csv); + fprintf(stdout, "Wrote training set to csv (%s)\n", train_csv); + } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) { + char * classify_csv = "classify.csv"; + csv_output(to_classify, classify_csv); + fprintf(stdout, "Wrote classify set to csv (%s)\n", classify_csv); + } + } else { + fprintf(stderr, "csv must specify dataset to output.\n"); + } + } else if (strncmp(command, "save", MAX_COMMAND_SIZE) == 0) { + if (fscanf(stdin, "%s", p1) == 1) { + if (save_model(classifier, p1) == 0) { + fprintf(stdout, "Saved mode to %s\n", p1); + } + } else { + fprintf(stderr, "save needs a filename to save the model to.\n"); + } + } + } + + free(command); + free(p1); + free(p2); + + if (stop_words) { + free(stop_word_array); + }; + + if (classifier != NULL) { + free_model(classifier); + } + + if (to_train != NULL) { + free_data(to_train); + } + + if (to_classify != NULL) { + free_data(to_classify); + } + + return EXIT_SUCCESS; +} + +void usage(char * app) { + fprintf(stderr, "Usage: %s [-tT] [] []\n", app); + exit(EXIT_FAILURE); +} + +void print_help_text() { + fprintf(stdout, "Available commands are:\n"); + fprintf(stdout, "weka - no arguments, will write out training and classification data to .arff files\n"); + fprintf(stdout, "csv - will write out 
data to .csv files\n"); + fprintf(stdout, "load - load training data, classify data\n"); + fprintf(stdout, "set - set a value for any of the listed rates\n"); + fprintf(stdout, "learn - using the loaded training data, the set split and learning rate create a model\n"); + fprintf(stdout, "classify - using the learned model classify the loaded classify data\n"); + fprintf(stdout, "exit - quit the program\n"); +} diff --git a/nn.c b/nn.c new file mode 100644 index 0000000..2f32920 --- /dev/null +++ b/nn.c @@ -0,0 +1,753 @@ +/* + Author: James Griffin-Allwood + Date: March 13 2014 + + Description: Implementation of the learning system and model +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "process.h" +#include "nn.h" + +int free_matrix(matrix * to_free) { + if (to_free != NULL) { + if (to_free->weight_matrix != NULL) { + for (int i = 0; i < to_free->rows; i++) { + free(to_free->weight_matrix[i]); + } + free(to_free->weight_matrix); + } + + free(to_free); + } + return 0; +} + +int free_model(nn_model * to_free) { + if (to_free != NULL) { + if (to_free->nodes_per_layer != NULL) { + free(to_free->nodes_per_layer); + } + + if (to_free->layer_weights != NULL) { + for (int i = 0; i < to_free->layers; i++) { + free_matrix(to_free->layer_weights[i]); + } + } + if (to_free->previous_weight_updates != NULL) { + for (int i = 0; i < to_free->layers; i++) { + free_matrix(to_free->previous_weight_updates[i]); + } + } + reset_model_vectors(to_free); + free(to_free); + } + return 0; +} + +matrix * create_matrix(int r, int c) { + matrix * new; + + if ((new = calloc(1, sizeof(matrix))) == NULL) { + fprintf(stderr, "Unable to run the activation function\n"); + return NULL; + } + + if ((new->weight_matrix = calloc(r, sizeof(double *))) == NULL) { + fprintf(stderr, "Unable to run the activation function\n"); + return NULL; + } + + for (int i = 0; i < r; i++) { + if ((new->weight_matrix[i] = calloc(c, sizeof(double))) == NULL) { + fprintf(stderr, "Unable to run the activation function\n"); + return NULL; + } + } + + new->rows = r; + new->columns = c; + + return new; +} + +struct nn_model * create_model(const double rate, const int layers, const int layer_nodes[], const int outputs) { + nn_model * new_model; + matrix ** layer_weights; + matrix ** layer_inputs; + matrix ** layer_outputs; + + if ((new_model = calloc(1, sizeof(nn_model))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + new_model->previous_weight_updates = NULL; + new_model->momentum = DEFAULT_MOMENTUM; + new_model->learning_rate = rate; + new_model->layers = layers; + new_model->outputs = outputs; + + if ((new_model->nodes_per_layer = calloc(layers, sizeof(int))) == NULL) { + fprintf(stderr, "Unable to create model\n"); + return NULL; + } + + for (int i = 0; i < new_model->layers; i++) { + new_model->nodes_per_layer[i] = layer_nodes[i]; + } + + if ((layer_weights = calloc(layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + if ((layer_inputs = calloc(layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + if ((layer_outputs = calloc(layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + // initialize the input and output vector arrays to NULL + for (int i = 0; i < layers; i++) { + layer_inputs[i] = NULL; + layer_outputs[i] = NULL; + } + + // Create the connection weight matricies between the layers 
(except last hidden layer and outputs) + for (int i = 0; i < (layers - 1); i++) { + if ((layer_weights[i] = create_matrix(layer_nodes[i], layer_nodes[i + 1])) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + } + + // Create connection weight matrix between last hidden layer and output + if ((layer_weights[layers - 1] = create_matrix(layer_nodes[layers - 1], outputs)) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + // Initialize all weights in the network to random values between -0.5 and 0.5 + for (int i = 0; i < new_model->layers; i++) { + if (i == (new_model->layers - 1)) { + for (int j = 0; j < new_model->nodes_per_layer[i]; j++) { + for (int k = 0; k < new_model->outputs; k++) { + layer_weights[i]->weight_matrix[j][k] + = ((double)(arc4random_uniform(100) / 100.0) - 0.5); + } + } + } else { + for (int j = 0; j < new_model->nodes_per_layer[i]; j++) { + for (int k = 0; k < new_model->nodes_per_layer[i + 1]; k++) { + layer_weights[i]->weight_matrix[j][k] + = ((double)(arc4random_uniform(100) / 100.0) - 0.5); + } + } + } + } + + new_model->layer_weights = layer_weights; + new_model->layer_input_vectors = layer_inputs; + new_model->layer_output_vectors = layer_outputs; + new_model->output = NULL; + + new_model->previous_weight_updates = NULL; + + return new_model; +} + +struct nn_model * copy_model(const struct nn_model * model) { + nn_model * new_model; + + if ((new_model = calloc(1, sizeof(nn_model))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + new_model->momentum = model->momentum; + new_model->learning_rate = model->learning_rate; + new_model->layers = model->layers; + new_model->outputs = model->outputs; + + new_model->previous_weight_updates = NULL; + + if ((new_model->nodes_per_layer = calloc(new_model->layers, sizeof(int))) == NULL) { + fprintf(stderr, "Unable to copy model\n"); + return NULL; + } + + for (int i = 0; i < new_model->layers; i++) { + new_model->nodes_per_layer[i] = model->nodes_per_layer[i]; + } + + if ((new_model->layer_weights = calloc(new_model->layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + if ((new_model->layer_input_vectors = calloc(new_model->layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + if ((new_model->layer_output_vectors = calloc(new_model->layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + // initialize the input and output vector arrays to NULL + for (int i = 0; i < new_model->layers; i++) { + new_model->layer_input_vectors[i] = NULL; + new_model->layer_output_vectors[i] = NULL; + } + + if ((new_model->output = calloc(1, sizeof(matrix))) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + + // Create the connection weight matricies between the layers (except last hidden layer and outputs) + for (int i = 0; i < (new_model->layers - 1); i++) { + if ((new_model->layer_weights[i] + = create_matrix(model->layer_weights[i]->rows, model->layer_weights[i]->columns)) == NULL) { + fprintf(stderr, "Not enough memory for model\n"); + return NULL; + } + } + + // Create connection weight, input and output matrix between last hidden layer and output + if ((new_model->layer_weights[new_model->layers - 1] = + create_matrix(model->nodes_per_layer[new_model->layers - 1], new_model->outputs)) == NULL) { + fprintf(stderr, "Not enough memory 
for model\n");
+        return NULL;
+    }
+
+    // copy all the network weights
+    for (int i = 0; i < new_model->layers; i++) {
+        if (i == (new_model->layers - 1)) {
+            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
+                for (int k = 0; k < new_model->outputs; k++) {
+                    new_model->layer_weights[i]->weight_matrix[j][k]
+                        = model->layer_weights[i]->weight_matrix[j][k];
+                }
+            }
+        } else {
+            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
+                for (int k = 0; k < new_model->nodes_per_layer[i + 1]; k++) {
+                    new_model->layer_weights[i]->weight_matrix[j][k]
+                        = model->layer_weights[i]->weight_matrix[j][k];
+                }
+            }
+        }
+    }
+
+    return new_model;
+}
+
+int save_model(const nn_model * m, const char * file) {
+    FILE * out;
+
+    if ((out = fopen(file, "w")) == NULL) {
+        return 1;
+    }
+
+    fprintf(out, "%lf\t", m->learning_rate);
+    fprintf(out, "%lf\t", m->momentum);
+    fprintf(out, "%d\t", m->layers);
+    fprintf(out, "%d\t\n", m->outputs);
+    for (int i = 0; i < m->layers; i++) {
+        fprintf(out, "%d\t", m->nodes_per_layer[i]);
+    }
+    for (int i = 0; i < m->layers; i++) {
+        for (int j = 0; j < m->layer_weights[i]->rows; j++) {
+            for (int k = 0; k < m->layer_weights[i]->columns; k++) {
+                fprintf(out, "%lf\t", m->layer_weights[i]->weight_matrix[j][k]);
+            }
+            fprintf(out, "\n");
+        }
+    }
+
+    if (fclose(out) == EOF) {
+        return 1;
+    }
+
+    return 0;
+}
+
+struct nn_model * load_model(char * file) {
+    FILE * temp;
+    nn_model * loaded;
+
+    if ((temp = fopen(file, "r")) == NULL) {
+        fprintf(stderr, "Unable to load model from %s\n", file);
+        return NULL;
+    }
+
+    if ((loaded = calloc(1, sizeof(nn_model))) == NULL) {
+        fprintf(stderr, "Not enough memory for model\n");
+        return NULL;
+    }
+
+    fscanf(temp, "%lf\t", &loaded->learning_rate);
+    fscanf(temp, "%lf\t", &loaded->momentum);
+    fscanf(temp, "%d\t", &loaded->layers);
+    fscanf(temp, "%d\t\n", &loaded->outputs);
+
+    if ((loaded->nodes_per_layer = calloc(loaded->layers, sizeof(int))) == NULL) {
+        fprintf(stderr, "Unable to load model\n");
+        return NULL;
+    }
+
+    for (int i = 0; i < loaded->layers; i++) {
+        fscanf(temp, "%d\t", &loaded->nodes_per_layer[i]);
+    }
+
+    if ((loaded->layer_weights = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
+        fprintf(stderr, "Not enough memory for model\n");
+        return NULL;
+    }
+
+    for (int i = 0; i < loaded->layers; i++) {
+        int columns = 0;
+        int rows = loaded->nodes_per_layer[i];
+        // the last weight matrix connects the final hidden layer to the outputs
+        if (i != (loaded->layers - 1)) {
+            columns = loaded->nodes_per_layer[i + 1];
+        } else {
+            columns = loaded->outputs;
+        }
+        loaded->layer_weights[i] = create_matrix(rows, columns);
+        for (int j = 0; j < loaded->layer_weights[i]->rows; j++) {
+            for (int k = 0; k < loaded->layer_weights[i]->columns; k++) {
+                fscanf(temp, "%lf\t", &loaded->layer_weights[i]->weight_matrix[j][k]);
+            }
+        }
+    }
+
+    if ((loaded->layer_input_vectors = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
+        fprintf(stderr, "Not enough memory for model\n");
+        return NULL;
+    }
+
+    if ((loaded->layer_output_vectors = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
+        fprintf(stderr, "Not enough memory for model\n");
+        return NULL;
+    }
+
+    // initialize the input and output vector arrays to NULL
+    for (int i = 0; i < loaded->layers; i++) {
+        loaded->layer_input_vectors[i] = NULL;
+        loaded->layer_output_vectors[i] = NULL;
+    }
+
+    loaded->output = NULL;
+    loaded->previous_weight_updates = NULL;
+
+    fclose(temp);
+
+    return loaded;
+}
+
+matrix * multiply_matricies(const matrix * 
a, const matrix * b) { + matrix * result; + + if (a->columns != b->rows) { + fprintf(stderr, "Unable to multiply these matricies\n"); + return NULL; + } + + if ((result = create_matrix(a->rows, b->columns)) == NULL) { + fprintf(stderr, "Unable to allocated memory formatrix result\n"); + return NULL; + } + + for (int i = 0; i < result->rows; i++) { + for (int j = 0; j < result->columns; j++) { + double value = 0.0; + for (int k = 0; k < a->columns; k++) { + value += a->weight_matrix[i][k] * b->weight_matrix[k][j]; + } + result->weight_matrix[i][j] = value; + } + } + + return result; +} + +int add_matricies(matrix * a, const matrix * b) { + if (a->rows != b->rows || a->columns != b->columns) { + fprintf(stderr, "Unable to add matricies\n"); + return 1; + } + + for (int i = 0; i < a->rows; i++) { + for (int j = 0; j < a->columns; j++) { + a->weight_matrix[i][j] = a->weight_matrix[i][j] + b->weight_matrix[i][j]; + } + } + return 0; +} + +matrix * activation_function(const matrix * input_vector) { + matrix * output_vector; + + if (input_vector->rows != 1) { + fprintf(stderr, "input vectors for a layer must be nx1\n"); + return NULL; + } + + if ((output_vector = create_matrix(input_vector->rows, input_vector->columns)) == NULL) { + fprintf(stderr, "Unable to run the activation function\n"); + return NULL; + } + + for (int i = 0; i < input_vector->columns; i++) { + double sigmoid = 1 / (1 + exp(-1 * input_vector->weight_matrix[0][i])); + output_vector->weight_matrix[0][i] = sigmoid; + } + + return output_vector; +} + +int classify_instance(nn_model * current, message * input, const int size) { + int layers = current->layers; + matrix * input_vector; + matrix * output_vector; + double bias = 1.0; + + reset_model_vectors(current); + + if ((input_vector = create_matrix(1, size)) == NULL) { + fprintf(stderr, "Unable to classify the instance\n"); + return 1; + } + + for (int i = 0; i < input_vector->columns; i++) { + input_vector->weight_matrix[0][i] = input->text_vector[i]; + } + current->layer_output_vectors[0] = input_vector; + + for (int i = 0; i < layers; i++) { + // add bias input + current->layer_output_vectors[i]->weight_matrix[0][current->layer_output_vectors[i]->columns - 1] = bias; + current->layer_input_vectors[i] = + multiply_matricies(current->layer_output_vectors[i], current->layer_weights[i]); + if (current->layer_input_vectors[i] == NULL) { + fprintf(stderr, "Unable to classify\n"); + return 1; + } + + if (i != (layers - 1)) { + current->layer_output_vectors[i + 1] = activation_function(current->layer_input_vectors[i]); + if (current->layer_output_vectors[i + 1] == NULL) { + fprintf(stderr, "Unable to classify\n"); + return 1; + } + } + } + output_vector = activation_function(current->layer_input_vectors[layers - 1]); + current->output = output_vector; + if (output_vector == NULL) { + fprintf(stderr, "Unable to classify\n"); + return 1; + } + + for (int i = 0; i < PRO_CON_OUTPUT; i++) { + input->prediction_probability[i] = current->output->weight_matrix[0][i]; + } + + if (input->prediction_probability[0] > input->prediction_probability[1]) { + input->prediction = CON; + } else { + input->prediction = PRO; + } + + return 0; +} + +int reset_model_vectors(nn_model * m) { + free_matrix(m->output); + m->output = NULL; + for (int i = 0; i < m->layers; i++) { + if (m->layer_input_vectors[i] != NULL) { + free_matrix(m->layer_input_vectors[i]); + m->layer_input_vectors[i] = NULL; + } + if (m->layer_output_vectors[i] != NULL) { + free_matrix(m->layer_output_vectors[i]); + 
m->layer_output_vectors[i] = NULL; + } + } + + return 0; +} + +int backprop_update(nn_model * update, matrix * output) { + matrix * output_error; + matrix ** hidden_error; + matrix ** weight_updates; + int use_momentum = 0; + int hidden_layers = update->layers - 1; + + if ((output_error = create_matrix(1, PRO_CON_OUTPUT)) == NULL) { + fprintf(stderr, "Unable to allocate memory for error term\n"); + return 1; + } + + // allocated enough memory for an error matrix for every layer (except input layer) + if ((hidden_error = calloc(hidden_layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for hidden errors\n"); + return 1; + } + + // allocate enough memory for all the weight updates + if ((weight_updates = calloc(update->layers, sizeof(matrix *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for error updates\n"); + return 1; + } + + for (int i = 0; i < update->layers; i++) { + weight_updates[i] = create_matrix(update->layer_weights[i]->rows, update->layer_weights[i]->columns); + if (weight_updates[i] == NULL) { + fprintf(stderr, "Unable to store weight updates\n"); + return 1; + } + } + + // Compute the error outputs + for (int i = 0; i < PRO_CON_OUTPUT; i++) { + output_error->weight_matrix[0][i] = + (update->output->weight_matrix[0][i] + * (1 - update->output->weight_matrix[0][i]) + * (output->weight_matrix[0][i] - update->output->weight_matrix[0][i])); + } + + for (int i = hidden_layers; i > 0; i--) { + int forward_nodes = 0; + matrix * forward_error; + if (i == hidden_layers) { + forward_nodes = update->outputs; + forward_error = output_error; + } else { + forward_nodes = update->nodes_per_layer[i + 1]; + forward_error = hidden_error[i]; + } + + hidden_error[i - 1] = create_matrix(1, update->nodes_per_layer[i]); + if (hidden_error[i - 1] == NULL) { + fprintf(stderr, "Unable to allocate memory for hidden layer errors\n"); + return 1; + } + + for (int j = 0; j < update->nodes_per_layer[i]; j++) { + double error_sum = 0.0; + for (int k = 0; k < forward_nodes; k++) { + + error_sum += + update->layer_weights[i]->weight_matrix[j][k] * forward_error->weight_matrix[0][k]; + } + + hidden_error[i - 1]->weight_matrix[0][j] = + (update->layer_output_vectors[i]->weight_matrix[0][j] + * (1 - update->layer_output_vectors[i]->weight_matrix[0][j]) + * error_sum); + } + } + + if (update->previous_weight_updates != NULL) { + use_momentum = 1; + } + + // Compute Weight Updates + for (int i = update->layers; i > 0; i--) { + matrix * forward_error; + if (i == update->layers) { + forward_error = output_error; + } else { + forward_error = hidden_error[i - 1]; + } + for (int j = 0; j < update->layer_weights[i - 1]->rows; j++) { + for (int k = 0; k < update->layer_weights[i - 1]->columns; k++) { + double momentum_term = 0.0; + if (use_momentum) { + momentum_term = update->momentum * update->previous_weight_updates[i - 1]->weight_matrix[j][k]; + } + weight_updates[i - 1]->weight_matrix[j][k] = + (update->learning_rate + * forward_error->weight_matrix[0][k] + * update->layer_output_vectors[i - 1]->weight_matrix[0][j]) + + momentum_term; + update->layer_weights[i - 1]->weight_matrix[j][k] += weight_updates[i - 1]->weight_matrix[j][k]; + } + } + } + + free_matrix(output_error); + + if (hidden_error != NULL) { + for (int i = 0; i < hidden_layers; i++) { + free_matrix(hidden_error[i]); + } + } + + if (update->previous_weight_updates != NULL) { + for (int i = 0; i < update->layers; i++) { + free_matrix(update->previous_weight_updates[i]); + } + } + update->previous_weight_updates 
= weight_updates; + + return 0; +} + +nn_model * train_model(nn_model * m, data * training_data, data * test_data, double error_rate, int epoch_max) { + int epochs = 0; + double test_error_rate = 1; + double last_error_rate = 1; + int epochs_error_increase = 0; + nn_model * best = NULL; + + while (epochs < epoch_max) { + matrix * expected_result = create_matrix(1, m->outputs); + double correctly_classified_test = 0; + double correctly_classified_train = 0; + double current_error_rate = 1; + double train_error_rate = 1; + + if (epochs_error_increase > EPOCH_MAX_ERROR_INCREASE) { + break; + } + + // Run all of the instances through the network to train + for (int i = 0; i < training_data->count; i++) { + classify_instance(m, training_data->instances[i], m->nodes_per_layer[0]); + + if (training_data->instances[i]->class == PRO) { + expected_result->weight_matrix[0][0] = 0; + expected_result->weight_matrix[0][1] = 1; + } else { + expected_result->weight_matrix[0][0] = 1; + expected_result->weight_matrix[0][1] = 0; + } + backprop_update(m, expected_result); + } + free_matrix(expected_result); + + // Compute the training error rate for this epoch + for (int i = 0; i < training_data->count; i++) { + if (training_data->instances[i]->class == training_data->instances[i]->prediction) { + correctly_classified_train++; + } + } + train_error_rate = (1 - (correctly_classified_train / training_data->count)); + + // Classify the Test Set and compute error rate for this epoch. + classify_dataset(m, test_data); + for (int i = 0; i < test_data->count; i++) { + if (test_data->instances[i]->class == test_data->instances[i]->prediction) { + correctly_classified_test++; + } + } + current_error_rate = (1 - (correctly_classified_test / test_data->count)); + + // Check to see if the error rate is a new minimum + if (current_error_rate < test_error_rate) { + test_error_rate = current_error_rate; + fprintf(stdout, "Epoch %3d: New best test error rate found %lf\n", epochs, test_error_rate); + free_model(best); + best = copy_model(m); + epochs_error_increase = 0; + } else if (current_error_rate >= last_error_rate) { + epochs_error_increase++; + } else { + epochs_error_increase = 0; + } + + // Print out error rates for plotting every 10 epochs + if ((epochs % 5) == 0) { + fprintf(stdout, "Epoch %3d:\ttrain error:%lf\ttest error:%lf\n", + epochs, train_error_rate, current_error_rate); + } + last_error_rate = current_error_rate; + epochs++; + } + + // Report the error rate before returning. 
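+    // Training ends after epoch_max epochs, or sooner once the test error has
+    // gone EPOCH_MAX_ERROR_INCREASE consecutive epochs without dropping. best
+    // is a checkpoint of the lowest-test-error epoch, and is NULL if no epoch
+    // ever improved on the initial error rate of 1.0.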
+ fprintf(stdout, "Trained model to an test error rate of %lf\n", test_error_rate); + free_model(m); + return best; +} + +int classify_dataset(nn_model * m, data * set) { + // Use Grand Central Dispatch and Blocks to multithread this task for performance + dispatch_apply(set->count, dispatch_get_global_queue(0, 0), ^ (size_t i) { + nn_model * classifier = copy_model(m); + classify_instance(classifier, set->instances[i], classifier->nodes_per_layer[0]); + free_model(classifier); + }); + + // int pro = 0; + // int con = 0; + // for (int i = 0; i < set->count; i++) { + // if (set->instances[i]->prediction == PRO) { + // pro++; + // } else { + // con++; + // } + // } + // fprintf(stdout, "Classified %d Pros and %d Cons\n", pro, con); + + return 0; +} + +void print_confusion_matrix(data * set) { + matrix * confusion_matrix = create_matrix(2, 2); + char * class_label[] = { "Con", "Pro" }; + + for (int i = 0; i < set->count; i++) { + confusion_matrix->weight_matrix[set->instances[i]->class][set->instances[i]->prediction]++; + } + + print_matrix(confusion_matrix, class_label); + double accuracy + = ((confusion_matrix->weight_matrix[0][0] + confusion_matrix->weight_matrix[1][1]) / set->count) + * 100; + fprintf(stdout, "The model corretly classified %.2lf%% of the instances\n", accuracy); + free_matrix(confusion_matrix); +} + +void print_matrix(matrix * m, char ** labels) { + int labeled = 0; + if (labels != NULL) { + labeled = 1; + } + + if (labeled) { + for (int i = 0; i < m->rows; i++) { + fprintf(stdout, "%d\t", i); + } + fprintf(stdout, "| <- Classified as\n"); + } + + for (int i = 0; i < m->rows; i++) { + for (int j = 0; j < m->columns; j++) { + fprintf(stdout, "%.0lf\t", m->weight_matrix[i][j]); + } + if (labeled) { + fprintf(stdout, "| %d - %s", i, labels[i]); + } + fprintf(stdout, "\n"); + } +} \ No newline at end of file diff --git a/nn.h b/nn.h new file mode 100644 index 0000000..eec7d9c --- /dev/null +++ b/nn.h @@ -0,0 +1,171 @@ +#ifndef NNMODEL +#define NNMODEL + +#define DEFAULT_LAYERS 2 +#define DEFAULT_HIDDEN_NODES_PER_LAYER 20 +#define DEFAULT_LEARNING_RATE 0.1 +#define DEFAULT_MOMENTUM 0.9 +#define DEFAULT_TEST_ERROR_MIN 0.3 +#define DEFAULT_MAX_EPOCHS_SINCE_MIN 500 +#define EPOCH_MAX_ERROR_INCREASE 20 + +/* + A struct containing a 2 dimensional array storing connection weights +*/ +typedef struct matrix { + double ** weight_matrix; + int rows; + int columns; +} matrix; + +/* + A struct representing the network and parameters +*/ +typedef struct nn_model { + double learning_rate; + double momentum; + int layers; + int * nodes_per_layer; + int outputs; + matrix ** layer_weights; + matrix ** layer_input_vectors; + matrix ** layer_output_vectors; + matrix * output; + matrix ** previous_weight_updates; +} nn_model; + +/* + A function that will free all the memory allocated for a matrix struct + @param to_free The data struct that should be free'd + @return 0 if it is free'd successfully +*/ +int free_matrix(matrix * to_free); + +/* + A function that will free all the memory allocated for a model struct + @param to_free The data struct that should be free'd + @return 0 if it is free'd successfully +*/ +int free_model(nn_model * to_free); + +/* + A function that allocates memory for a matrix + @param r The number of rows + @param c The number of columns +*/ +matrix * create_matrix(int r, int c); + +/* + A function that creates a new neural network model. 
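+
+    For example (a sketch using the defaults from main.c), a network with a
+    VECTOR_SIZE-node input layer, two 20-node hidden layers, and two outputs:
+        int nodes[3] = { VECTOR_SIZE, 20, 20 };
+        nn_model * m = create_model(DEFAULT_LEARNING_RATE, 3, nodes, PRO_CON_OUTPUT);
+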
+    @param rate The network learning rate
+    @param layers The number of layers of nodes (inclusive of the input layer)
+    @param layer_nodes An array of size layers, containing the number of nodes per layer
+    @param outputs The number of nodes in the output layer
+    @return A new network with random weights
+*/
+struct nn_model * create_model(const double rate, const int layers, const int layer_nodes[], const int outputs);
+
+/*
+    A function that creates a copy of a model
+    @param model The model to be copied
+    @return A copy of the model that was passed.
+*/
+struct nn_model * copy_model(const struct nn_model * model);
+
+/*
+    A function that writes a trained, well-performing model to a text file
+    @param m The model to be saved
+    @param file The file name for the saved model
+    @return 0 if it wrote correctly
+*/
+int save_model(const nn_model * m, const char * file);
+
+/*
+    A function that reads a saved model in from a file
+    @param file The file name for the saved model
+    @return A trained nn_model with the weights from the saved file.
+*/
+struct nn_model * load_model(char * file);
+
+/*
+    A function that takes two matricies and multiplies them together, returning a new matrix
+    @param a The first matrix to be considered
+    @param b The second matrix to be considered
+    @return The product of the matrix multiplication
+*/
+matrix * multiply_matricies(const matrix * a, const matrix * b);
+
+/*
+    Add the contents of matrix b to matrix a.
+    @param a A matrix to be modified
+    @param b A matrix of values to add to the first matrix
+    @return 0 if the first matrix was successfully modified
+*/
+int add_matricies(matrix * a, const matrix * b);
+
+/*
+    A function that takes an input vector representing a layer of nodes and uses the Sigmoid
+    activation function to create the values that will be used for the next layer
+    @param input_vector The input vector to be activated for the next layer
+    @return A matrix containing a 1xn vector of sigmoid values of the input vector
+*/
+matrix * activation_function(const matrix * input_vector);
+
+/*
+    Classify a provided message using the currently learned model.
+    @param current The model to be used to classify
+    @param input The message whose text vector is to be classified by the model
+    @param size The length of the input vector
+    @return 0 if the vector has been classified
+*/
+int classify_instance(nn_model * current, message * input, const int size);
+
+/*
+    Free the memory used by the layer value vectors.
+    This function is for use after classifying.
+    @param m The model whose input and output vectors should be freed
+    @return 0 if the model's vectors are reset for another run
+*/
+int reset_model_vectors(nn_model * m);
+
+/*
+    Propagate error back through the network
+    @param update The model whose weights need to be updated
+    @param output The desired output values given the current inputs
+    @return 0 if the model weights are updated correctly
+*/
+int backprop_update(nn_model * update, matrix * output);
+
+/*
+    Train the network using backpropagation on the training_data, validating
+    against the test_data
+    @param m The model to train (freed before returning)
+    @param training_data The dataset of training instances
+    @param test_data The dataset of the test instances
+    @param error_rate The minimum test data error rate accepted (currently unused)
+    @param epoch_max The maximum number of training epochs
+    @return The best model found during training, by test error rate.
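+
+    Example (mirroring the call in main.c); training consumes m and returns the
+    checkpointed best-performing copy, which may be NULL if no epoch improved:
+        classifier = train_model(classifier, train_set, test_set,
+            desired_test_error_min, epochs);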
+*/ +nn_model * train_model(nn_model * m, data * training_data, data * test_data, double error_rate, int epoch_max); + +/* + A function that classifies all the instances in the dataset + @param m The model that is to be used for classification + @param set The dataset whose instances are to be classified + @return 0 If the set is successfully classified +*/ +int classify_dataset(nn_model * m, data * set); + +/* + A function that prints a confusion matrix for a dataset of how many of each instance + are classifed as what, as well as some stats + @param set The dataset whose confusion matrix is to be printed +*/ +void print_confusion_matrix(data * set); + +/* + A function that prints a matrix with optional column labels + @param m The matrix that is to be printed + @param labels The string labels to be output in the case that there are some +*/ +void print_matrix(matrix * m, char ** labels); + +#endif diff --git a/process.c b/process.c new file mode 100644 index 0000000..0407a05 --- /dev/null +++ b/process.c @@ -0,0 +1,507 @@ +/* + Author: James Griffin-Allwood + Date: March 4 2014 + + Description: Implementations of reading in messages formatted +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "process.h" + +typedef struct dictionary_word { + char * word; + int count; + int document_count; +} dictionary_word; + +int free_data(data * to_free) { + if (to_free == NULL) { + return 1; + } + + if (to_free->instances != NULL) { + free(to_free->instances); + } + + if (to_free->vector_terms != NULL){ + free(to_free->vector_terms); + } + + free(to_free); + + return 0; +} + +struct data * read_data(char * file) { + FILE * temp; + data * data_buffer; + char * line; + char * message_buffer; + int class = -1; + + char * pro = ""; + char * pro_close = ""; + char * con = ""; + char * con_close = ""; + char * unknown = ""; + char * unknown_close = ""; + + int lines = 0; + int max_line_size = 0; + int line_count = 0; + char c; + + if ((temp = fopen(file, "r")) == NULL) { + exit(EXIT_FAILURE); + } + + while ((c = fgetc(temp)) != EOF) { + line_count++; + if (c == '\n') { + ++lines; + if (line_count > max_line_size) + max_line_size = line_count; + line_count = 0; + } + } + + rewind(temp); + + if ((data_buffer = calloc(1, sizeof(data))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + exit(EXIT_FAILURE); + } + + if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + exit(EXIT_FAILURE); + } + + if ((line = malloc(sizeof(char) * max_line_size)) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be read\n"); + } + + data_buffer->count = lines; + + for (int i = 0; i < lines; i++) { + if (fgets(line, max_line_size, temp) != NULL) { + if (strstr(line, pro) != NULL) { + char * start = strstr(line, pro) + (sizeof(char) * strlen(pro)); + char * end = strstr(line, pro_close); + message_buffer = strndup(start, end - start); + class = PRO; + } else if (strstr(line, con) != NULL) { + char * start = strstr(line, con) + (sizeof(char) * strlen(con)); + char * end = strstr(line, con_close); + message_buffer = strndup(start, end - start); + class = CON; + } else if (strstr(line, unknown) != NULL) { + char * start = strstr(line, unknown) + (sizeof(char) * strlen(unknown)); + char * end = strstr(line, unknown_close); + message_buffer = strndup(start, end - start); + class = UNKNOWN; + }else { + message_buffer = ""; + } + + 
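+            // Each input line carries one message wrapped in the pro/con/unknown
+            // marker strings above; a line matching no marker falls through as an
+            // empty message that keeps the most recently parsed class.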
data_buffer->instances[i] = calloc(1, sizeof(message)); + + data_buffer->instances[i]->text = strndup(message_buffer, strlen(message_buffer)); + data_buffer->instances[i]->class = class; + } + } + + free(line); + + if (fclose(temp) == EOF) { + exit(EXIT_FAILURE); + } + + return data_buffer; +} + +int weka_output(data * print_data, char * out_file) { + FILE * out; + + if ((out= fopen(out_file, "w")) == NULL) { + return 1; + } + + fprintf(out, "@relation 'Pro/Con Message Classification'\n"); + fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n"); + + for (int i = 0; i < print_data->count; i++) { + if (print_data->instances[i]->class == UNKNOWN) { + fprintf(out, "'%s',?\n", + escape_single_quote(print_data->instances[i]->text)); + } else { + fprintf(out, "'%s',%d\n", + escape_single_quote(print_data->instances[i]->text), print_data->instances[i]->class); + } + } + + if (fclose(out) == EOF) { + return 1; + } + + return 0; +} + +int csv_output(data * print_data, char * out_file) { + FILE * out; + char * pro = "Pro"; + char * con = "Con"; + + if ((out= fopen(out_file, "w")) == NULL) { + return 1; + } + + for (int i = 0; i < print_data->count; i++) { + if (print_data->instances[i]->prediction == UNKNOWN) { + fprintf(out, "?,'%s'\n", + escape_single_quote(print_data->instances[i]->text)); + } else { + if (print_data->instances[i]->prediction_probability[0] + > print_data->instances[i]->prediction_probability[1]) { + fprintf(out, "%s,'%s'\n", con, + escape_single_quote(print_data->instances[i]->text)); + } else { + fprintf(out, "%s,'%s'\n", pro, + escape_single_quote(print_data->instances[i]->text)); + } + } + } + + if (fclose(out) == EOF) { + return 1; + } + + return 0; +} + +char * escape_single_quote(const char *str) { + char * ret, * r; + const char * p, * q; + size_t oldlen = strlen("'"); + size_t count, retlen, newlen = strlen("\\'"); + + for (count = 0, p = str; ((q = strstr(p, "'")) != NULL); p = q + oldlen){ + count++; + } + retlen = p - str + strlen(p) + count * (newlen - oldlen); + + if ((ret = malloc(retlen + 1)) == NULL) + return NULL; + + for (r = ret, p = str; ((q = strstr(p, "'")) != NULL); p = q + oldlen) { + ptrdiff_t l = q - p; + memcpy(r, p, l); + r += l; + memcpy(r, "\\'", newlen); + r += newlen; + } + strcpy(r, p); + + return ret; +} + +char ** load_stop_words(char * filename, int * word_count) { + FILE * temp; + char * line; + int index = 0; + char ** words; + + * word_count = 0; + + if ((temp = fopen(filename, "r")) == NULL) { + exit(EXIT_FAILURE); + } + + if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) { + fprintf(stderr, "Unable to allocate memory for stop words to be read\n"); + return NULL; + } + while (fscanf(temp, "%s\n", line) == 1) { + (* word_count)++; + } + + rewind(temp); + + if ((words = calloc(* word_count, sizeof(char *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for stop words\n"); + return NULL; + } + + while (fscanf(temp, "%s", line) == 1) { + words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH)); + index++; + } + + free(line); + + if (fclose(temp) == EOF) { + exit(EXIT_FAILURE); + } + + return words; +} + +int train_test_split(const data * dataset, const int percent, data * train, data * test) { + int total_instances = dataset->count; + double train_percent = (100 - percent) / 100.0; + int train_instances = (int)(total_instances * train_percent); + int test_instances = total_instances - train_instances; + int train_index = 0; + int test_indexes[test_instances]; + + if ((train->instances = 
calloc(train_instances, sizeof(message *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + + if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + + for (int i = 0; i < test_instances; i++) { + int random_index = -1; + int new_index = 0; + + while (!new_index) { + int is_found = 0; + random_index = arc4random_uniform(total_instances); + for (int j = 0; j < i; j++) { + if (test_indexes[j] == random_index) { + is_found = 1; + } + } + if (!is_found) { + new_index = 1; + } + } + + test_indexes[i] = random_index; + if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + test->instances[i]->text + = strndup(dataset->instances[random_index]->text, + strlen(dataset->instances[random_index]->text)); + + test->instances[i]->class = dataset->instances[random_index]->class; + test->instances[i]->prediction = dataset->instances[random_index]->prediction; + memcpy(test->instances[i]->text_vector, + dataset->instances[random_index]->text_vector, + sizeof(dataset->instances[random_index]->text_vector)); + } + + test->count = test_instances; + + for (int i = 0; i < total_instances; i++) { + int is_test = 0; + + for (int j = 0; j < test_instances; j++) { + if (i == test_indexes[j]) { + is_test = 1; + } + } + + if (!is_test) { + if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) { + fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); + return 1; + } + train->instances[train_index]->text + = strndup(dataset->instances[i]->text, + strlen(dataset->instances[i]->text)); + train->instances[train_index]->class = dataset->instances[i]->class; + train->instances[train_index]->prediction = dataset->instances[i]->prediction; + memcpy(train->instances[train_index]->text_vector, + dataset->instances[i]->text_vector, + sizeof(dataset->instances[i]->text_vector)); + + train_index++; + } + } + + train->count = train_instances; + + return 0; +} + +int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) { + dictionary_word ** dictionary; + int word_count = 0; + int allocated = size; + + // 0 out the vector counts for dataset + for (int i = 0; i < size; i++) { + dataset->vector_document_counts[i] = 0; + } + + if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for dictionary\n"); + return 1; + } + + fprintf(stdout,"%d instances considered for vector\n", dataset->count); + + for (int i = 0; i < dataset->count; i++) { + char * token, * string; + string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text)); + int * terms_per_document; + if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) { + fprintf(stderr, "Unable to create vector\n"); + return 1; + } + + if (string == NULL) { + fprintf(stderr, "Unable to parse message for terms\n"); + return 1; + } + + while ((token = strsep(&string, ".,?! 
")) != NULL) { + int word_found = 0; + int is_stop_word = 0; + + if (strcmp(token, "") != 0) { + for (int j = 0; j < word_count; j++) { + if (strcasecmp(token, dictionary[j]->word) == 0) { + dictionary[j]->count++; + terms_per_document[j]++; + word_found = 1; + } + } + + if (stop_words != NULL) { + for (int j = 0; j < stop_word_count; j++) { + if (strcasecmp(token, stop_words[j]) == 0) { + is_stop_word = 1; + } + } + } + + if (!word_found && !is_stop_word) { + word_count++; + if (word_count > allocated) { + dictionary_word ** more_words; + if ((more_words = realloc(dictionary, + sizeof(dictionary_word *) * (2 * word_count))) == NULL) { + fprintf(stderr, "Unable to allocate memory for dictionary\n"); + return 1; + } + + dictionary = more_words; + allocated = 2 * word_count; + + int * more_document_counts; + if ((more_document_counts = realloc(terms_per_document, + sizeof(int) * allocated)) == NULL) { + fprintf(stderr, "Unable to allocate memory for term frequencies\n"); + return 1; + } + terms_per_document = more_document_counts; + } + + if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) { + fprintf(stderr, "Unable to allocate memory for dictionary word\n"); + return 1; + } + if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) { + fprintf(stderr, "Unable to allocate memory for dictionary word\n"); + return 1; + } + + strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1); + dictionary[word_count - 1]->count = 1; + terms_per_document[word_count - 1] = 1; + } + } + } + + for (int j = 0; j < word_count; j++) { + if (terms_per_document[j] > 0) { + dictionary[j]->document_count++; + } + } + + free(string); + } + + qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings); + + if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) { + fprintf(stderr, "Unable to allocate memory for Model Vector\n"); + return 1; + } + + for (int i = 0; i < size; i++) { + if (i < word_count) { + dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word)); + dataset->vector_document_counts[i] = dictionary[i]->document_count; + } else { + dataset->vector_terms[i] = strdup(""); + dataset->vector_document_counts[i] = 0; + } + + } + + fprintf(stdout, "Found %d different words\n", word_count); + + free(dictionary); + + return 0; +} + +int compare_strings(const void * a, const void * b) { + const dictionary_word * word_a = *(dictionary_word **) a; + const dictionary_word * word_b = *(dictionary_word **) b; + + return (word_b->count - word_a->count); +} + +int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) { + // Use Grand Central Dispatch and Blocks to multithread this task for performance + dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) { + char * token, * string; + string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text)); + + if (string == NULL) { + fprintf(stderr, "Unable to parse message for terms\n"); + } + + for (int index = 0; index < size; index++) { + dataset->instances[i]->text_vector[index] = 0; + } + + while ((token = strsep(&string, ".,?! 
")) != NULL) { + for (int index = 0; index < size; index++) { + if (strcasecmp(token, vector_terms[index]) == 0) { + dataset->instances[i]->text_vector[index]++; + } + } + } + + for (int index = 0; index < size; index++) { + double tf = dataset->instances[i]->text_vector[index]; + double docs = 1; + if (vector_document_counts[i] != 0) { + docs = vector_document_counts[i]; + } + double idf = log(dataset->count/docs); + double tfidf = tf * idf; + + dataset->instances[i]->text_vector[index] = tfidf; + } + }); + + return 0; +} diff --git a/process.h b/process.h new file mode 100644 index 0000000..fd8968f --- /dev/null +++ b/process.h @@ -0,0 +1,121 @@ +#ifndef PROCESS +#define PROCESS + +#define PRO 1 +#define CON 0 +#define PRO_CON_OUTPUT 2 +#define UNKNOWN 2 +#define VECTOR_SIZE 1000 +#define MAX_TERM_LENGTH 128 + +/* + A struct that contains a text message and a PRO or CON classification +*/ +typedef struct message { + char * text; + double text_vector[VECTOR_SIZE]; + int class; + int prediction; + double prediction_probability[PRO_CON_OUTPUT]; +} message; + +/* + A struct that contains all of the messages used for training for testing +*/ +typedef struct data { + message ** instances; + char ** vector_terms; + int vector_document_counts[VECTOR_SIZE]; + int count; +} data; + +/* + A function that will free all the memory allocated for a data struct + @param to_free The data struct that should be free'd + @return 0 if it is free'd successfully +*/ +int free_data(data * to_free); + +/* + A function that takes a file name return a data struct that contains the messages + (and classifications if provided) + @param file: The name of a file of data to be read into a data structure. + @return A pointer to a struct containing an array of message structs and their + classifications +*/ +data * read_data(char * file); + +/* + Output data into a weka format + @param print_data: The data struct to be printed + @param out_file: The file where the weka arff should be written + @return 0 if successfully output +*/ +int weka_output(data * print_data, char * out_file); + +/* + Output data into a csv file with 1 instance per line + @param print_data: The data struct to be printed + @param out_file: The file where the weka arff should be written + @return 0 if successfully output +*/ +int csv_output(data * print_data, char * out_file); +/* + A function for escaping single quotes in a string. + Based off generic code found http://creativeandcritical.net/str-replace-c/ + Modified to only escapse ''s + @param str The string to escape + @return An escaped string. +*/ +char * escape_single_quote(const char *str); + +/* + A function that reads in a collection of stop words from a file with 1 word per line. + @param filename The name of the file to be parsed + @param word_count A pointer where the number of stop words should be stored + @return the array of words +*/ +char ** load_stop_words(char * filename, int * word_count); + +/* + A function that takes a dataset, a percentage that should be reserved for testing. + The function requires 2 data pointers for storing the resulting train and test sets + @param dataset The data to be split + @param percent The percent of the data to be used for TESTING + @param train The data that will be used for training + @param test The data that will be used for testing + @return 0 if the new datasets are created without issue. non zero otherwise. 
+*/
+int train_test_split(const data * dataset, const int percent, data * train, data * test);
+
+/*
+    A function that parses through all the supplied messages in a data struct
+    and determines the most relevant terms in the training data. These will be
+    used in the construction of a vector representation of any specific message
+    @param dataset The collection of messages to be considered
+    @param stop_words An array of stop words to ignore
+    @param stop_word_count The number of stop words
+    @param size The number of terms to keep in the vector
+    @return 0 if the vector representation is found and the vectors created for
+    all messages
+*/
+int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);
+
+/*
+    A qsort comparator that orders dictionary words by their occurrence counts,
+    from most frequent to least frequent.
+    @param a A pointer to the first dictionary word
+    @param b A pointer to the second dictionary word
+    @return A negative, zero, or positive value if a's count is greater than,
+    equal to, or less than b's
+*/
+int compare_strings(const void * a, const void * b);
+
+/*
+    A function that parses through all the supplied messages in a data struct
+    and computes each message's tf-idf vector based on the supplied vector terms
+    @param dataset The collection of messages to be considered
+    @param vector_terms The array of terms that make up the vector
+    @param vector_document_counts The number of documents each term appears in
+    @param size The number of terms in the vector
+    @return 0 if the vectors are created for all messages
+*/
+int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);
+
+#endif
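
/*
    A minimal, self-contained sketch of the dispatch_apply pattern used by
    classify_dataset and vector_representation above: the block runs once per
    index, potentially on several threads at once, so each iteration writes
    only to its own slot. Builds with clang on OS X, where dispatch/dispatch.h
    and Blocks are available.
*/
#include <dispatch/dispatch.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    size_t count = 8;
    double * results;

    // Blocks cannot capture a local array, so allocate on the heap; the
    // captured pointer is shared, but each iteration touches a distinct slot.
    if ((results = malloc(count * sizeof(double))) == NULL) {
        return EXIT_FAILURE;
    }

    dispatch_apply(count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        results[i] = (double)(i * i);
    });

    for (size_t i = 0; i < count; i++) {
        fprintf(stdout, "%zu -> %.0lf\n", i, results[i]);
    }

    free(results);
    return EXIT_SUCCESS;
}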