Initial GitHub commit
Implementation of backprop in C using Grand Central Dispatch and Blocks
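The parallel sections below rely on dispatch_apply() from libdispatch, which runs a Block once per index, potentially concurrently, and only returns when every iteration has finished. As a minimal, self-contained sketch of that pattern (illustrative only, not code from this commit; builds with clang, which supports Blocks by default on OS X):

#include <stdio.h>
#include <stdlib.h>
#include <dispatch/dispatch.h>

int main(void) {
    double * squares = malloc(8 * sizeof(double));
    if (squares == NULL) {
        return EXIT_FAILURE;
    }

    // dispatch_apply runs the Block once per index on the global concurrent
    // queue and blocks until all iterations have completed.
    dispatch_apply(8, dispatch_get_global_queue(0, 0), ^(size_t i) {
        // Each iteration writes a distinct element, so no locking is needed.
        squares[i] = (double)(i * i);
    });

    for (int i = 0; i < 8; i++) {
        printf("%d^2 = %.0f\n", i, squares[i]);
    }

    free(squares);
    return EXIT_SUCCESS;
}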
16
Makefile
Normal file
@@ -0,0 +1,16 @@
CC=clang
CFLAGS= -Wall

default: main.o process.o nn.o
	$(CC) process.o nn.o main.o -o procon

main.o: main.c
	$(CC) $(CFLAGS) -c main.c

process.o: process.c
	$(CC) $(CFLAGS) -c process.c

nn.o: nn.c
	$(CC) $(CFLAGS) -c nn.c

clean:
	rm *.o procon
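For reference, building and starting an interactive session might look like the following; the data file names here are placeholders, not files shipped with this commit. With all three flags set, main.c expects the positional arguments in the order stop words, training data, classify data:

$ make
$ ./procon -tcs stopwords.txt train_messages.txt classify_messages.txt
> learn
> classify
> csv classify
> exit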
358
main.c
Normal file
@@ -0,0 +1,358 @@
/*
 Author: James Griffin-Allwood
 Date: March 4 2014

 Description:
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dispatch/dispatch.h>
#include "process.h"
#include "nn.h"

#define MAX_PARAM_SIZE 255
#define MAX_COMMAND_SIZE 10
#define DEFAULT_TRAIN_TEST_SPLIT 25

/*
 A function that prints usage information for this application
*/
void usage(char * app);

/*
 A function that prints command information
*/
void print_help_text();

int main(int argc, char * argv[]) {
    char * train;
    char * classify;
    char * stop;
    char ** stop_word_array;

    int prompt = 0, train_file = 0, classify_file = 0, stop_words = 0;
    int stop_word_count = 0;
    int train_test_split_percent = DEFAULT_TRAIN_TEST_SPLIT;
    int network_layers = DEFAULT_LAYERS;
    int input_vector_size = VECTOR_SIZE;
    double learning_rate = DEFAULT_LEARNING_RATE;
    double desired_test_error_min = DEFAULT_TEST_ERROR_MIN;
    int epochs = DEFAULT_MAX_EPOCHS_SINCE_MIN;
    int hidden_nodes_per_layer = DEFAULT_HIDDEN_NODES_PER_LAYER;
    data * to_train = NULL, * to_classify = NULL, * train_set = NULL, * test_set = NULL;
    nn_model * classifier = NULL;

    // Process cli arguments and determine if there were flags.
    // If no flags and just file names, load data and write weka files.
    if (argc == 1) {
        prompt = 1;
    } else if (argc > 1) {
        if (argv[1][0] == '-') {
            int params = (int)strlen(argv[1]);
            prompt = 1;

            for (int i = 1; i < params; i++) {
                switch (argv[1][i]) {
                    case 't':
                        train_file = 1;
                        break;
                    case 'c':
                        classify_file = 1;
                        break;
                    case 's':
                        stop_words = 1;
                        break;
                    default:
                        break;
                }
            }
        } else {
            if (argc == 3) {
                classify = argv[2];
                to_classify = read_data(classify);
                if (weka_output(to_classify, "procon_classify.arff") != 0) {
                    fprintf(stderr, "Unable to write weka_formatted file procon_classify.arff\n");
                }
                free_data(to_classify);
            }
            train = argv[1];
            to_train = read_data(train);
            if (weka_output(to_train, "procon_train.arff") != 0) {
                fprintf(stderr, "Unable to write weka_formatted file procon_train.arff\n");
            }
            free_data(to_train);

            return EXIT_SUCCESS;
        }
    }

    if ((train_set = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    if ((test_set = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    // If flags are set for commandline training or test data, get file names and read data.
    // Positional arguments are expected in the order: [stop words] [training data] [classify data].
    if (train_file) {
        if (!classify_file) {
            train = argv[argc - 1];
        } else {
            train = argv[argc - 2];
        }
        to_train = read_data(train);
        fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count);
    }

    if (classify_file) {
        classify = argv[argc - 1];
        to_classify = read_data(classify);
        fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count);
    }

    if (stop_words) {
        if (!classify_file) {
            if (!train_file) {
                stop = argv[argc - 1];
            } else {
                stop = argv[argc - 2];
            }
        } else {
            if (!train_file) {
                stop = argv[argc - 2];
            } else {
                stop = argv[argc - 3];
            }
        }

        stop_word_array = load_stop_words(stop, &stop_word_count);
        fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count);
    }

    /*
     Begin terminal interface. Exit on "exit". This interface allows you to load data,
     specify learning parameters, learn, classify, and write output data.
    */
    fprintf(stdout, "Pro/Con Learner.\nEnter commands below. (type 'help' for commands)\n");
    char * command, * p1, * p2;

    if ((command = malloc(sizeof(char) * MAX_COMMAND_SIZE)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for command");
    }

    if ((p1 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for parameters");
    }

    if ((p2 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for parameters");
    }

    while (prompt) {
        fprintf(stdout, "> ");

        // Bound reads to the buffer sizes (MAX_COMMAND_SIZE is 10, MAX_PARAM_SIZE is 255)
        fscanf(stdin, "%9s", command);

        if (strncmp(command, "exit", MAX_COMMAND_SIZE) == 0) {
            fprintf(stdout, "Quitting...\n");
            prompt = 0;
        } else if (strncmp(command, "help", MAX_COMMAND_SIZE) == 0) {
            print_help_text();
        } else if (strncmp(command, "weka", MAX_COMMAND_SIZE) == 0) {
            if (train_file) {
                if (weka_output(to_train, "procon_train.arff") != 0) {
                    fprintf(stderr, "Unable to write weka_formatted file procon_train.arff\n");
                } else {
                    fprintf(stdout, "Wrote training data to weka format.\n");
                }
            }
            if (classify_file) {
                if (weka_output(to_classify, "procon_test.arff") != 0) {
                    fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n");
                } else {
                    fprintf(stdout, "Wrote test data to weka format.\n");
                }
            }
        } else if (strncmp(command, "load", MAX_COMMAND_SIZE) == 0) {
            if (fscanf(stdin, "%254s %254s", p1, p2) == 2) {
                if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) {
                    free_data(to_train);
                    to_train = read_data(p2);
                    fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count);
                    train_file = 1;
                } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) {
                    free_data(to_classify);
                    to_classify = read_data(p2);
                    fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count);
                    classify_file = 1;
                } else if (strncmp(p1, "stop", MAX_COMMAND_SIZE) == 0) {
                    free(stop_word_array);
                    stop_word_array = load_stop_words(p2, &stop_word_count);
                    fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count);
                    stop_words = 1;
                } else if (strncmp(p1, "model", MAX_COMMAND_SIZE) == 0) {
                    if (classifier != NULL) {
                        free_model(classifier);
                    }
                    classifier = load_model(p2);
                    if (classifier != NULL) {
                        fprintf(stdout, "Loaded model from %s\n", p2);
                    }
                }
            } else {
                fprintf(stderr, "load must specify a data type and a file name.\n");
            }
        } else if (strncmp(command, "set", MAX_COMMAND_SIZE) == 0) {
            if (fscanf(stdin, "%254s %254s", p1, p2) == 2) {
                double p2_value = 0;
                if (sscanf(p2, "%lf", &p2_value) == 1) {
                    if (strncmp(p1, "split", MAX_COMMAND_SIZE) == 0) {
                        train_test_split_percent = (int)p2_value;
                    } else if (strncmp(p1, "lrate", MAX_COMMAND_SIZE) == 0) {
                        learning_rate = p2_value;
                        fprintf(stdout, "The Learning Rate is now set to %lf.\n", learning_rate);
                    } else if (strncmp(p1, "hnodes", MAX_COMMAND_SIZE) == 0) {
                        hidden_nodes_per_layer = (int)p2_value;
                        fprintf(stdout, "There will be %d nodes in each hidden layer.\n", hidden_nodes_per_layer);
                    } else if (strncmp(p1, "vector", MAX_COMMAND_SIZE) == 0) {
                        input_vector_size = (int)p2_value;
                        fprintf(stdout, "The vector representation is now %d.\n", input_vector_size);
                    } else if (strncmp(p1, "hlayers", MAX_COMMAND_SIZE) == 0) {
                        network_layers = (int)p2_value + 1;
                        fprintf(stdout, "There will be %d hidden layers.\n", (network_layers - 1));
                    } else if (strncmp(p1, "epochs", MAX_COMMAND_SIZE) == 0) {
                        epochs = (int)p2_value;
                        fprintf(stdout, "The maximum number of epochs is now %d.\n", epochs);
                    }
                } else {
                    fprintf(stderr, "You must provide a valid numeric value for set.\n");
                }
            } else {
                fprintf(stderr, "set must specify a parameter and a value.\n");
            }
        } else if (strncmp(command, "learn", MAX_COMMAND_SIZE) == 0) {
            if (classifier != NULL) {
                free_model(classifier);
            }
            if (!stop_words) {
                stop_word_array = NULL;
            }

            // Using the selected Train Data, determine the terms to be used for the vector
            create_vector_represntation(to_train, stop_word_array, stop_word_count, input_vector_size);

            // Create Random Split
            if (train_test_split(to_train, train_test_split_percent, train_set, test_set) != 0) {
                fprintf(stderr, "Unable to create training and test sets.\n");
                return 1;
            }

            // Create the vector representations of the training set, and test set
            vector_representation(train_set, to_train->vector_terms,
                                  to_train->vector_document_counts, input_vector_size);

            vector_representation(test_set, to_train->vector_terms,
                                  to_train->vector_document_counts, input_vector_size);

            // Layer 0 is the input layer; every later layer uses the configured hidden node count.
            int nodes[network_layers];

            for (int i = 0; i < network_layers; i++) {
                if (i == 0) {
                    nodes[i] = input_vector_size;
                } else {
                    nodes[i] = hidden_nodes_per_layer;
                }
            }

            classifier = create_model(learning_rate, network_layers, nodes, PRO_CON_OUTPUT);

            classifier = train_model(classifier, train_set, test_set, desired_test_error_min, epochs);

            classify_dataset(classifier, train_set);
            classify_dataset(classifier, test_set);

            fprintf(stdout, "\nModel Performances on the training set\n");
            print_confusion_matrix(train_set);
            fprintf(stdout, "\nModel Performances on the test set\n");
            print_confusion_matrix(test_set);
        } else if (strncmp(command, "classify", MAX_COMMAND_SIZE) == 0) {
            if (classify_file == 0) {
                fprintf(stdout, "Please load the dataset that should be classified\n");
            } else {
                if (classifier == NULL) {
                    fprintf(stdout, "Please train the model to classify first\n");
                } else {
                    vector_representation(to_classify, to_train->vector_terms,
                                          to_train->vector_document_counts, input_vector_size);
                    classify_dataset(classifier, to_classify);
                }
            }

        } else if (strncmp(command, "csv", MAX_COMMAND_SIZE) == 0) {
            if (fscanf(stdin, "%254s", p1) == 1) {
                if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) {
                    char * train_csv = "training.csv";
                    csv_output(to_train, train_csv);
                    fprintf(stdout, "Wrote training set to csv (%s)\n", train_csv);
                } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) {
                    char * classify_csv = "classify.csv";
                    csv_output(to_classify, classify_csv);
                    fprintf(stdout, "Wrote classify set to csv (%s)\n", classify_csv);
                }
            } else {
                fprintf(stderr, "csv must specify dataset to output.\n");
            }
        } else if (strncmp(command, "save", MAX_COMMAND_SIZE) == 0) {
            if (fscanf(stdin, "%254s", p1) == 1) {
                if (save_model(classifier, p1) == 0) {
                    fprintf(stdout, "Saved model to %s\n", p1);
                }
            } else {
                fprintf(stderr, "save needs a filename to save the model to.\n");
            }
        }
    }

    free(command);
    free(p1);
    free(p2);

    if (stop_words) {
        free(stop_word_array);
    }

    if (classifier != NULL) {
        free_model(classifier);
    }

    if (to_train != NULL) {
        free_data(to_train);
    }

    if (to_classify != NULL) {
        free_data(to_classify);
    }

    return EXIT_SUCCESS;
}

void usage(char * app) {
    fprintf(stderr, "Usage: %s [-tcs] [<stop words>] [<training data>] [<classify data>]\n", app);
    exit(EXIT_FAILURE);
}

void print_help_text() {
    fprintf(stdout, "Available commands are:\n");
    fprintf(stdout, "weka - no arguments, will write out training and classification data to .arff files\n");
    fprintf(stdout, "csv <train|classify> - will write out data to .csv files\n");
    fprintf(stdout, "load <train|classify|stop|model> <path to file> - load training data, classify data, stop words, or a saved model\n");
    fprintf(stdout, "set <split|lrate|hnodes|hlayers|vector|epochs> <number> - set a value for any of the listed parameters\n");
    fprintf(stdout, "learn - using the loaded training data, the set split and learning rate, create a model\n");
    fprintf(stdout, "classify - using the learned model classify the loaded classify data\n");
    fprintf(stdout, "save <path to file> - write the trained model to a file\n");
    fprintf(stdout, "exit - quit the program\n");
}
753
nn.c
Normal file
@@ -0,0 +1,753 @@
/*
 Author: James Griffin-Allwood
 Date: March 13 2014

 Description: Implementation of the learning system and model
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <Block.h>
#include <dispatch/dispatch.h>
#include "process.h"
#include "nn.h"

int free_matrix(matrix * to_free) {
    if (to_free != NULL) {
        if (to_free->weight_matrix != NULL) {
            for (int i = 0; i < to_free->rows; i++) {
                free(to_free->weight_matrix[i]);
            }
            free(to_free->weight_matrix);
        }

        free(to_free);
    }
    return 0;
}

int free_model(nn_model * to_free) {
    if (to_free != NULL) {
        if (to_free->nodes_per_layer != NULL) {
            free(to_free->nodes_per_layer);
        }

        if (to_free->layer_weights != NULL) {
            for (int i = 0; i < to_free->layers; i++) {
                free_matrix(to_free->layer_weights[i]);
            }
            free(to_free->layer_weights);
        }
        if (to_free->previous_weight_updates != NULL) {
            for (int i = 0; i < to_free->layers; i++) {
                free_matrix(to_free->previous_weight_updates[i]);
            }
            free(to_free->previous_weight_updates);
        }
        // Free the per-classification vectors, then the arrays that held them
        reset_model_vectors(to_free);
        free(to_free->layer_input_vectors);
        free(to_free->layer_output_vectors);
        free(to_free);
    }
    return 0;
}

matrix * create_matrix(int r, int c) {
    matrix * new;

    if ((new = calloc(1, sizeof(matrix))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for matrix\n");
        return NULL;
    }

    if ((new->weight_matrix = calloc(r, sizeof(double *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for matrix\n");
        return NULL;
    }

    for (int i = 0; i < r; i++) {
        if ((new->weight_matrix[i] = calloc(c, sizeof(double))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for matrix\n");
            return NULL;
        }
    }

    new->rows = r;
    new->columns = c;

    return new;
}

struct nn_model * create_model(const double rate, const int layers, const int layer_nodes[], const int outputs) {
    nn_model * new_model;
    matrix ** layer_weights;
    matrix ** layer_inputs;
    matrix ** layer_outputs;

    if ((new_model = calloc(1, sizeof(nn_model))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }
    new_model->previous_weight_updates = NULL;
    new_model->momentum = DEFAULT_MOMENTUM;
    new_model->learning_rate = rate;
    new_model->layers = layers;
    new_model->outputs = outputs;

    if ((new_model->nodes_per_layer = calloc(layers, sizeof(int))) == NULL) {
        fprintf(stderr, "Unable to create model\n");
        return NULL;
    }

    for (int i = 0; i < new_model->layers; i++) {
        new_model->nodes_per_layer[i] = layer_nodes[i];
    }

    if ((layer_weights = calloc(layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    if ((layer_inputs = calloc(layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    if ((layer_outputs = calloc(layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    // initialize the input and output vector arrays to NULL
    for (int i = 0; i < layers; i++) {
        layer_inputs[i] = NULL;
        layer_outputs[i] = NULL;
    }

    // Create the connection weight matrices between the layers (except last hidden layer and outputs)
    for (int i = 0; i < (layers - 1); i++) {
        if ((layer_weights[i] = create_matrix(layer_nodes[i], layer_nodes[i + 1])) == NULL) {
            fprintf(stderr, "Not enough memory for model\n");
            return NULL;
        }
    }

    // Create connection weight matrix between last hidden layer and output
    if ((layer_weights[layers - 1] = create_matrix(layer_nodes[layers - 1], outputs)) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    // Initialize all weights in the network to random values between -0.5 and 0.5
    for (int i = 0; i < new_model->layers; i++) {
        if (i == (new_model->layers - 1)) {
            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
                for (int k = 0; k < new_model->outputs; k++) {
                    layer_weights[i]->weight_matrix[j][k]
                        = ((double)(arc4random_uniform(100) / 100.0) - 0.5);
                }
            }
        } else {
            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
                for (int k = 0; k < new_model->nodes_per_layer[i + 1]; k++) {
                    layer_weights[i]->weight_matrix[j][k]
                        = ((double)(arc4random_uniform(100) / 100.0) - 0.5);
                }
            }
        }
    }

    new_model->layer_weights = layer_weights;
    new_model->layer_input_vectors = layer_inputs;
    new_model->layer_output_vectors = layer_outputs;
    new_model->output = NULL;

    return new_model;
}

struct nn_model * copy_model(const struct nn_model * model) {
    nn_model * new_model;

    if ((new_model = calloc(1, sizeof(nn_model))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    new_model->momentum = model->momentum;
    new_model->learning_rate = model->learning_rate;
    new_model->layers = model->layers;
    new_model->outputs = model->outputs;

    new_model->previous_weight_updates = NULL;

    if ((new_model->nodes_per_layer = calloc(new_model->layers, sizeof(int))) == NULL) {
        fprintf(stderr, "Unable to copy model\n");
        return NULL;
    }

    for (int i = 0; i < new_model->layers; i++) {
        new_model->nodes_per_layer[i] = model->nodes_per_layer[i];
    }

    if ((new_model->layer_weights = calloc(new_model->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    if ((new_model->layer_input_vectors = calloc(new_model->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    if ((new_model->layer_output_vectors = calloc(new_model->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    // initialize the input and output vector arrays to NULL
    for (int i = 0; i < new_model->layers; i++) {
        new_model->layer_input_vectors[i] = NULL;
        new_model->layer_output_vectors[i] = NULL;
    }

    // The output vector is created on demand by classify_instance
    new_model->output = NULL;

    // Create the connection weight matrices between the layers (except last hidden layer and outputs)
    for (int i = 0; i < (new_model->layers - 1); i++) {
        if ((new_model->layer_weights[i]
             = create_matrix(model->layer_weights[i]->rows, model->layer_weights[i]->columns)) == NULL) {
            fprintf(stderr, "Not enough memory for model\n");
            return NULL;
        }
    }

    // Create connection weight matrix between last hidden layer and output
    if ((new_model->layer_weights[new_model->layers - 1] =
         create_matrix(model->nodes_per_layer[new_model->layers - 1], new_model->outputs)) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    // copy all the network weights
    for (int i = 0; i < new_model->layers; i++) {
        if (i == (new_model->layers - 1)) {
            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
                for (int k = 0; k < new_model->outputs; k++) {
                    new_model->layer_weights[i]->weight_matrix[j][k]
                        = model->layer_weights[i]->weight_matrix[j][k];
                }
            }
        } else {
            for (int j = 0; j < new_model->nodes_per_layer[i]; j++) {
                for (int k = 0; k < new_model->nodes_per_layer[i + 1]; k++) {
                    new_model->layer_weights[i]->weight_matrix[j][k]
                        = model->layer_weights[i]->weight_matrix[j][k];
                }
            }
        }
    }

    return new_model;
}
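
/*
 On-disk model format, as written by save_model below: a single text file
 holding the learning rate, momentum, layer count, and output count, then the
 nodes-per-layer list, then each layer's weight matrix row by row, all
 tab-separated. load_model reads the same layout back.
*/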

int save_model(const nn_model * m, const char * file) {
    FILE * out;

    if ((out = fopen(file, "w")) == NULL) {
        return 1;
    }

    fprintf(out, "%lf\t", m->learning_rate);
    fprintf(out, "%lf\t", m->momentum);
    fprintf(out, "%d\t", m->layers);
    fprintf(out, "%d\t\n", m->outputs);
    for (int i = 0; i < m->layers; i++) {
        fprintf(out, "%d\t", m->nodes_per_layer[i]);
    }
    for (int i = 0; i < m->layers; i++) {
        for (int j = 0; j < m->layer_weights[i]->rows; j++) {
            for (int k = 0; k < m->layer_weights[i]->columns; k++) {
                fprintf(out, "%lf\t", m->layer_weights[i]->weight_matrix[j][k]);
            }
            fprintf(out, "\n");
        }
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}

struct nn_model * load_model(char * file) {
    FILE * temp;
    nn_model * loaded;

    if ((temp = fopen(file, "r")) == NULL) {
        fprintf(stderr, "Unable to load model from %s\n", file);
        return NULL;
    }

    if ((loaded = calloc(1, sizeof(nn_model))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    fscanf(temp, "%lf\t", &loaded->learning_rate);
    fscanf(temp, "%lf\t", &loaded->momentum);
    fscanf(temp, "%d\t", &loaded->layers);
    fscanf(temp, "%d\t\n", &loaded->outputs);

    if ((loaded->nodes_per_layer = calloc(loaded->layers, sizeof(int))) == NULL) {
        fprintf(stderr, "Unable to load model\n");
        return NULL;
    }

    for (int i = 0; i < loaded->layers; i++) {
        fscanf(temp, "%d\t", &loaded->nodes_per_layer[i]);
    }

    if ((loaded->layer_weights = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    for (int i = 0; i < loaded->layers; i++) {
        int columns = 0;
        int rows = loaded->nodes_per_layer[i];
        // The last layer connects to the outputs; every other layer connects to the next layer
        if (i != (loaded->layers - 1)) {
            columns = loaded->nodes_per_layer[i + 1];
        } else {
            columns = loaded->outputs;
        }
        loaded->layer_weights[i] = create_matrix(rows, columns);
        for (int j = 0; j < loaded->layer_weights[i]->rows; j++) {
            for (int k = 0; k < loaded->layer_weights[i]->columns; k++) {
                fscanf(temp, "%lf\t", &loaded->layer_weights[i]->weight_matrix[j][k]);
            }
        }
    }

    if ((loaded->layer_input_vectors = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    if ((loaded->layer_output_vectors = calloc(loaded->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Not enough memory for model\n");
        return NULL;
    }

    // initialize the input and output vector arrays to NULL
    for (int i = 0; i < loaded->layers; i++) {
        loaded->layer_input_vectors[i] = NULL;
        loaded->layer_output_vectors[i] = NULL;
    }

    loaded->output = NULL;
    loaded->previous_weight_updates = NULL;

    if (fclose(temp) == EOF) {
        fprintf(stderr, "Unable to close model file %s\n", file);
    }

    return loaded;
}
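
/*
 A note on conventions for the matrix routines that follow: a layer's
 activations are stored as a 1 x n row vector (a matrix with a single row), so
 propagating to the next layer is that row vector times the layer's weight
 matrix.
*/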

matrix * multiply_matricies(const matrix * a, const matrix * b) {
    matrix * result;

    if (a->columns != b->rows) {
        fprintf(stderr, "Unable to multiply these matrices\n");
        return NULL;
    }

    if ((result = create_matrix(a->rows, b->columns)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for matrix result\n");
        return NULL;
    }

    for (int i = 0; i < result->rows; i++) {
        for (int j = 0; j < result->columns; j++) {
            double value = 0.0;
            for (int k = 0; k < a->columns; k++) {
                value += a->weight_matrix[i][k] * b->weight_matrix[k][j];
            }
            result->weight_matrix[i][j] = value;
        }
    }

    return result;
}

int add_matricies(matrix * a, const matrix * b) {
    if (a->rows != b->rows || a->columns != b->columns) {
        fprintf(stderr, "Unable to add matrices\n");
        return 1;
    }

    for (int i = 0; i < a->rows; i++) {
        for (int j = 0; j < a->columns; j++) {
            a->weight_matrix[i][j] = a->weight_matrix[i][j] + b->weight_matrix[i][j];
        }
    }
    return 0;
}

matrix * activation_function(const matrix * input_vector) {
    matrix * output_vector;

    if (input_vector->rows != 1) {
        fprintf(stderr, "input vectors for a layer must be 1xn\n");
        return NULL;
    }

    if ((output_vector = create_matrix(input_vector->rows, input_vector->columns)) == NULL) {
        fprintf(stderr, "Unable to run the activation function\n");
        return NULL;
    }

    for (int i = 0; i < input_vector->columns; i++) {
        // Sigmoid: 1 / (1 + e^-x), squashing each net input into (0, 1)
        double sigmoid = 1 / (1 + exp(-1 * input_vector->weight_matrix[0][i]));
        output_vector->weight_matrix[0][i] = sigmoid;
    }

    return output_vector;
}
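
/*
 Forward propagation: the instance's term vector becomes the output of layer
 0, and the last element of each layer's output is overwritten with a constant
 bias input of 1.0. For each layer i, the net input is output_i x W_i (a row
 vector times the weight matrix) and its sigmoid becomes the output of layer
 i + 1; the final sigmoid vector holds the per-class probabilities.
*/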

int classify_instance(nn_model * current, message * input, const int size) {
    int layers = current->layers;
    matrix * input_vector;
    matrix * output_vector;
    double bias = 1.0;

    reset_model_vectors(current);

    if ((input_vector = create_matrix(1, size)) == NULL) {
        fprintf(stderr, "Unable to classify the instance\n");
        return 1;
    }

    for (int i = 0; i < input_vector->columns; i++) {
        input_vector->weight_matrix[0][i] = input->text_vector[i];
    }
    current->layer_output_vectors[0] = input_vector;

    for (int i = 0; i < layers; i++) {
        // add bias input
        current->layer_output_vectors[i]->weight_matrix[0][current->layer_output_vectors[i]->columns - 1] = bias;
        current->layer_input_vectors[i] =
            multiply_matricies(current->layer_output_vectors[i], current->layer_weights[i]);
        if (current->layer_input_vectors[i] == NULL) {
            fprintf(stderr, "Unable to classify\n");
            return 1;
        }

        if (i != (layers - 1)) {
            current->layer_output_vectors[i + 1] = activation_function(current->layer_input_vectors[i]);
            if (current->layer_output_vectors[i + 1] == NULL) {
                fprintf(stderr, "Unable to classify\n");
                return 1;
            }
        }
    }
    output_vector = activation_function(current->layer_input_vectors[layers - 1]);
    current->output = output_vector;
    if (output_vector == NULL) {
        fprintf(stderr, "Unable to classify\n");
        return 1;
    }

    for (int i = 0; i < PRO_CON_OUTPUT; i++) {
        input->prediction_probability[i] = current->output->weight_matrix[0][i];
    }

    if (input->prediction_probability[0] > input->prediction_probability[1]) {
        input->prediction = CON;
    } else {
        input->prediction = PRO;
    }

    return 0;
}

int reset_model_vectors(nn_model * m) {
    free_matrix(m->output);
    m->output = NULL;
    for (int i = 0; i < m->layers; i++) {
        if (m->layer_input_vectors[i] != NULL) {
            free_matrix(m->layer_input_vectors[i]);
            m->layer_input_vectors[i] = NULL;
        }
        if (m->layer_output_vectors[i] != NULL) {
            free_matrix(m->layer_output_vectors[i]);
            m->layer_output_vectors[i] = NULL;
        }
    }

    return 0;
}
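
/*
 Backpropagation of error, using the standard delta rule for sigmoid units:
   output layer:  delta_k = o_k * (1 - o_k) * (t_k - o_k)
   hidden layers: delta_j = o_j * (1 - o_j) * sum_k(w_jk * delta_k)
 where o is a unit's output and t its target. Each weight then moves by
   dw_jk = learning_rate * delta_k * o_j + momentum * previous dw_jk,
 the momentum term reusing the prior update to damp oscillation.
*/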

int backprop_update(nn_model * update, matrix * output) {
    matrix * output_error;
    matrix ** hidden_error;
    matrix ** weight_updates;
    int use_momentum = 0;
    int hidden_layers = update->layers - 1;

    if ((output_error = create_matrix(1, PRO_CON_OUTPUT)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for error term\n");
        return 1;
    }

    // allocate enough memory for an error matrix for every layer (except input layer)
    if ((hidden_error = calloc(hidden_layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for hidden errors\n");
        return 1;
    }

    // allocate enough memory for all the weight updates
    if ((weight_updates = calloc(update->layers, sizeof(matrix *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for error updates\n");
        return 1;
    }

    for (int i = 0; i < update->layers; i++) {
        weight_updates[i] = create_matrix(update->layer_weights[i]->rows, update->layer_weights[i]->columns);
        if (weight_updates[i] == NULL) {
            fprintf(stderr, "Unable to store weight updates\n");
            return 1;
        }
    }

    // Compute the error at the output nodes
    for (int i = 0; i < PRO_CON_OUTPUT; i++) {
        output_error->weight_matrix[0][i] =
            (update->output->weight_matrix[0][i]
             * (1 - update->output->weight_matrix[0][i])
             * (output->weight_matrix[0][i] - update->output->weight_matrix[0][i]));
    }

    // Propagate the error backwards through the hidden layers
    for (int i = hidden_layers; i > 0; i--) {
        int forward_nodes = 0;
        matrix * forward_error;
        if (i == hidden_layers) {
            forward_nodes = update->outputs;
            forward_error = output_error;
        } else {
            forward_nodes = update->nodes_per_layer[i + 1];
            forward_error = hidden_error[i];
        }

        hidden_error[i - 1] = create_matrix(1, update->nodes_per_layer[i]);
        if (hidden_error[i - 1] == NULL) {
            fprintf(stderr, "Unable to allocate memory for hidden layer errors\n");
            return 1;
        }

        for (int j = 0; j < update->nodes_per_layer[i]; j++) {
            double error_sum = 0.0;
            for (int k = 0; k < forward_nodes; k++) {
                error_sum +=
                    update->layer_weights[i]->weight_matrix[j][k] * forward_error->weight_matrix[0][k];
            }

            hidden_error[i - 1]->weight_matrix[0][j] =
                (update->layer_output_vectors[i]->weight_matrix[0][j]
                 * (1 - update->layer_output_vectors[i]->weight_matrix[0][j])
                 * error_sum);
        }
    }

    if (update->previous_weight_updates != NULL) {
        use_momentum = 1;
    }

    // Compute Weight Updates
    for (int i = update->layers; i > 0; i--) {
        matrix * forward_error;
        if (i == update->layers) {
            forward_error = output_error;
        } else {
            forward_error = hidden_error[i - 1];
        }
        for (int j = 0; j < update->layer_weights[i - 1]->rows; j++) {
            for (int k = 0; k < update->layer_weights[i - 1]->columns; k++) {
                double momentum_term = 0.0;
                if (use_momentum) {
                    momentum_term = update->momentum * update->previous_weight_updates[i - 1]->weight_matrix[j][k];
                }
                weight_updates[i - 1]->weight_matrix[j][k] =
                    (update->learning_rate
                     * forward_error->weight_matrix[0][k]
                     * update->layer_output_vectors[i - 1]->weight_matrix[0][j])
                    + momentum_term;
                update->layer_weights[i - 1]->weight_matrix[j][k] += weight_updates[i - 1]->weight_matrix[j][k];
            }
        }
    }

    free_matrix(output_error);

    if (hidden_error != NULL) {
        for (int i = 0; i < hidden_layers; i++) {
            free_matrix(hidden_error[i]);
        }
        free(hidden_error);
    }

    if (update->previous_weight_updates != NULL) {
        for (int i = 0; i < update->layers; i++) {
            free_matrix(update->previous_weight_updates[i]);
        }
        free(update->previous_weight_updates);
    }
    // Keep this round of updates for the momentum term next time
    update->previous_weight_updates = weight_updates;

    return 0;
}
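
/*
 Training loop with a simple form of early stopping: after each epoch of
 backprop over the training set, the test set is classified, the best model
 seen so far is kept, and training stops once the test error has failed to
 improve on the previous epoch more than EPOCH_MAX_ERROR_INCREASE times in a
 row. (The error_rate parameter is accepted but not yet consulted by the loop.)
*/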

nn_model * train_model(nn_model * m, data * training_data, data * test_data, double error_rate, int epoch_max) {
    int epochs = 0;
    double test_error_rate = 1;
    double last_error_rate = 1;
    int epochs_error_increase = 0;
    nn_model * best = NULL;

    while (epochs < epoch_max) {
        matrix * expected_result = create_matrix(1, m->outputs);
        double correctly_classified_test = 0;
        double correctly_classified_train = 0;
        double current_error_rate = 1;
        double train_error_rate = 1;

        if (epochs_error_increase > EPOCH_MAX_ERROR_INCREASE) {
            break;
        }

        // Run all of the instances through the network to train
        for (int i = 0; i < training_data->count; i++) {
            classify_instance(m, training_data->instances[i], m->nodes_per_layer[0]);

            if (training_data->instances[i]->class == PRO) {
                expected_result->weight_matrix[0][0] = 0;
                expected_result->weight_matrix[0][1] = 1;
            } else {
                expected_result->weight_matrix[0][0] = 1;
                expected_result->weight_matrix[0][1] = 0;
            }
            backprop_update(m, expected_result);
        }
        free_matrix(expected_result);

        // Compute the training error rate for this epoch
        for (int i = 0; i < training_data->count; i++) {
            if (training_data->instances[i]->class == training_data->instances[i]->prediction) {
                correctly_classified_train++;
            }
        }
        train_error_rate = (1 - (correctly_classified_train / training_data->count));

        // Classify the Test Set and compute error rate for this epoch.
        classify_dataset(m, test_data);
        for (int i = 0; i < test_data->count; i++) {
            if (test_data->instances[i]->class == test_data->instances[i]->prediction) {
                correctly_classified_test++;
            }
        }
        current_error_rate = (1 - (correctly_classified_test / test_data->count));

        // Check to see if the error rate is a new minimum
        if (current_error_rate < test_error_rate) {
            test_error_rate = current_error_rate;
            fprintf(stdout, "Epoch %3d: New best test error rate found %lf\n", epochs, test_error_rate);
            free_model(best);
            best = copy_model(m);
            epochs_error_increase = 0;
        } else if (current_error_rate >= last_error_rate) {
            epochs_error_increase++;
        } else {
            epochs_error_increase = 0;
        }

        // Print out error rates for plotting every 5 epochs
        if ((epochs % 5) == 0) {
            fprintf(stdout, "Epoch %3d:\ttrain error:%lf\ttest error:%lf\n",
                    epochs, train_error_rate, current_error_rate);
        }
        last_error_rate = current_error_rate;
        epochs++;
    }

    // Report the error rate before returning.
    fprintf(stdout, "Trained model to a test error rate of %lf\n", test_error_rate);

    // If no epoch ever produced a new best, hand back the model as trained
    if (best == NULL) {
        return m;
    }
    free_model(m);
    return best;
}
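
/*
 Classification parallelizes cleanly: each dispatch_apply iteration classifies
 one instance, and each iteration works on its own private copy of the model.
 The copy matters because nn_model carries per-classification state (the layer
 input/output vectors), which would race if the Blocks shared one model.
*/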

int classify_dataset(nn_model * m, data * set) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(set->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        nn_model * classifier = copy_model(m);
        classify_instance(classifier, set->instances[i], classifier->nodes_per_layer[0]);
        free_model(classifier);
    });

    // int pro = 0;
    // int con = 0;
    // for (int i = 0; i < set->count; i++) {
    //     if (set->instances[i]->prediction == PRO) {
    //         pro++;
    //     } else {
    //         con++;
    //     }
    // }
    // fprintf(stdout, "Classified %d Pros and %d Cons\n", pro, con);

    return 0;
}

void print_confusion_matrix(data * set) {
    matrix * confusion_matrix = create_matrix(2, 2);
    char * class_label[] = { "Con", "Pro" };

    for (int i = 0; i < set->count; i++) {
        confusion_matrix->weight_matrix[set->instances[i]->class][set->instances[i]->prediction]++;
    }

    print_matrix(confusion_matrix, class_label);
    double accuracy
        = ((confusion_matrix->weight_matrix[0][0] + confusion_matrix->weight_matrix[1][1]) / set->count)
          * 100;
    fprintf(stdout, "The model correctly classified %.2lf%% of the instances\n", accuracy);
    free_matrix(confusion_matrix);
}

void print_matrix(matrix * m, char ** labels) {
    int labeled = 0;
    if (labels != NULL) {
        labeled = 1;
    }

    if (labeled) {
        for (int i = 0; i < m->columns; i++) {
            fprintf(stdout, "%d\t", i);
        }
        fprintf(stdout, "| <- Classified as\n");
    }

    for (int i = 0; i < m->rows; i++) {
        for (int j = 0; j < m->columns; j++) {
            fprintf(stdout, "%.0lf\t", m->weight_matrix[i][j]);
        }
        if (labeled) {
            fprintf(stdout, "| %d - %s", i, labels[i]);
        }
        fprintf(stdout, "\n");
    }
}
171
nn.h
Normal file
@@ -0,0 +1,171 @@
#ifndef NNMODEL
#define NNMODEL

#define DEFAULT_LAYERS 2
#define DEFAULT_HIDDEN_NODES_PER_LAYER 20
#define DEFAULT_LEARNING_RATE 0.1
#define DEFAULT_MOMENTUM 0.9
#define DEFAULT_TEST_ERROR_MIN 0.3
#define DEFAULT_MAX_EPOCHS_SINCE_MIN 500
#define EPOCH_MAX_ERROR_INCREASE 20

/*
 A struct containing a 2 dimensional array storing connection weights
*/
typedef struct matrix {
    double ** weight_matrix;
    int rows;
    int columns;
} matrix;

/*
 A struct representing the network and parameters
*/
typedef struct nn_model {
    double learning_rate;
    double momentum;
    int layers;
    int * nodes_per_layer;
    int outputs;
    matrix ** layer_weights;
    matrix ** layer_input_vectors;
    matrix ** layer_output_vectors;
    matrix * output;
    matrix ** previous_weight_updates;
} nn_model;

/*
 A function that will free all the memory allocated for a matrix struct
 @param to_free The matrix struct that should be free'd
 @return 0 if it is free'd successfully
*/
int free_matrix(matrix * to_free);

/*
 A function that will free all the memory allocated for a model struct
 @param to_free The model struct that should be free'd
 @return 0 if it is free'd successfully
*/
int free_model(nn_model * to_free);

/*
 A function that allocates memory for a matrix
 @param r The number of rows
 @param c The number of columns
 @return A newly allocated r x c matrix of zeroes
*/
matrix * create_matrix(int r, int c);

/*
 A function that creates a new neural network model.
 @param rate The network learning rate
 @param layers The number of layers of nodes (inclusive of input layer)
 @param layer_nodes An array of size layers, containing the number of nodes per layer
 @param outputs The number of nodes in the output layer
 @return A new network with random weights
*/
struct nn_model * create_model(const double rate, const int layers, const int layer_nodes[], const int outputs);

/*
 A function that creates a copy of a model
 @param model The model to be copied
 @return A copy of the model that was passed.
*/
struct nn_model * copy_model(const struct nn_model * model);

/*
 A function that writes a well performing and trained model to a text file
 @param m The model to be saved
 @param file The file name for the saved model
 @return 0 if it wrote correctly
*/
int save_model(const nn_model * m, const char * file);

/*
 A function that reads a saved model in from a file
 @param file The file name for the saved model
 @return A trained nn_model with the weights from the saved file.
*/
struct nn_model * load_model(char * file);

/*
 A function that takes two matrices and multiplies them together, returning a new matrix
 @param a The left-hand matrix
 @param b The right-hand matrix
 @return The product of matrix multiplication
*/
matrix * multiply_matricies(const matrix * a, const matrix * b);

/*
 Add the contents of matrix b to matrix a.
 @param a A matrix to be modified
 @param b A matrix of values to add to the first matrix
 @return 0 if successfully modified the first matrix
*/
int add_matricies(matrix * a, const matrix * b);

/*
 A function that takes an input vector representing a layer of nodes and uses the Sigmoid
 activation function to create values that will be used for the next layer
 @param input_vector The input vector to be activated for the next layer
 @return A matrix containing a 1xn vector of sigmoid values of the input vector
*/
matrix * activation_function(const matrix * input_vector);

/*
 Classify a provided message using the currently learned model.
 @param current The model to be used to classify
 @param input The message to be classified by the model
 @param size The length of the message's input vector
 @return 0 if the message has been classified
*/
int classify_instance(nn_model * current, message * input, const int size);

/*
 Free the memory used by layer value vectors.
 This function is for use after classifying.
 @param m The model whose input and output vectors should be freed
 @return 0 if the model's vectors are reset for another run
*/
int reset_model_vectors(nn_model * m);

/*
 Propagate error back through the network
 @param update The model whose weights need to be updated
 @param output The desired outputs given the current values
 @return 0 if model weights are updated correctly
*/
int backprop_update(nn_model * update, matrix * output);

/*
 Train the network using backpropagation on the training_data, validating
 against the test_data
 @param m The model to train
 @param training_data The dataset of training instances
 @param test_data The dataset of the test instances
 @param error_rate The minimum test data error rate accepted
 @param epoch_max The maximum number of epochs to train for
 @return The best model found once an acceptable threshold of training has been reached.
*/
nn_model * train_model(nn_model * m, data * training_data, data * test_data, double error_rate, int epoch_max);

/*
 A function that classifies all the instances in the dataset
 @param m The model that is to be used for classification
 @param set The dataset whose instances are to be classified
 @return 0 if the set is successfully classified
*/
int classify_dataset(nn_model * m, data * set);

/*
 A function that prints a confusion matrix for a dataset, showing how many instances of
 each class are classified as what, as well as some stats
 @param set The dataset whose confusion matrix is to be printed
*/
void print_confusion_matrix(data * set);

/*
 A function that prints a matrix with optional row labels
 @param m The matrix that is to be printed
 @param labels The string labels to be output in the case that there are some
*/
void print_matrix(matrix * m, char ** labels);

#endif
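Taken together, the nn.h API is driven roughly as follows. This is an illustrative sketch, not code from the commit; it assumes the caller already holds train_set and test_set data whose instances carry vector representations, and that VECTOR_SIZE and PRO_CON_OUTPUT come from process.h as main.c suggests:

#include "process.h"
#include "nn.h"

void example_training_run(data * train_set, data * test_set) {
    // Two layers: the input layer sized to the term vector, one hidden layer.
    int nodes[DEFAULT_LAYERS] = { VECTOR_SIZE, DEFAULT_HIDDEN_NODES_PER_LAYER };

    nn_model * model = create_model(DEFAULT_LEARNING_RATE, DEFAULT_LAYERS,
                                    nodes, PRO_CON_OUTPUT);

    // train_model consumes the model passed in and hands back the best
    // model found across all epochs.
    model = train_model(model, train_set, test_set,
                        DEFAULT_TEST_ERROR_MIN, DEFAULT_MAX_EPOCHS_SINCE_MIN);

    classify_dataset(model, test_set);
    print_confusion_matrix(test_set);

    free_model(model);
}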
507
process.c
Normal file
@@ -0,0 +1,507 @@
/*
 Author: James Griffin-Allwood
 Date: March 4 2014

 Description: Implementations of reading in formatted messages and preparing
 their vector representations
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <Block.h>
#include <dispatch/dispatch.h>
#include "process.h"

typedef struct dictionary_word {
    char * word;
    int count;
    int document_count;
} dictionary_word;

int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }

    if (to_free->instances != NULL) {
        // Free each message and its text before the array that held them
        for (int i = 0; i < to_free->count; i++) {
            if (to_free->instances[i] != NULL) {
                free(to_free->instances[i]->text);
                free(to_free->instances[i]);
            }
        }
        free(to_free->instances);
    }

    if (to_free->vector_terms != NULL) {
        free(to_free->vector_terms);
    }

    free(to_free);

    return 0;
}
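
/*
 Input format, as the parser below expects it: one message per line, wrapped
 in <Pros>...</Pros>, <Cons>...</Cons>, or <Labs>...</Labs> tags, where <Labs>
 marks a message whose class is unknown and is to be predicted.
*/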

struct data * read_data(char * file) {
    FILE * temp;
    data * data_buffer;
    char * line;
    char * message_buffer;
    int class = -1;

    char * pro = "<Pros>";
    char * pro_close = "</Pros>";
    char * con = "<Cons>";
    char * con_close = "</Cons>";
    char * unknown = "<Labs>";
    char * unknown_close = "</Labs>";

    int lines = 0;
    int max_line_size = 0;
    int line_count = 0;
    int c;    // int, not char, so the EOF sentinel from fgetc compares correctly

    if ((temp = fopen(file, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }

    // First pass: count the lines and find the longest one
    while ((c = fgetc(temp)) != EOF) {
        line_count++;
        if (c == '\n') {
            ++lines;
            if (line_count > max_line_size)
                max_line_size = line_count;
            line_count = 0;
        }
    }

    rewind(temp);

    if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }

    if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }

    // max_line_size counts the newline; leave room for it and the terminator
    if ((line = malloc(sizeof(char) * (max_line_size + 2))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be read\n");
    }

    data_buffer->count = lines;

    for (int i = 0; i < lines; i++) {
        if (fgets(line, max_line_size + 2, temp) != NULL) {
            if (strstr(line, pro) != NULL) {
                char * start = strstr(line, pro) + (sizeof(char) * strlen(pro));
                char * end = strstr(line, pro_close);
                message_buffer = strndup(start, end - start);
                class = PRO;
            } else if (strstr(line, con) != NULL) {
                char * start = strstr(line, con) + (sizeof(char) * strlen(con));
                char * end = strstr(line, con_close);
                message_buffer = strndup(start, end - start);
                class = CON;
            } else if (strstr(line, unknown) != NULL) {
                char * start = strstr(line, unknown) + (sizeof(char) * strlen(unknown));
                char * end = strstr(line, unknown_close);
                message_buffer = strndup(start, end - start);
                class = UNKNOWN;
            } else {
                message_buffer = strdup("");
                class = UNKNOWN;
            }

            data_buffer->instances[i] = calloc(1, sizeof(message));

            // The message takes ownership of the extracted text
            data_buffer->instances[i]->text = message_buffer;
            data_buffer->instances[i]->class = class;
        }
    }

    free(line);

    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }

    return data_buffer;
}

int weka_output(data * print_data, char * out_file) {
    FILE * out;

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");

    for (int i = 0; i < print_data->count; i++) {
        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n",
                    escape_single_quote(print_data->instances[i]->text));
        } else {
            fprintf(out, "'%s',%d\n",
                    escape_single_quote(print_data->instances[i]->text), print_data->instances[i]->class);
        }
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}

int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    for (int i = 0; i < print_data->count; i++) {
        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n",
                    escape_single_quote(print_data->instances[i]->text));
        } else {
            if (print_data->instances[i]->prediction_probability[0]
                > print_data->instances[i]->prediction_probability[1]) {
                fprintf(out, "%s,'%s'\n", con,
                        escape_single_quote(print_data->instances[i]->text));
            } else {
                fprintf(out, "%s,'%s'\n", pro,
                        escape_single_quote(print_data->instances[i]->text));
            }
        }
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}

char * escape_single_quote(const char * str) {
    char * ret, * r;
    const char * p, * q;
    size_t oldlen = strlen("'");
    size_t count, retlen, newlen = strlen("\\'");

    // First pass: count the quotes so the result buffer can be sized exactly
    for (count = 0, p = str; ((q = strstr(p, "'")) != NULL); p = q + oldlen) {
        count++;
    }
    retlen = p - str + strlen(p) + count * (newlen - oldlen);

    if ((ret = malloc(retlen + 1)) == NULL)
        return NULL;

    // Second pass: copy the string, replacing each ' with \'
    for (r = ret, p = str; ((q = strstr(p, "'")) != NULL); p = q + oldlen) {
        ptrdiff_t l = q - p;
        memcpy(r, p, l);
        r += l;
        memcpy(r, "\\'", newlen);
        r += newlen;
    }
    strcpy(r, p);

    return ret;
}

char ** load_stop_words(char * filename, int * word_count) {
    FILE * temp;
    char * line;
    int index = 0;
    char ** words;

    * word_count = 0;

    if ((temp = fopen(filename, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }

    if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
        return NULL;
    }

    // First pass: count the words in the file
    while (fscanf(temp, "%s", line) == 1) {
        (* word_count)++;
    }

    rewind(temp);

    if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words\n");
        return NULL;
    }

    // Second pass: copy each word, truncated to MAX_TERM_LENGTH
    while (fscanf(temp, "%s", line) == 1) {
        words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
        index++;
    }

    free(line);

    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }

    return words;
}
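
/*
 The train/test split below samples the test set without replacement by
 rejection: random indexes are drawn until one that has not been used
 appears. This is simple, though it slows down as the test fraction
 approaches the whole dataset.
*/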

int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    int test_indexes[test_instances];

    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;

        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);
            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }
            if (!is_found) {
                new_index = 1;
            }
        }

        test_indexes[i] = random_index;
        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }
        test->instances[i]->text
            = strndup(dataset->instances[random_index]->text,
                      strlen(dataset->instances[random_index]->text));

        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        memcpy(test->instances[i]->text_vector,
               dataset->instances[random_index]->text_vector,
               sizeof(dataset->instances[random_index]->text_vector));
    }

    test->count = test_instances;

    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;

        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }

        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }
            train->instances[train_index]->text
                = strndup(dataset->instances[i]->text,
                          strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector,
                   dataset->instances[i]->text_vector,
                   sizeof(dataset->instances[i]->text_vector));

            train_index++;
        }
    }

    train->count = train_instances;

    return 0;
}

int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;

    // 0 out the vector counts for dataset
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }

    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }

    fprintf(stdout, "%d instances considered for vector\n", dataset->count);

    for (int i = 0; i < dataset->count; i++) {
        char * token, * string, * to_free;
        to_free = string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        int * terms_per_document;
        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            return 1;
        }

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }

        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;

            if (strcmp(token, "") != 0) {
                // Count repeat occurrences of words already in the dictionary
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }

                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }

                // A new, non-stop word: grow the dictionary if needed, then add it
                if (!word_found && !is_stop_word) {
                    word_count++;
                    if (word_count > allocated) {
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary,
                                                  sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            return 1;
                        }

                        dictionary = more_words;
                        allocated = 2 * word_count;

                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document,
                                                            sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                    }

                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }
                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }

                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }

        // A word's document count rises at most once per message
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }

        // strsep() advances string to NULL, so free the saved original pointer
        free(to_free);
        free(terms_per_document);
    }

    // Sort by descending count so the first `size` entries are the most frequent terms
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);

    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }

    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }

    fprintf(stdout, "Found %d different words\n", word_count);

    // The vector terms were copied with strndup, so the dictionary can be released
    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);

    return 0;
}
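
/*
 Worked example (illustrative only): given the two messages
 "good good price" and "good screen" with no stop words, the dictionary
 ends up holding good (count 3, document_count 2), price (count 1,
 document_count 1), and screen (count 1, document_count 1). After the
 qsort, "good" is first and the two count-1 terms follow in unspecified
 order; the document counts 2, 1, 1 feed the idf weighting in
 vector_representation() below.
*/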

int compare_strings(const void * a, const void * b) {
    const dictionary_word * word_a = *(dictionary_word **) a;
    const dictionary_word * word_b = *(dictionary_word **) b;

    // Despite the name, this orders dictionary entries by descending count
    return (word_b->count - word_a->count);
}
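
/*
 Illustrative only: with dictionary entries whose counts are 3, 7, and 5,
 qsort() using this comparator yields the order 7, 5, 3, so the most
 frequent terms land at the front of the array.
*/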

int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string, * to_free;
        to_free = string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
        }

        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }

        // Raw term frequencies: count how often each vector term appears in the message
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }

        // Re-weight each raw count as tf-idf
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }
            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;

            dataset->instances[i]->text_vector[index] = tfidf;
        }

        free(to_free);
    });

    return 0;
}
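
/*
 Worked tf-idf example (illustrative): with dataset->count = 100 messages,
 a term that appears in 5 of them and twice in the current message gives
 tf = 2, idf = log(100 / 5) = log(20) ≈ 3.0 (natural log, from math.h),
 so the stored weight is roughly 6.0. Terms absent from a message keep
 tf = 0 and therefore a weight of 0.
*/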
121
process.h
Normal file
@@ -0,0 +1,121 @@
#ifndef PROCESS
#define PROCESS

#define PRO 1
#define CON 0
#define PRO_CON_OUTPUT 2
#define UNKNOWN 2
#define VECTOR_SIZE 1000
#define MAX_TERM_LENGTH 128

/*
 A struct that contains a text message and a PRO or CON classification
*/
typedef struct message {
    char * text;
    double text_vector[VECTOR_SIZE];
    int class;
    int prediction;
    double prediction_probability[PRO_CON_OUTPUT];
} message;

/*
 A struct that contains all of the messages used for training or testing
*/
typedef struct data {
    message ** instances;
    char ** vector_terms;
    int vector_document_counts[VECTOR_SIZE];
    int count;
} data;

/*
 A function that will free all the memory allocated for a data struct
 @param to_free The data struct that should be free'd
 @return 0 if it is free'd successfully
*/
int free_data(data * to_free);

/*
 A function that takes a file name and returns a data struct that contains
 the messages (and classifications if provided)
 @param file: The name of a file of data to be read into a data structure.
 @return A pointer to a struct containing an array of message structs and
 their classifications
*/
data * read_data(char * file);

/*
 Output data into a weka format
 @param print_data: The data struct to be printed
 @param out_file: The file where the weka arff should be written
 @return 0 if successfully output
*/
int weka_output(data * print_data, char * out_file);

/*
 Output data into a csv file with 1 instance per line
 @param print_data: The data struct to be printed
 @param out_file: The file where the csv should be written
 @return 0 if successfully output
*/
int csv_output(data * print_data, char * out_file);

/*
 A function for escaping single quotes in a string.
 Based off generic code found at http://creativeandcritical.net/str-replace-c/
 Modified to only escape 's
 @param str The string to escape
 @return An escaped string.
*/
char * escape_single_quote(const char *str);

/*
 A function that reads in a collection of stop words from a file with 1 word per line.
 @param filename The name of the file to be parsed
 @param word_count A pointer where the number of stop words should be stored
 @return the array of words
*/
char ** load_stop_words(char * filename, int * word_count);
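
/*
 Illustrative usage sketch only. The file name is hypothetical, and the
 NULL-on-failure check is an assumption about the implementation, not
 documented behaviour:

     int stop_word_count = 0;
     char ** stop_words = load_stop_words("stopwords.txt", &stop_word_count);
     if (stop_words == NULL) {
         fprintf(stderr, "Unable to load stop words\n");
     }
*/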

/*
 A function that takes a dataset and a percentage of it that should be
 reserved for testing. The function requires 2 data pointers for storing
 the resulting train and test sets
 @param dataset The data to be split
 @param percent The percent of the data to be used for TESTING
 @param train The data that will be used for training
 @param test The data that will be used for testing
 @return 0 if the new datasets are created without issue. non zero otherwise.
*/
int train_test_split(const data * dataset, const int percent, data * train, data * test);

/*
 A function that parses through all the supplied messages in a data struct
 and determines the most relevant terms in the training data. These will be
 used in the construction of a vector representation of any specific message
 @param dataset The collection of messages to be considered
 @param stop_words an array of stop words to ignore
 @param stop_word_count the number of stop words.
 @param size the number of terms in the vector
 @return 0 if the vector representation is found and the vectors created for
 all messages
*/
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size);

/*
 A comparison function for qsort that orders two dictionary_word entries by
 their word counts, most frequent first.
 @param a word 1
 @param b word 2
 @return negative, 0, or positive if a's count is greater than, equal to, or
 less than b's
*/
int compare_strings(const void * a, const void * b);

/*
 A function that parses through all the supplied messages in a data struct
 and determines the vector representation based on the supplied vector terms
 @param dataset The collection of messages to be considered
 @param vector_terms The array of terms that make up the vector
 @param vector_document_counts The per-term document counts used for the idf weighting
 @param size the number of terms in the vector
 @return 0 if the vectors are created for all messages
*/
int vector_representation(data * dataset, char ** vector_terms, int * vector_document_counts, const int size);

#endif