initial github commit
Implementation of backprop in C using Grand Central Dispatch and Blocks
This commit is contained in:
358
main.c
Normal file
358
main.c
Normal file
@@ -0,0 +1,358 @@
|
||||
/*
    Author: James Griffin-Allwood
    Date: March 4 2014

    Description: Command-line front end for a back-propagation neural-network
    classifier of pro/con text data, implemented in C using Grand Central
    Dispatch and Blocks. Supports loading data, converting it to weka/csv
    formats, training, and classification via an interactive prompt.
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <dispatch/dispatch.h>
|
||||
#include "process.h"
|
||||
#include "nn.h"
|
||||
|
||||
#define MAX_PARAM_SIZE 255
|
||||
#define MAX_COMMAND_SIZE 10
|
||||
#define DEFAULT_TRAIN_TEST_SPLIT 25
|
||||
|
||||
/*
|
||||
A function that prints usage information for this application
|
||||
*/
|
||||
void usage(char * app);
|
||||
|
||||
/*
|
||||
A function that prints command information
|
||||
*/
|
||||
void print_help_text();
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
char * train;
|
||||
char * classify;
|
||||
char * stop;
|
||||
char ** stop_word_array;
|
||||
|
||||
int prompt = 0, train_file = 0, classify_file = 0, stop_words = 0;
|
||||
int stop_word_count = 0;
|
||||
int train_test_split_percent = DEFAULT_TRAIN_TEST_SPLIT;
|
||||
int network_layers = DEFAULT_LAYERS;
|
||||
int input_vector_size = VECTOR_SIZE;
|
||||
double learning_rate = DEFAULT_LEARNING_RATE;
|
||||
double desired_test_error_min = DEFAULT_TEST_ERROR_MIN;
|
||||
int epochs = DEFAULT_MAX_EPOCHS_SINCE_MIN;
|
||||
int hidden_nodes_per_layer = DEFAULT_HIDDEN_NODES_PER_LAYER;
|
||||
data * to_train = NULL, * to_classify = NULL, * train_set = NULL, * test_set = NULL;
|
||||
nn_model * classifier = NULL;
|
||||
|
||||
// Process cli arguments and determine if there were flags.
|
||||
// If no flags and just file names load data and write weka files
|
||||
if (argc == 1) {
|
||||
prompt = 1;
|
||||
} else if (argc > 1) {
|
||||
if (argv[1][0] == '-') {
|
||||
int params = strlen(argv[1]);
|
||||
prompt = 1;
|
||||
|
||||
for (int i = 1; i < params; i++) {
|
||||
switch (argv[1][i]) {
|
||||
case 't':
|
||||
train_file = 1;
|
||||
break;
|
||||
case 'c':
|
||||
classify_file = 1;
|
||||
break;
|
||||
case 's':
|
||||
stop_words = 1;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (argc == 3) {
|
||||
classify = argv[2];
|
||||
to_classify = read_data(classify);
|
||||
if (weka_output(to_classify, "procon_classify.arff") != 0) {
|
||||
fprintf(stderr, "Unable to write weka_formatted file procon_classify.arff\n");
|
||||
}
|
||||
free_data(to_classify);
|
||||
}
|
||||
train = argv[1];
|
||||
to_train = read_data(train);
|
||||
if (weka_output(to_train, "procon_train.arff") != 0) {
|
||||
fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n");
|
||||
}
|
||||
free_data(to_train);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
if ((train_set = calloc(1, sizeof(data))) == NULL) {
|
||||
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if ((test_set = calloc(1, sizeof(data))) == NULL) {
|
||||
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// If flags are set for commandline training or test data get file names and read data
|
||||
if (train_file) {
|
||||
if (!classify_file) {
|
||||
train = argv[argc - 1];
|
||||
} else {
|
||||
train = argv[argc - 2];
|
||||
}
|
||||
to_train = read_data(train);
|
||||
fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count);
|
||||
}
|
||||
|
||||
if (classify_file) {
|
||||
classify = argv[argc - 1];
|
||||
to_classify = read_data(classify);
|
||||
fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count);
|
||||
}
|
||||
|
||||
if (stop_words) {
|
||||
if (!classify_file) {
|
||||
if (!train_file) {
|
||||
stop = argv[argc - 1];
|
||||
} else {
|
||||
stop = argv[argc - 2];
|
||||
}
|
||||
} else {
|
||||
if (!train_file) {
|
||||
stop = argv[argc - 2];
|
||||
} else {
|
||||
stop = argv[argc - 3];
|
||||
}
|
||||
}
|
||||
|
||||
stop_word_array = load_stop_words(stop, &stop_word_count);
|
||||
fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count);
|
||||
}
|
||||
|
||||
/*
|
||||
Begin terminal interface. Exit on "exit". This interface allows you to load,
|
||||
specify learning parameters, learn, classify, write output data.
|
||||
*/
|
||||
fprintf(stdout, "Pro/Con Learner.\nEnter commands below. (type 'help' for commands)\n");
|
||||
char * command, * p1, * p2;
|
||||
|
||||
if ((command = malloc(sizeof(char) * MAX_COMMAND_SIZE)) == NULL) {
|
||||
fprintf(stderr, "Unable to allocate memory for command");
|
||||
}
|
||||
|
||||
if ((p1 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) {
|
||||
fprintf(stderr, "Unable to allocate memory for parameters");
|
||||
}
|
||||
|
||||
if ((p2 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) {
|
||||
fprintf(stderr, "Unable to allocate memory for parameters");
|
||||
}
|
||||
|
||||
while (prompt) {
|
||||
fprintf(stdout, "> ");
|
||||
|
||||
fscanf(stdin, "%s", command);
|
||||
|
||||
if (strncmp(command, "exit", MAX_COMMAND_SIZE) == 0) {
|
||||
fprintf(stdout, "Quitting...\n");
|
||||
prompt = 0;
|
||||
} else if (strncmp(command, "help", MAX_COMMAND_SIZE) == 0) {
|
||||
print_help_text();
|
||||
} else if (strncmp(command, "weka", MAX_COMMAND_SIZE) == 0) {
|
||||
if (train_file) {
|
||||
if (weka_output(to_train, "procon_train.arff") != 0) {
|
||||
fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n");
|
||||
} else {
|
||||
fprintf(stdout, "Wrote training data to weka format.\n");
|
||||
}
|
||||
}
|
||||
if (classify_file) {
|
||||
if (weka_output(to_classify, "procon_test.arff") != 0) {
|
||||
fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n");
|
||||
} else {
|
||||
fprintf(stdout, "Wrote test data to weka format.\n");
|
||||
}
|
||||
}
|
||||
} else if (strncmp(command, "load", MAX_COMMAND_SIZE) == 0) {
|
||||
if (fscanf(stdin, "%s %s", p1, p2) == 2) {
|
||||
if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) {
|
||||
free_data(to_train);
|
||||
to_train = read_data(p2);
|
||||
fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count);
|
||||
train_file = 1;
|
||||
} else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) {
|
||||
free_data(to_classify);
|
||||
to_classify = read_data(p2);
|
||||
fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count);
|
||||
classify_file = 1;
|
||||
} else if (strncmp(p1, "stop", MAX_COMMAND_SIZE) == 0) {
|
||||
free(stop_word_array);
|
||||
stop_word_array = load_stop_words(p2, &stop_word_count);
|
||||
fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count);
|
||||
stop_words = 1;
|
||||
} else if (strncmp(p1, "model", MAX_COMMAND_SIZE) == 0) {
|
||||
if (classifier != NULL) {
|
||||
free_model(classifier);
|
||||
}
|
||||
classifier = load_model(p2);
|
||||
if (classifier != NULL) {
|
||||
fprintf(stdout, "Loaded model from %s\n", p2);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "load must specify data type and a file name.\n");
|
||||
}
|
||||
} else if (strncmp(command, "set", MAX_COMMAND_SIZE) == 0) {
|
||||
if (fscanf(stdin, "%s %s", p1, p2) == 2) {
|
||||
double p2_value = 0;
|
||||
if (sscanf(p2, "%lf", &p2_value) == 1) {
|
||||
if (strncmp(p1, "split", MAX_COMMAND_SIZE) == 0) {
|
||||
train_test_split_percent = (int)p2_value;
|
||||
} else if (strncmp(p1, "lrate", MAX_COMMAND_SIZE) == 0) {
|
||||
learning_rate = p2_value;
|
||||
fprintf(stdout,"The Learning Rate is now set to %lf.\n", learning_rate);
|
||||
} else if (strncmp(p1, "hnodes", MAX_COMMAND_SIZE) == 0) {
|
||||
hidden_nodes_per_layer = p2_value;
|
||||
fprintf(stdout,"There will be %d nodes in each hidden layer.\n", hidden_nodes_per_layer);
|
||||
} else if (strncmp(p1, "vector", MAX_COMMAND_SIZE) == 0) {
|
||||
input_vector_size = p2_value;
|
||||
fprintf(stdout,"The vector representation is now %d.\n", input_vector_size);
|
||||
} else if (strncmp(p1, "hlayers", MAX_COMMAND_SIZE) == 0) {
|
||||
network_layers = p2_value + 1;
|
||||
fprintf(stdout,"There will be %d hidden layers.\n", (network_layers - 1));
|
||||
} else if (strncmp(p1, "epochs", MAX_COMMAND_SIZE) == 0) {
|
||||
epochs = p2_value;
|
||||
fprintf(stdout,"The maximum number of epochs is now %d.\n", epochs);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "You must provide a valid integer value for set.\n");
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "set must specify paramter and a value.\n");
|
||||
}
|
||||
} else if (strncmp(command, "learn", MAX_COMMAND_SIZE) == 0) {
|
||||
if (classifier != NULL) {
|
||||
free_model(classifier);
|
||||
}
|
||||
if (!stop_words) {
|
||||
stop_word_array = NULL;
|
||||
}
|
||||
|
||||
// Using the selected Train Data, determine the terms to be used for the vector
|
||||
create_vector_represntation(to_train, stop_word_array, stop_word_count, input_vector_size);
|
||||
|
||||
// Create Random Split
|
||||
if (train_test_split(to_train, train_test_split_percent, train_set, test_set) != 0) {
|
||||
fprintf(stderr, "Unable to create training and test sets.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Create the vector representations of the training set, and test set
|
||||
vector_representation(train_set, to_train->vector_terms,
|
||||
to_train->vector_document_counts, input_vector_size);
|
||||
|
||||
vector_representation(test_set, to_train->vector_terms,
|
||||
to_train->vector_document_counts, input_vector_size);
|
||||
|
||||
int nodes[network_layers];
|
||||
|
||||
for (int i = 0; i < network_layers; i++) {
|
||||
if (i == 0) {
|
||||
nodes[i] = input_vector_size;
|
||||
} else {
|
||||
nodes[i] = hidden_nodes_per_layer;
|
||||
}
|
||||
}
|
||||
|
||||
classifier = create_model(learning_rate, network_layers, nodes, PRO_CON_OUTPUT);
|
||||
|
||||
classifier = train_model(classifier, train_set, test_set, desired_test_error_min, epochs);
|
||||
|
||||
classify_dataset(classifier, train_set);
|
||||
classify_dataset(classifier, test_set);
|
||||
|
||||
fprintf(stdout, "\nModel Performances on the training set\n");
|
||||
print_confusion_matrix(train_set);
|
||||
fprintf(stdout, "\nModel Performances on the test set\n");
|
||||
print_confusion_matrix(test_set);
|
||||
} else if (strncmp(command, "classify", MAX_COMMAND_SIZE) == 0) {
|
||||
if (classify_file == 0) {
|
||||
fprintf(stdout, "Please load a the dataset that should be classified\n");
|
||||
} else {
|
||||
if (classifier == NULL) {
|
||||
fprintf(stdout, "Please train the model to classify first\n");
|
||||
} else {
|
||||
vector_representation(to_classify, to_train->vector_terms,
|
||||
to_train->vector_document_counts, input_vector_size);
|
||||
classify_dataset(classifier, to_classify);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (strncmp(command, "csv", MAX_COMMAND_SIZE) == 0) {
|
||||
if (fscanf(stdin, "%s", p1) == 1) {
|
||||
if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) {
|
||||
char * train_csv = "training.csv";
|
||||
csv_output(to_train, train_csv);
|
||||
fprintf(stdout, "Wrote training set to csv (%s)\n", train_csv);
|
||||
} else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) {
|
||||
char * classify_csv = "classify.csv";
|
||||
csv_output(to_classify, classify_csv);
|
||||
fprintf(stdout, "Wrote classify set to csv (%s)\n", classify_csv);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "csv must specify dataset to output.\n");
|
||||
}
|
||||
} else if (strncmp(command, "save", MAX_COMMAND_SIZE) == 0) {
|
||||
if (fscanf(stdin, "%s", p1) == 1) {
|
||||
if (save_model(classifier, p1) == 0) {
|
||||
fprintf(stdout, "Saved mode to %s\n", p1);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "save needs a filename to save the model to.\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(command);
|
||||
free(p1);
|
||||
free(p2);
|
||||
|
||||
if (stop_words) {
|
||||
free(stop_word_array);
|
||||
};
|
||||
|
||||
if (classifier != NULL) {
|
||||
free_model(classifier);
|
||||
}
|
||||
|
||||
if (to_train != NULL) {
|
||||
free_data(to_train);
|
||||
}
|
||||
|
||||
if (to_classify != NULL) {
|
||||
free_data(to_classify);
|
||||
}
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
/*
    Print a one-line usage summary to stderr and terminate the process with
    a failure status. Never returns.

    app: the program name to embed in the message (typically argv[0]).
*/
void usage(char * app) {
    static const char * const fmt =
        "Usage: %s [-tT] [<training data>] [<test data>]\n";

    fprintf(stderr, fmt, app);
    exit(EXIT_FAILURE);
}
|
||||
|
||||
/*
    Print the list of interactive commands, one per line, to stdout.
*/
void print_help_text() {
    static const char * const help_lines[] = {
        "Available commands are:",
        "weka - no arguments, will write out training and classification data to .arff files",
        "csv <train|classify> - will write out data to .csv files",
        "load <train|classify|stop> <path to file> - load training data, classify data",
        "set <split|lrate|hnodes|hlayers|vector|epochs> <number> - set a value for any of the listed rates",
        "learn - using the loaded training data, the set split and learning rate create a model",
        "classify - using the learned model classify the loaded classify data",
        "exit - quit the program",
    };

    for (size_t i = 0; i < sizeof help_lines / sizeof help_lines[0]; i++) {
        fprintf(stdout, "%s\n", help_lines[i]);
    }
}
|
Reference in New Issue
Block a user