/* Author: James Griffin-Allwood Date: March 4 2014 Description: */ #include #include #include #include #include "process.h" #include "nn.h" #define MAX_PARAM_SIZE 255 #define MAX_COMMAND_SIZE 10 #define DEFAULT_TRAIN_TEST_SPLIT 25 /* A function that prints usage information for this application */ void usage(char * app); /* A function that prints command information */ void print_help_text(); int main(int argc, char * argv[]) { char * train; char * classify; char * stop; char ** stop_word_array; int prompt = 0, train_file = 0, classify_file = 0, stop_words = 0; int stop_word_count = 0; int train_test_split_percent = DEFAULT_TRAIN_TEST_SPLIT; int network_layers = DEFAULT_LAYERS; int input_vector_size = VECTOR_SIZE; double learning_rate = DEFAULT_LEARNING_RATE; double desired_test_error_min = DEFAULT_TEST_ERROR_MIN; int epochs = DEFAULT_MAX_EPOCHS_SINCE_MIN; int hidden_nodes_per_layer = DEFAULT_HIDDEN_NODES_PER_LAYER; data * to_train = NULL, * to_classify = NULL, * train_set = NULL, * test_set = NULL; nn_model * classifier = NULL; // Process cli arguments and determine if there were flags. // If no flags and just file names load data and write weka files if (argc == 1) { prompt = 1; } else if (argc > 1) { if (argv[1][0] == '-') { int params = strlen(argv[1]); prompt = 1; for (int i = 1; i < params; i++) { switch (argv[1][i]) { case 't': train_file = 1; break; case 'c': classify_file = 1; break; case 's': stop_words = 1; break; default: break; } } } else { if (argc == 3) { classify = argv[2]; to_classify = read_data(classify); if (weka_output(to_classify, "procon_classify.arff") != 0) { fprintf(stderr, "Unable to write weka_formatted file procon_classify.arff\n"); } free_data(to_classify); } train = argv[1]; to_train = read_data(train); if (weka_output(to_train, "procon_train.arff") != 0) { fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); } free_data(to_train); return EXIT_SUCCESS; } } if ((train_set = calloc(1, sizeof(data))) == NULL) { fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); return 1; } if ((test_set = calloc(1, sizeof(data))) == NULL) { fprintf(stderr, "Unable to allocate memory for messages to be stored\n"); return 1; } // If flags are set for commandline training or test data get file names and read data if (train_file) { if (!classify_file) { train = argv[argc - 1]; } else { train = argv[argc - 2]; } to_train = read_data(train); fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count); } if (classify_file) { classify = argv[argc - 1]; to_classify = read_data(classify); fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count); } if (stop_words) { if (!classify_file) { if (!train_file) { stop = argv[argc - 1]; } else { stop = argv[argc - 2]; } } else { if (!train_file) { stop = argv[argc - 2]; } else { stop = argv[argc - 3]; } } stop_word_array = load_stop_words(stop, &stop_word_count); fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count); } /* Begin terminal interface. Exit on "exit". This interface allows you to load, specify learning parameters, learn, classify, write output data. */ fprintf(stdout, "Pro/Con Learner.\nEnter commands below. (type 'help' for commands)\n"); char * command, * p1, * p2; if ((command = malloc(sizeof(char) * MAX_COMMAND_SIZE)) == NULL) { fprintf(stderr, "Unable to allocate memory for command"); } if ((p1 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) { fprintf(stderr, "Unable to allocate memory for parameters"); } if ((p2 = malloc(sizeof(char) * MAX_PARAM_SIZE)) == NULL) { fprintf(stderr, "Unable to allocate memory for parameters"); } while (prompt) { fprintf(stdout, "> "); fscanf(stdin, "%s", command); if (strncmp(command, "exit", MAX_COMMAND_SIZE) == 0) { fprintf(stdout, "Quitting...\n"); prompt = 0; } else if (strncmp(command, "help", MAX_COMMAND_SIZE) == 0) { print_help_text(); } else if (strncmp(command, "weka", MAX_COMMAND_SIZE) == 0) { if (train_file) { if (weka_output(to_train, "procon_train.arff") != 0) { fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); } else { fprintf(stdout, "Wrote training data to weka format.\n"); } } if (classify_file) { if (weka_output(to_classify, "procon_test.arff") != 0) { fprintf(stderr, "Unable to write weka_formatted file procon_test.arff\n"); } else { fprintf(stdout, "Wrote test data to weka format.\n"); } } } else if (strncmp(command, "load", MAX_COMMAND_SIZE) == 0) { if (fscanf(stdin, "%s %s", p1, p2) == 2) { if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) { free_data(to_train); to_train = read_data(p2); fprintf(stdout, "Read in training set specified (%d instances)\n", to_train->count); train_file = 1; } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) { free_data(to_classify); to_classify = read_data(p2); fprintf(stdout, "Read in test set specified (%d instances)\n", to_classify->count); classify_file = 1; } else if (strncmp(p1, "stop", MAX_COMMAND_SIZE) == 0) { free(stop_word_array); stop_word_array = load_stop_words(p2, &stop_word_count); fprintf(stdout, "Read in stop words (%d words)\n", stop_word_count); stop_words = 1; } else if (strncmp(p1, "model", MAX_COMMAND_SIZE) == 0) { if (classifier != NULL) { free_model(classifier); } classifier = load_model(p2); if (classifier != NULL) { fprintf(stdout, "Loaded model from %s\n", p2); } } } else { fprintf(stderr, "load must specify data type and a file name.\n"); } } else if (strncmp(command, "set", MAX_COMMAND_SIZE) == 0) { if (fscanf(stdin, "%s %s", p1, p2) == 2) { double p2_value = 0; if (sscanf(p2, "%lf", &p2_value) == 1) { if (strncmp(p1, "split", MAX_COMMAND_SIZE) == 0) { train_test_split_percent = (int)p2_value; } else if (strncmp(p1, "lrate", MAX_COMMAND_SIZE) == 0) { learning_rate = p2_value; fprintf(stdout,"The Learning Rate is now set to %lf.\n", learning_rate); } else if (strncmp(p1, "hnodes", MAX_COMMAND_SIZE) == 0) { hidden_nodes_per_layer = p2_value; fprintf(stdout,"There will be %d nodes in each hidden layer.\n", hidden_nodes_per_layer); } else if (strncmp(p1, "vector", MAX_COMMAND_SIZE) == 0) { input_vector_size = p2_value; fprintf(stdout,"The vector representation is now %d.\n", input_vector_size); } else if (strncmp(p1, "hlayers", MAX_COMMAND_SIZE) == 0) { network_layers = p2_value + 1; fprintf(stdout,"There will be %d hidden layers.\n", (network_layers - 1)); } else if (strncmp(p1, "epochs", MAX_COMMAND_SIZE) == 0) { epochs = p2_value; fprintf(stdout,"The maximum number of epochs is now %d.\n", epochs); } } else { fprintf(stderr, "You must provide a valid integer value for set.\n"); } } else { fprintf(stderr, "set must specify paramter and a value.\n"); } } else if (strncmp(command, "learn", MAX_COMMAND_SIZE) == 0) { if (classifier != NULL) { free_model(classifier); } if (!stop_words) { stop_word_array = NULL; } // Using the selected Train Data, determine the terms to be used for the vector create_vector_represntation(to_train, stop_word_array, stop_word_count, input_vector_size); // Create Random Split if (train_test_split(to_train, train_test_split_percent, train_set, test_set) != 0) { fprintf(stderr, "Unable to create training and test sets.\n"); return 1; } // Create the vector representations of the training set, and test set vector_representation(train_set, to_train->vector_terms, to_train->vector_document_counts, input_vector_size); vector_representation(test_set, to_train->vector_terms, to_train->vector_document_counts, input_vector_size); int nodes[network_layers]; for (int i = 0; i < network_layers; i++) { if (i == 0) { nodes[i] = input_vector_size; } else { nodes[i] = hidden_nodes_per_layer; } } classifier = create_model(learning_rate, network_layers, nodes, PRO_CON_OUTPUT); classifier = train_model(classifier, train_set, test_set, desired_test_error_min, epochs); classify_dataset(classifier, train_set); classify_dataset(classifier, test_set); fprintf(stdout, "\nModel Performances on the training set\n"); print_confusion_matrix(train_set); fprintf(stdout, "\nModel Performances on the test set\n"); print_confusion_matrix(test_set); } else if (strncmp(command, "classify", MAX_COMMAND_SIZE) == 0) { if (classify_file == 0) { fprintf(stdout, "Please load a the dataset that should be classified\n"); } else { if (classifier == NULL) { fprintf(stdout, "Please train the model to classify first\n"); } else { vector_representation(to_classify, to_train->vector_terms, to_train->vector_document_counts, input_vector_size); classify_dataset(classifier, to_classify); } } } else if (strncmp(command, "csv", MAX_COMMAND_SIZE) == 0) { if (fscanf(stdin, "%s", p1) == 1) { if (strncmp(p1, "train", MAX_COMMAND_SIZE) == 0) { char * train_csv = "training.csv"; csv_output(to_train, train_csv); fprintf(stdout, "Wrote training set to csv (%s)\n", train_csv); } else if (strncmp(p1, "classify", MAX_COMMAND_SIZE) == 0) { char * classify_csv = "classify.csv"; csv_output(to_classify, classify_csv); fprintf(stdout, "Wrote classify set to csv (%s)\n", classify_csv); } } else { fprintf(stderr, "csv must specify dataset to output.\n"); } } else if (strncmp(command, "save", MAX_COMMAND_SIZE) == 0) { if (fscanf(stdin, "%s", p1) == 1) { if (save_model(classifier, p1) == 0) { fprintf(stdout, "Saved mode to %s\n", p1); } } else { fprintf(stderr, "save needs a filename to save the model to.\n"); } } } free(command); free(p1); free(p2); if (stop_words) { free(stop_word_array); }; if (classifier != NULL) { free_model(classifier); } if (to_train != NULL) { free_data(to_train); } if (to_classify != NULL) { free_data(to_classify); } return EXIT_SUCCESS; } void usage(char * app) { fprintf(stderr, "Usage: %s [-tT] [] []\n", app); exit(EXIT_FAILURE); } void print_help_text() { fprintf(stdout, "Available commands are:\n"); fprintf(stdout, "weka - no arguments, will write out training and classification data to .arff files\n"); fprintf(stdout, "csv - will write out data to .csv files\n"); fprintf(stdout, "load - load training data, classify data\n"); fprintf(stdout, "set - set a value for any of the listed rates\n"); fprintf(stdout, "learn - using the loaded training data, the set split and learning rate create a model\n"); fprintf(stdout, "classify - using the learned model classify the loaded classify data\n"); fprintf(stdout, "exit - quit the program\n"); }