initial github commit

Implementation of backprop in C using Grand Central Dispatch and Blocks
This commit is contained in:
James Griffin
2014-08-06 15:12:09 -03:00
commit 6f23634e32
6 changed files with 1926 additions and 0 deletions

507
process.c Normal file
View File

@@ -0,0 +1,507 @@
/*
Author: James Griffin-Allwood
Date: March 4 2014
Description: Implementations of reading in messages formatted with <Pros>/<Cons>/<Labs>
tags, building tf-idf vector representations, and writing Weka ARFF / CSV output
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <Block.h>
#include <dispatch/dispatch.h>
#include "process.h"
/* One entry of the term dictionary built by create_vector_represntation():
 * a term plus its corpus-wide statistics. */
typedef struct dictionary_word {
    char * word;          /* heap-allocated, NUL-terminated term text */
    int count;            /* total occurrences of the term across all messages */
    int document_count;   /* number of messages containing the term at least once */
} dictionary_word;
/*
 * Release a data set and everything it owns: each message struct and its
 * strdup'd text, the instances array, the vector_terms array, and the
 * data struct itself.
 *
 * Returns 0 on success, 1 when to_free is NULL.
 *
 * NOTE(review): vector_terms entries are heap strings too, but this struct
 * does not record how many there are, so only the array itself is freed
 * here — confirm against process.h whether a term count is available.
 */
int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }
    if (to_free->instances != NULL) {
        /* The original freed only the pointer array, leaking every
         * message and its text. */
        for (int i = 0; i < to_free->count; i++) {
            if (to_free->instances[i] != NULL) {
                free(to_free->instances[i]->text);
                free(to_free->instances[i]);
            }
        }
        free(to_free->instances);
    }
    free(to_free->vector_terms); /* free(NULL) is a no-op; no guard needed */
    free(to_free);
    return 0;
}
/*
 * Read a file of one-message-per-line records and build a data set.
 * A line tagged <Pros>...</Pros> gets class PRO, <Cons>...</Cons> gets CON,
 * and <Labs>...</Labs> gets UNKNOWN; an untagged line stores empty text
 * (and keeps the previous line's class, as the original code did).
 *
 * Exits the process on I/O or allocation failure, matching the rest of
 * this module. The caller owns the returned data; release it with
 * free_data().
 */
struct data * read_data(char * file) {
    FILE * temp;
    data * data_buffer;
    char * line;
    char * message_buffer;
    int class = -1;
    char * pro = "<Pros>";
    char * pro_close = "</Pros>";
    char * con = "<Cons>";
    char * con_close = "</Cons>";
    char * unknown = "<Labs>";
    char * unknown_close = "</Labs>";
    int lines = 0;
    int max_line_size = 0;
    int line_count = 0;
    int c; /* int, not char: fgetc() returns EOF, which is outside char's range */
    if ((temp = fopen(file, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }
    /* First pass: count lines and measure the longest one (incl. '\n'). */
    while ((c = fgetc(temp)) != EOF) {
        line_count++;
        if (c == '\n') {
            ++lines;
            if (line_count > max_line_size)
                max_line_size = line_count;
            line_count = 0;
        }
    }
    rewind(temp);
    if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }
    if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }
    /* +2: room for the newline and the terminating NUL, so fgets() never
     * truncates the longest line (truncation would desynchronize the
     * one-fgets-per-line loop below). */
    if ((line = malloc((size_t)max_line_size + 2)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be read\n");
        exit(EXIT_FAILURE);
    }
    data_buffer->count = lines;
    for (int i = 0; i < lines; i++) {
        if (fgets(line, max_line_size + 2, temp) != NULL) {
            message_buffer = NULL;
            if (strstr(line, pro) != NULL) {
                char * start = strstr(line, pro) + strlen(pro);
                char * end = strstr(start, pro_close);
                if (end == NULL)   /* unterminated tag: take the rest of the line */
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = PRO;
            } else if (strstr(line, con) != NULL) {
                char * start = strstr(line, con) + strlen(con);
                char * end = strstr(start, con_close);
                if (end == NULL)
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = CON;
            } else if (strstr(line, unknown) != NULL) {
                char * start = strstr(line, unknown) + strlen(unknown);
                char * end = strstr(start, unknown_close);
                if (end == NULL)
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = UNKNOWN;
            }
            if (message_buffer == NULL) {
                message_buffer = strdup("");
            }
            if ((data_buffer->instances[i] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                exit(EXIT_FAILURE);
            }
            /* Hand ownership of the extracted text straight to the message.
             * The original strndup'd it a second time and leaked the first
             * copy on every tagged line. */
            data_buffer->instances[i]->text = message_buffer;
            data_buffer->instances[i]->class = class;
        }
    }
    free(line);
    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }
    return data_buffer;
}
/*
 * Write the data set as a Weka ARFF file: a string attribute for the
 * message text (single quotes escaped) and a {0,1} class attribute,
 * with '?' for UNKNOWN instances.
 *
 * Returns 0 on success, 1 on any file or allocation error.
 */
int weka_output(data * print_data, char * out_file) {
    FILE * out;
    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }
    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");
    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one copy per row. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);
        if (escaped == NULL) {
            fclose(out);
            return 1;
        }
        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n", escaped);
        } else {
            fprintf(out, "'%s',%d\n", escaped, print_data->instances[i]->class);
        }
        free(escaped);
    }
    if (fclose(out) == EOF) {
        return 1;
    }
    return 0;
}
/*
 * Write predictions as CSV rows of "label,'text'": "Pro"/"Con" chosen by
 * the larger prediction probability, or "?" when the prediction is UNKNOWN.
 * Single quotes in the text are escaped.
 *
 * Returns 0 on success, 1 on any file or allocation error.
 */
int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";
    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }
    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one copy per row. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);
        if (escaped == NULL) {
            fclose(out);
            return 1;
        }
        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n", escaped);
        } else {
            /* probability[0] is the CON score, probability[1] the PRO score */
            if (print_data->instances[i]->prediction_probability[0]
                > print_data->instances[i]->prediction_probability[1]) {
                fprintf(out, "%s,'%s'\n", con, escaped);
            } else {
                fprintf(out, "%s,'%s'\n", pro, escaped);
            }
        }
        free(escaped);
    }
    if (fclose(out) == EOF) {
        return 1;
    }
    return 0;
}
/*
 * Return a newly malloc'd copy of str with every single quote (')
 * replaced by a backslash-quote pair (\'). The caller owns the result.
 * Returns NULL on allocation failure.
 */
char * escape_single_quote(const char *str) {
    /* Pass 1: count quotes so the output buffer can be sized exactly. */
    size_t quotes = 0;
    for (const char *scan = str; (scan = strchr(scan, '\'')) != NULL; scan++) {
        quotes++;
    }
    /* Each quote grows by one character (the inserted backslash). */
    size_t out_len = strlen(str) + quotes;
    char *escaped = malloc(out_len + 1);
    if (escaped == NULL) {
        return NULL;
    }
    /* Pass 2: copy character by character, inserting '\\' before each quote. */
    char *dst = escaped;
    for (const char *src = str; *src != '\0'; src++) {
        if (*src == '\'') {
            *dst++ = '\\';
        }
        *dst++ = *src;
    }
    *dst = '\0';
    return escaped;
}
/*
 * Load whitespace-separated stop words from filename. On return,
 * *word_count holds the number of words and the returned array holds one
 * strndup'd entry per word (caller frees each entry and the array).
 *
 * Returns NULL on allocation failure; exits on open failure, matching
 * read_data(). MAX_TERM_LENGTH (from process.h) bounds each word.
 */
char ** load_stop_words(char * filename, int * word_count) {
    FILE * temp;
    char * line;
    int index = 0;
    char ** words;
    char format[16];
    * word_count = 0;
    if ((temp = fopen(filename, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }
    if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
        fclose(temp);
        return NULL;
    }
    /* Build a width-limited "%Ns" conversion so a long token cannot
     * overflow `line` — the original's bare "%s" was a buffer overflow. */
    snprintf(format, sizeof(format), "%%%ds", MAX_TERM_LENGTH - 1);
    /* Pass 1: count words. */
    while (fscanf(temp, format, line) == 1) {
        (* word_count)++;
    }
    rewind(temp);
    if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words\n");
        free(line);
        fclose(temp);
        return NULL;
    }
    /* Pass 2: read the words; never write past the count from pass 1. */
    while (index < * word_count && fscanf(temp, format, line) == 1) {
        words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
        index++;
    }
    free(line);
    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }
    return words;
}
/*
 * Randomly split `dataset` into `test` (percent% of instances) and
 * `train` (the remainder). Test instances are chosen without replacement
 * by rejection sampling over arc4random_uniform(); each selected message
 * is deep-copied (text, class, prediction, text_vector).
 *
 * Returns 0 on success, 1 on allocation failure.
 *
 * NOTE(review): prediction_probability is not copied — presumably it is
 * recomputed after the split; confirm against the callers.
 */
int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    /* percent is the TEST share; train gets (100 - percent)%. */
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    /* VLA of the randomly chosen test indices (used to exclude them from
     * the train pass below). */
    int test_indexes[test_instances];
    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }
    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }
    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;
        /* Rejection-sample until we draw an index not already taken. */
        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);
            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }
            if (!is_found) {
                new_index = 1;
            }
        }
        test_indexes[i] = random_index;
        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }
        /* Deep copy of the selected message into the test set. */
        test->instances[i]->text
            = strndup(dataset->instances[random_index]->text,
                      strlen(dataset->instances[random_index]->text));
        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        /* text_vector is a fixed-size array member, so sizeof gives the
         * full vector size here. */
        memcpy(test->instances[i]->text_vector,
               dataset->instances[random_index]->text_vector,
               sizeof(dataset->instances[random_index]->text_vector));
    }
    test->count = test_instances;
    /* Everything not selected for test goes to train, in original order. */
    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;
        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }
        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }
            train->instances[train_index]->text
                = strndup(dataset->instances[i]->text,
                          strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector,
                   dataset->instances[i]->text_vector,
                   sizeof(dataset->instances[i]->text_vector));
            train_index++;
        }
    }
    train->count = train_instances;
    return 0;
}
/*
 * Build the term dictionary for a data set and publish the top `size`
 * terms (sorted by total count, descending, via compare_strings) into
 * dataset->vector_terms with their document frequencies in
 * dataset->vector_document_counts. Unused slots get "" / 0.
 *
 * Tokens are split on ".,?! " and compared case-insensitively; words in
 * stop_words (may be NULL) are skipped. Returns 0 on success, 1 on
 * allocation failure.
 */
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;
    /* 0 out the vector counts for dataset */
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }
    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }
    fprintf(stdout,"%d instances considered for vector\n", dataset->count);
    for (int i = 0; i < dataset->count; i++) {
        char * token, * string, * string_base;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }
        /* strsep() advances `string` to NULL; keep the base pointer so the
         * buffer can actually be freed (the original freed NULL → leak). */
        string_base = string;
        int * terms_per_document;
        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            free(string_base);
            return 1;
        }
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;
            if (strcmp(token, "") != 0) {
                /* Known word: bump its global and per-document counts. */
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }
                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }
                if (!word_found && !is_stop_word) {
                    word_count++;
                    if (word_count > allocated) {
                        /* Grow both parallel arrays to 2 * word_count. */
                        int old_allocated = allocated;
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary,
                                sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            free(terms_per_document);
                            free(string_base);
                            return 1;
                        }
                        dictionary = more_words;
                        allocated = 2 * word_count;
                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document,
                                sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            free(terms_per_document);
                            free(string_base);
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                        /* realloc() does not zero the new tail; without this
                         * the document-count pass below reads garbage. */
                        memset(terms_per_document + old_allocated, 0,
                               sizeof(int) * (allocated - old_allocated));
                    }
                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }
                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }
                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }
        /* Any term seen in this message counts once toward its document
         * frequency. */
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }
        free(terms_per_document); /* original leaked this on every instance */
        free(string_base);
    }
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);
    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }
    /* Publish the `size` most frequent terms; pad with "" when fewer exist. */
    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }
    fprintf(stdout, "Found %d different words\n", word_count);
    /* Free the entries, not just the pointer array (original leaked both
     * the dictionary_word structs and their word strings). */
    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);
    return 0;
}
int compare_strings(const void * a, const void * b) {
const dictionary_word * word_a = *(dictionary_word **) a;
const dictionary_word * word_b = *(dictionary_word **) b;
return (word_b->count - word_a->count);
}
/*
 * Fill each message's text_vector with tf-idf weights over vector_terms:
 * first raw term frequencies (case-insensitive match on ".,?! "-separated
 * tokens), then each entry scaled by log(N / document_count).
 *
 * Always returns 0. Each message is processed independently, so the work
 * is fanned out across threads with Grand Central Dispatch.
 */
int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string, * string_base;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return; /* original kept going and dereferenced NULL */
        }
        /* strsep() advances `string` to NULL; remember the allocation so it
         * can be freed (the original freed NULL → leak per message). */
        string_base = string;
        /* Pass 1: zero, then accumulate raw term frequencies. */
        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }
        /* Pass 2: convert counts to tf-idf. */
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;
            /* Index by term (index), not by message (i): the original read
             * the wrong document count for every term. */
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }
            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;
            dataset->instances[i]->text_vector[index] = tfidf;
        }
        free(string_base);
    });
    return 0;
}