508 lines
13 KiB
C
508 lines
13 KiB
C
/*
|
|
Author: James Griffin-Allwood
|
|
Date: March 4 2014
|
|
|
|
Description: Implementations for reading in tagged messages, writing ARFF/CSV output, and building term-vector representations
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stddef.h>
|
|
#include <math.h>
|
|
#include <Block.h>
|
|
#include <dispatch/dispatch.h>
|
|
#include "process.h"
|
|
|
|
/*
 * One entry in the term dictionary built by create_vector_represntation():
 * a term together with its corpus-wide frequency statistics.
 */
typedef struct dictionary_word {

    char * word;           /* heap-allocated copy of the term */

    int count;             /* total occurrences across all messages */

    int document_count;    /* number of messages the term appears in */

} dictionary_word;
|
|
|
|
/*
 * Release a data struct produced by read_data() (or populated by
 * train_test_split()), including each message instance and its
 * duplicated text.
 *
 * Returns 0 on success, 1 if to_free is NULL.
 *
 * NOTE(review): the strings inside vector_terms are not freed here
 * because the vector size is not stored on the struct — confirm
 * whether callers care about that leak.
 */
int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }

    if (to_free->instances != NULL) {
        /* Each instance and its text are individually heap-allocated;
         * the original code freed only the pointer array and leaked them. */
        for (int i = 0; i < to_free->count; i++) {
            if (to_free->instances[i] != NULL) {
                free(to_free->instances[i]->text);
                free(to_free->instances[i]);
            }
        }
        free(to_free->instances);
    }

    /* free(NULL) is a no-op, so no guard is needed. */
    free(to_free->vector_terms);

    free(to_free);

    return 0;
}
|
|
|
|
struct data * read_data(char * file) {
|
|
FILE * temp;
|
|
data * data_buffer;
|
|
char * line;
|
|
char * message_buffer;
|
|
int class = -1;
|
|
|
|
char * pro = "<Pros>";
|
|
char * pro_close = "</Pros>";
|
|
char * con = "<Cons>";
|
|
char * con_close = "</Cons>";
|
|
char * unknown = "<Labs>";
|
|
char * unknown_close = "</Labs>";
|
|
|
|
int lines = 0;
|
|
int max_line_size = 0;
|
|
int line_count = 0;
|
|
char c;
|
|
|
|
if ((temp = fopen(file, "r")) == NULL) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
while ((c = fgetc(temp)) != EOF) {
|
|
line_count++;
|
|
if (c == '\n') {
|
|
++lines;
|
|
if (line_count > max_line_size)
|
|
max_line_size = line_count;
|
|
line_count = 0;
|
|
}
|
|
}
|
|
|
|
rewind(temp);
|
|
|
|
if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((line = malloc(sizeof(char) * max_line_size)) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be read\n");
|
|
}
|
|
|
|
data_buffer->count = lines;
|
|
|
|
for (int i = 0; i < lines; i++) {
|
|
if (fgets(line, max_line_size, temp) != NULL) {
|
|
if (strstr(line, pro) != NULL) {
|
|
char * start = strstr(line, pro) + (sizeof(char) * strlen(pro));
|
|
char * end = strstr(line, pro_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = PRO;
|
|
} else if (strstr(line, con) != NULL) {
|
|
char * start = strstr(line, con) + (sizeof(char) * strlen(con));
|
|
char * end = strstr(line, con_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = CON;
|
|
} else if (strstr(line, unknown) != NULL) {
|
|
char * start = strstr(line, unknown) + (sizeof(char) * strlen(unknown));
|
|
char * end = strstr(line, unknown_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = UNKNOWN;
|
|
}else {
|
|
message_buffer = "";
|
|
}
|
|
|
|
data_buffer->instances[i] = calloc(1, sizeof(message));
|
|
|
|
data_buffer->instances[i]->text = strndup(message_buffer, strlen(message_buffer));
|
|
data_buffer->instances[i]->class = class;
|
|
}
|
|
}
|
|
|
|
free(line);
|
|
|
|
if (fclose(temp) == EOF) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return data_buffer;
|
|
}
|
|
|
|
/*
 * Write print_data to out_file in Weka ARFF format: one quoted message
 * string per line followed by its class (0/1), or '?' for UNKNOWN.
 *
 * Returns 0 on success, 1 on any open, allocation, or close failure.
 */
int weka_output(data * print_data, char * out_file) {
    FILE * out;

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");

    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one
         * string per instance and never checked for NULL. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (escaped == NULL) {
            fclose(out);
            return 1;
        }

        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n", escaped);
        } else {
            fprintf(out, "'%s',%d\n", escaped, print_data->instances[i]->class);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}
|
|
|
|
/*
 * Write print_data to out_file as CSV: predicted label ("Pro"/"Con",
 * or '?' when the prediction is UNKNOWN) followed by the quoted
 * message text. The label is chosen by comparing the two prediction
 * probabilities.
 *
 * Returns 0 on success, 1 on any open, allocation, or close failure.
 */
int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one
         * string per instance and never checked for NULL. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (escaped == NULL) {
            fclose(out);
            return 1;
        }

        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n", escaped);
        } else if (print_data->instances[i]->prediction_probability[0]
                   > print_data->instances[i]->prediction_probability[1]) {
            fprintf(out, "%s,'%s'\n", con, escaped);
        } else {
            fprintf(out, "%s,'%s'\n", pro, escaped);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}
|
|
|
|
/*
 * Return a newly allocated copy of str with every single quote (')
 * replaced by the two-character sequence \' — suitable for embedding
 * in a single-quoted field.
 *
 * Returns NULL if memory cannot be allocated. The caller owns the
 * returned buffer.
 */
char * escape_single_quote(const char *str) {
    size_t quotes = 0;

    /* Count the quotes so the result can be sized in one allocation:
     * each quote grows the string by exactly one byte. */
    for (const char *scan = strchr(str, '\''); scan != NULL;
         scan = strchr(scan + 1, '\'')) {
        quotes++;
    }

    char *result = malloc(strlen(str) + quotes + 1);
    if (result == NULL) {
        return NULL;
    }

    /* Single pass: copy characters, inserting a backslash before each quote. */
    char *out = result;
    for (const char *in = str; *in != '\0'; in++) {
        if (*in == '\'') {
            *out++ = '\\';
        }
        *out++ = *in;
    }
    *out = '\0';

    return result;
}
|
|
|
|
char ** load_stop_words(char * filename, int * word_count) {
|
|
FILE * temp;
|
|
char * line;
|
|
int index = 0;
|
|
char ** words;
|
|
|
|
* word_count = 0;
|
|
|
|
if ((temp = fopen(filename, "r")) == NULL) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
|
|
return NULL;
|
|
}
|
|
while (fscanf(temp, "%s\n", line) == 1) {
|
|
(* word_count)++;
|
|
}
|
|
|
|
rewind(temp);
|
|
|
|
if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for stop words\n");
|
|
return NULL;
|
|
}
|
|
|
|
while (fscanf(temp, "%s", line) == 1) {
|
|
words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
|
|
index++;
|
|
}
|
|
|
|
free(line);
|
|
|
|
if (fclose(temp) == EOF) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return words;
|
|
}
|
|
|
|
/*
 * Randomly partition dataset into train and test sets.
 *
 * percent is the share (0-100) held out for testing: the train set
 * gets (100 - percent)% of the instances, the test set the remainder.
 * Each selected instance is deep-copied (text duplicated, class,
 * prediction, and text_vector copied) into the caller-provided
 * train/test structs, whose instance arrays are allocated here.
 *
 * Returns 0 on success, 1 on allocation failure.
 *
 * NOTE(review): if percent is 0, test_indexes becomes a zero-length
 * VLA, which is undefined behavior — confirm callers pass 1-99.
 * NOTE(review): arc4random_uniform is BSD/macOS-specific.
 */
int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    /* VLA recording which dataset indices were drawn for the test set. */
    int test_indexes[test_instances];

    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    /* Draw test_instances distinct random indices by rejection sampling:
     * redraw until the candidate is not among those already chosen.
     * (O(n^2) overall, acceptable for modest dataset sizes.) */
    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;

        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);
            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }
            if (!is_found) {
                new_index = 1;
            }
        }

        test_indexes[i] = random_index;
        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }
        /* Deep copy the selected instance into the test set. */
        test->instances[i]->text
            = strndup(dataset->instances[random_index]->text,
                      strlen(dataset->instances[random_index]->text));

        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        /* sizeof on the member works because text_vector is an inline
         * array in message — assumes that layout; confirm in process.h. */
        memcpy(test->instances[i]->text_vector,
               dataset->instances[random_index]->text_vector,
               sizeof(dataset->instances[random_index]->text_vector));
    }

    test->count = test_instances;

    /* Every instance not drawn for the test set goes to the train set. */
    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;

        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }

        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }
            train->instances[train_index]->text
                = strndup(dataset->instances[i]->text,
                          strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector,
                   dataset->instances[i]->text_vector,
                   sizeof(dataset->instances[i]->text_vector));

            train_index++;
        }
    }

    train->count = train_instances;

    return 0;
}
|
|
|
|
/*
 * Build a dictionary of terms across all messages in dataset (skipping
 * stop_words), sort it by descending frequency, and publish the top
 * `size` terms and their document counts into dataset->vector_terms /
 * dataset->vector_document_counts. Unused slots get "" and 0.
 *
 * Returns 0 on success, 1 on allocation failure.
 */
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;

    /* 0 out the vector counts for dataset */
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }

    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }

    fprintf(stdout, "%d instances considered for vector\n", dataset->count);

    for (int i = 0; i < dataset->count; i++) {
        char * token, * string;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        /* strsep() advances 'string' to NULL; keep the original pointer
         * so the duplicate can be freed (the original code freed NULL
         * and leaked every duplicate). */
        char * string_start = string;
        int * terms_per_document;

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }

        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            free(string_start);
            return 1;
        }

        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;

            if (strcmp(token, "") != 0) {
                /* Known word: bump its corpus and per-document counts. */
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }

                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }

                /* New, non-stop word: grow the arrays if needed and add it. */
                if (!word_found && !is_stop_word) {
                    word_count++;

                    if (word_count > allocated) {
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary,
                                sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            free(terms_per_document);
                            free(string_start);
                            return 1;
                        }

                        dictionary = more_words;
                        allocated = 2 * word_count;

                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document,
                                sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            free(string_start);
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                    }

                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        free(terms_per_document);
                        free(string_start);
                        return 1;
                    }
                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        free(terms_per_document);
                        free(string_start);
                        return 1;
                    }

                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }

        /* A term counts toward document frequency once per message. */
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }

        /* The original leaked both per-document buffers here. */
        free(terms_per_document);
        free(string_start);
    }

    /* Sort by descending corpus frequency (see compare_strings). */
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);

    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }

    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }

    fprintf(stdout, "Found %d different words\n", word_count);

    /* Free the entries themselves, not just the pointer array
     * (the original leaked every dictionary_word and its string). */
    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);

    return 0;
}
|
|
|
|
int compare_strings(const void * a, const void * b) {
|
|
const dictionary_word * word_a = *(dictionary_word **) a;
|
|
const dictionary_word * word_b = *(dictionary_word **) b;
|
|
|
|
return (word_b->count - word_a->count);
|
|
}
|
|
|
|
/*
 * Fill each instance's text_vector with TF-IDF weights for the given
 * vector_terms: raw term frequency times log(N / document_count),
 * with a document count floor of 1.
 *
 * Processes instances in parallel, one per dispatch_apply iteration.
 * Always returns 0; a message whose text cannot be duplicated is
 * skipped with a diagnostic.
 */
int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        /* strsep() advances 'string'; keep the original pointer so the
         * duplicate can be freed (the original never freed it). */
        char * string_start = string;

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return;  /* original fell through and passed NULL to strsep */
        }

        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }

        /* First accumulate raw term frequencies. */
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }

        /* Then convert each frequency to a TF-IDF weight. */
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;
            /* BUG FIX: the original read vector_document_counts[i]
             * (the instance index) instead of [index] (the term index),
             * computing IDF from the wrong count and reading out of
             * bounds once i >= size. */
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }
            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;

            dataset->instances[i]->text_vector[index] = tfidf;
        }

        free(string_start);
    });

    return 0;
}
|