initial github commit

Implementation of backprop in C using Grand Central Dispatch and Blocks
This commit is contained in:
James Griffin
2014-08-06 15:12:09 -03:00
commit 6f23634e32
6 changed files with 1926 additions and 0 deletions

507
process.c Normal file
View File

@@ -0,0 +1,507 @@
/*
Author: James Griffin-Allwood
Date: March 4 2014
Description: Implementations of reading in messages formatted with <Pros>/<Cons>/<Labs>
tags, building tf-idf vector representations, and writing Weka ARFF / CSV output
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <Block.h>
#include <dispatch/dispatch.h>
#include "process.h"
/* One entry of the term dictionary built by create_vector_represntation():
 * a term plus its corpus-wide statistics. */
typedef struct dictionary_word {
    char * word;          /* heap-allocated, NUL-terminated term text */
    int count;            /* total occurrences of the term across all messages */
    int document_count;   /* number of messages containing the term at least once */
} dictionary_word;
/*
 * Release a data set and everything it owns: each message struct and its
 * strdup'd text, the instances array, the vector_terms array, and the
 * data struct itself.
 *
 * Returns 0 on success, 1 when to_free is NULL.
 *
 * NOTE(review): vector_terms entries are heap strings too, but this struct
 * does not record how many there are, so only the array itself is freed
 * here — confirm against process.h whether a term count is available.
 */
int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }
    if (to_free->instances != NULL) {
        /* The original freed only the pointer array, leaking every
         * message and its text. */
        for (int i = 0; i < to_free->count; i++) {
            if (to_free->instances[i] != NULL) {
                free(to_free->instances[i]->text);
                free(to_free->instances[i]);
            }
        }
        free(to_free->instances);
    }
    free(to_free->vector_terms); /* free(NULL) is a no-op; no guard needed */
    free(to_free);
    return 0;
}
/*
 * Read a file of one-message-per-line records and build a data set.
 * A line tagged <Pros>...</Pros> gets class PRO, <Cons>...</Cons> gets CON,
 * and <Labs>...</Labs> gets UNKNOWN; an untagged line stores empty text
 * (and keeps the previous line's class, as the original code did).
 *
 * Exits the process on I/O or allocation failure, matching the rest of
 * this module. The caller owns the returned data; release it with
 * free_data().
 */
struct data * read_data(char * file) {
    FILE * temp;
    data * data_buffer;
    char * line;
    char * message_buffer;
    int class = -1;
    char * pro = "<Pros>";
    char * pro_close = "</Pros>";
    char * con = "<Cons>";
    char * con_close = "</Cons>";
    char * unknown = "<Labs>";
    char * unknown_close = "</Labs>";
    int lines = 0;
    int max_line_size = 0;
    int line_count = 0;
    int c; /* int, not char: fgetc() returns EOF, which is outside char's range */
    if ((temp = fopen(file, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }
    /* First pass: count lines and measure the longest one (incl. '\n'). */
    while ((c = fgetc(temp)) != EOF) {
        line_count++;
        if (c == '\n') {
            ++lines;
            if (line_count > max_line_size)
                max_line_size = line_count;
            line_count = 0;
        }
    }
    rewind(temp);
    if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }
    if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }
    /* +2: room for the newline and the terminating NUL, so fgets() never
     * truncates the longest line (truncation would desynchronize the
     * one-fgets-per-line loop below). */
    if ((line = malloc((size_t)max_line_size + 2)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be read\n");
        exit(EXIT_FAILURE);
    }
    data_buffer->count = lines;
    for (int i = 0; i < lines; i++) {
        if (fgets(line, max_line_size + 2, temp) != NULL) {
            message_buffer = NULL;
            if (strstr(line, pro) != NULL) {
                char * start = strstr(line, pro) + strlen(pro);
                char * end = strstr(start, pro_close);
                if (end == NULL)   /* unterminated tag: take the rest of the line */
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = PRO;
            } else if (strstr(line, con) != NULL) {
                char * start = strstr(line, con) + strlen(con);
                char * end = strstr(start, con_close);
                if (end == NULL)
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = CON;
            } else if (strstr(line, unknown) != NULL) {
                char * start = strstr(line, unknown) + strlen(unknown);
                char * end = strstr(start, unknown_close);
                if (end == NULL)
                    end = start + strlen(start);
                message_buffer = strndup(start, (size_t)(end - start));
                class = UNKNOWN;
            }
            if (message_buffer == NULL) {
                message_buffer = strdup("");
            }
            if ((data_buffer->instances[i] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                exit(EXIT_FAILURE);
            }
            /* Hand ownership of the extracted text straight to the message.
             * The original strndup'd it a second time and leaked the first
             * copy on every tagged line. */
            data_buffer->instances[i]->text = message_buffer;
            data_buffer->instances[i]->class = class;
        }
    }
    free(line);
    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }
    return data_buffer;
}
/*
 * Write the data set as a Weka ARFF file: a string attribute for the
 * message text (single quotes escaped) and a {0,1} class attribute,
 * with '?' for UNKNOWN instances.
 *
 * Returns 0 on success, 1 on any file or allocation error.
 */
int weka_output(data * print_data, char * out_file) {
    FILE * out;
    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }
    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");
    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one copy per row. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);
        if (escaped == NULL) {
            fclose(out);
            return 1;
        }
        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n", escaped);
        } else {
            fprintf(out, "'%s',%d\n", escaped, print_data->instances[i]->class);
        }
        free(escaped);
    }
    if (fclose(out) == EOF) {
        return 1;
    }
    return 0;
}
/*
 * Write predictions as CSV rows of "label,'text'": "Pro"/"Con" chosen by
 * the larger prediction probability, or "?" when the prediction is UNKNOWN.
 * Single quotes in the text are escaped.
 *
 * Returns 0 on success, 1 on any file or allocation error.
 */
int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";
    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }
    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one copy per row. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);
        if (escaped == NULL) {
            fclose(out);
            return 1;
        }
        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n", escaped);
        } else {
            /* probability[0] is the CON score, probability[1] the PRO score */
            if (print_data->instances[i]->prediction_probability[0]
                > print_data->instances[i]->prediction_probability[1]) {
                fprintf(out, "%s,'%s'\n", con, escaped);
            } else {
                fprintf(out, "%s,'%s'\n", pro, escaped);
            }
        }
        free(escaped);
    }
    if (fclose(out) == EOF) {
        return 1;
    }
    return 0;
}
/*
 * Return a newly malloc'd copy of str with every single quote (')
 * replaced by a backslash-quote pair (\'). The caller owns the result.
 * Returns NULL on allocation failure.
 */
char * escape_single_quote(const char *str) {
    /* Pass 1: count quotes so the output buffer can be sized exactly. */
    size_t quotes = 0;
    for (const char *scan = str; (scan = strchr(scan, '\'')) != NULL; scan++) {
        quotes++;
    }
    /* Each quote grows by one character (the inserted backslash). */
    size_t out_len = strlen(str) + quotes;
    char *escaped = malloc(out_len + 1);
    if (escaped == NULL) {
        return NULL;
    }
    /* Pass 2: copy character by character, inserting '\\' before each quote. */
    char *dst = escaped;
    for (const char *src = str; *src != '\0'; src++) {
        if (*src == '\'') {
            *dst++ = '\\';
        }
        *dst++ = *src;
    }
    *dst = '\0';
    return escaped;
}
/*
 * Load whitespace-separated stop words from filename. On return,
 * *word_count holds the number of words and the returned array holds one
 * strndup'd entry per word (caller frees each entry and the array).
 *
 * Returns NULL on allocation failure; exits on open failure, matching
 * read_data(). MAX_TERM_LENGTH (from process.h) bounds each word.
 */
char ** load_stop_words(char * filename, int * word_count) {
    FILE * temp;
    char * line;
    int index = 0;
    char ** words;
    char format[16];
    * word_count = 0;
    if ((temp = fopen(filename, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }
    if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
        fclose(temp);
        return NULL;
    }
    /* Build a width-limited "%Ns" conversion so a long token cannot
     * overflow `line` — the original's bare "%s" was a buffer overflow. */
    snprintf(format, sizeof(format), "%%%ds", MAX_TERM_LENGTH - 1);
    /* Pass 1: count words. */
    while (fscanf(temp, format, line) == 1) {
        (* word_count)++;
    }
    rewind(temp);
    if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words\n");
        free(line);
        fclose(temp);
        return NULL;
    }
    /* Pass 2: read the words; never write past the count from pass 1. */
    while (index < * word_count && fscanf(temp, format, line) == 1) {
        words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
        index++;
    }
    free(line);
    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }
    return words;
}
/*
 * Randomly split `dataset` into `test` (percent% of instances) and
 * `train` (the remainder). Test instances are chosen without replacement
 * by rejection sampling over arc4random_uniform(); each selected message
 * is deep-copied (text, class, prediction, text_vector).
 *
 * Returns 0 on success, 1 on allocation failure.
 *
 * NOTE(review): prediction_probability is not copied — presumably it is
 * recomputed after the split; confirm against the callers.
 */
int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    /* percent is the TEST share; train gets (100 - percent)%. */
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    /* VLA of the randomly chosen test indices (used to exclude them from
     * the train pass below). */
    int test_indexes[test_instances];
    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }
    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }
    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;
        /* Rejection-sample until we draw an index not already taken. */
        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);
            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }
            if (!is_found) {
                new_index = 1;
            }
        }
        test_indexes[i] = random_index;
        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }
        /* Deep copy of the selected message into the test set. */
        test->instances[i]->text
            = strndup(dataset->instances[random_index]->text,
                      strlen(dataset->instances[random_index]->text));
        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        /* text_vector is a fixed-size array member, so sizeof gives the
         * full vector size here. */
        memcpy(test->instances[i]->text_vector,
               dataset->instances[random_index]->text_vector,
               sizeof(dataset->instances[random_index]->text_vector));
    }
    test->count = test_instances;
    /* Everything not selected for test goes to train, in original order. */
    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;
        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }
        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }
            train->instances[train_index]->text
                = strndup(dataset->instances[i]->text,
                          strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector,
                   dataset->instances[i]->text_vector,
                   sizeof(dataset->instances[i]->text_vector));
            train_index++;
        }
    }
    train->count = train_instances;
    return 0;
}
/*
 * Build the term dictionary for a data set and publish the top `size`
 * terms (sorted by total count, descending, via compare_strings) into
 * dataset->vector_terms with their document frequencies in
 * dataset->vector_document_counts. Unused slots get "" / 0.
 *
 * Tokens are split on ".,?! " and compared case-insensitively; words in
 * stop_words (may be NULL) are skipped. Returns 0 on success, 1 on
 * allocation failure.
 */
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;
    /* 0 out the vector counts for dataset */
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }
    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }
    fprintf(stdout,"%d instances considered for vector\n", dataset->count);
    for (int i = 0; i < dataset->count; i++) {
        char * token, * string, * string_base;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }
        /* strsep() advances `string` to NULL; keep the base pointer so the
         * buffer can actually be freed (the original freed NULL → leak). */
        string_base = string;
        int * terms_per_document;
        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            free(string_base);
            return 1;
        }
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;
            if (strcmp(token, "") != 0) {
                /* Known word: bump its global and per-document counts. */
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }
                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }
                if (!word_found && !is_stop_word) {
                    word_count++;
                    if (word_count > allocated) {
                        /* Grow both parallel arrays to 2 * word_count. */
                        int old_allocated = allocated;
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary,
                                sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            free(terms_per_document);
                            free(string_base);
                            return 1;
                        }
                        dictionary = more_words;
                        allocated = 2 * word_count;
                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document,
                                sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            free(terms_per_document);
                            free(string_base);
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                        /* realloc() does not zero the new tail; without this
                         * the document-count pass below reads garbage. */
                        memset(terms_per_document + old_allocated, 0,
                               sizeof(int) * (allocated - old_allocated));
                    }
                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }
                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }
                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }
        /* Any term seen in this message counts once toward its document
         * frequency. */
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }
        free(terms_per_document); /* original leaked this on every instance */
        free(string_base);
    }
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);
    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }
    /* Publish the `size` most frequent terms; pad with "" when fewer exist. */
    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }
    fprintf(stdout, "Found %d different words\n", word_count);
    /* Free the entries, not just the pointer array (original leaked both
     * the dictionary_word structs and their word strings). */
    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);
    return 0;
}
int compare_strings(const void * a, const void * b) {
const dictionary_word * word_a = *(dictionary_word **) a;
const dictionary_word * word_b = *(dictionary_word **) b;
return (word_b->count - word_a->count);
}
/*
 * Fill each message's text_vector with tf-idf weights over vector_terms:
 * first raw term frequencies (case-insensitive match on ".,?! "-separated
 * tokens), then each entry scaled by log(N / document_count).
 *
 * Always returns 0. Each message is processed independently, so the work
 * is fanned out across threads with Grand Central Dispatch.
 */
int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string, * string_base;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return; /* original kept going and dereferenced NULL */
        }
        /* strsep() advances `string` to NULL; remember the allocation so it
         * can be freed (the original freed NULL → leak per message). */
        string_base = string;
        /* Pass 1: zero, then accumulate raw term frequencies. */
        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }
        /* Pass 2: convert counts to tf-idf. */
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;
            /* Index by term (index), not by message (i): the original read
             * the wrong document count for every term. */
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }
            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;
            dataset->instances[i]->text_vector[index] = tfidf;
        }
        free(string_base);
    });
    return 0;
}