508 lines
13 KiB
C
508 lines
13 KiB
C
/*
|
|
Author: James Griffin-Allwood
|
|
Date: March 4 2014
|
|
|
|
Description: Implementations for reading in tagged messages, writing ARFF/CSV output, and building term-vector representations
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stddef.h>
|
|
#include <math.h>
|
|
#include <Block.h>
|
|
#include <dispatch/dispatch.h>
|
|
#include "process.h"
|
|
|
|
/*
 * One entry in the term dictionary built by create_vector_represntation():
 * a term together with its corpus-wide frequency statistics.
 */
typedef struct dictionary_word {

    char * word;           /* heap-allocated copy of the term */

    int count;             /* total occurrences across all messages */

    int document_count;    /* number of messages the term appears in */

} dictionary_word;
|
|
|
|
/*
 * Release a data struct produced by read_data() (or populated by
 * train_test_split()), including each message instance and its
 * duplicated text.
 *
 * Returns 0 on success, 1 if to_free is NULL.
 *
 * NOTE(review): the strings inside vector_terms are not freed here
 * because the vector size is not stored on the struct — confirm
 * whether callers care about that leak.
 */
int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }

    if (to_free->instances != NULL) {
        /* Each instance and its text are individually heap-allocated;
         * the original code freed only the pointer array and leaked them. */
        for (int i = 0; i < to_free->count; i++) {
            if (to_free->instances[i] != NULL) {
                free(to_free->instances[i]->text);
                free(to_free->instances[i]);
            }
        }
        free(to_free->instances);
    }

    /* free(NULL) is a no-op, so no guard is needed. */
    free(to_free->vector_terms);

    free(to_free);

    return 0;
}
|
|
|
|
struct data * read_data(char * file) {
|
|
FILE * temp;
|
|
data * data_buffer;
|
|
char * line;
|
|
char * message_buffer;
|
|
int class = -1;
|
|
|
|
char * pro = "<Pros>";
|
|
char * pro_close = "</Pros>";
|
|
char * con = "<Cons>";
|
|
char * con_close = "</Cons>";
|
|
char * unknown = "<Labs>";
|
|
char * unknown_close = "</Labs>";
|
|
|
|
int lines = 0;
|
|
int max_line_size = 0;
|
|
int line_count = 0;
|
|
char c;
|
|
|
|
if ((temp = fopen(file, "r")) == NULL) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
while ((c = fgetc(temp)) != EOF) {
|
|
line_count++;
|
|
if (c == '\n') {
|
|
++lines;
|
|
if (line_count > max_line_size)
|
|
max_line_size = line_count;
|
|
line_count = 0;
|
|
}
|
|
}
|
|
|
|
rewind(temp);
|
|
|
|
if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((line = malloc(sizeof(char) * max_line_size)) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for messages to be read\n");
|
|
}
|
|
|
|
data_buffer->count = lines;
|
|
|
|
for (int i = 0; i < lines; i++) {
|
|
if (fgets(line, max_line_size, temp) != NULL) {
|
|
if (strstr(line, pro) != NULL) {
|
|
char * start = strstr(line, pro) + (sizeof(char) * strlen(pro));
|
|
char * end = strstr(line, pro_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = PRO;
|
|
} else if (strstr(line, con) != NULL) {
|
|
char * start = strstr(line, con) + (sizeof(char) * strlen(con));
|
|
char * end = strstr(line, con_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = CON;
|
|
} else if (strstr(line, unknown) != NULL) {
|
|
char * start = strstr(line, unknown) + (sizeof(char) * strlen(unknown));
|
|
char * end = strstr(line, unknown_close);
|
|
message_buffer = strndup(start, end - start);
|
|
class = UNKNOWN;
|
|
}else {
|
|
message_buffer = "";
|
|
}
|
|
|
|
data_buffer->instances[i] = calloc(1, sizeof(message));
|
|
|
|
data_buffer->instances[i]->text = strndup(message_buffer, strlen(message_buffer));
|
|
data_buffer->instances[i]->class = class;
|
|
}
|
|
}
|
|
|
|
free(line);
|
|
|
|
if (fclose(temp) == EOF) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return data_buffer;
|
|
}
|
|
|
|
/*
 * Write print_data to out_file in Weka ARFF format: one quoted message
 * string per line followed by its class (0/1), or '?' for UNKNOWN.
 *
 * Returns 0 on success, 1 on any open, allocation, or close failure.
 */
int weka_output(data * print_data, char * out_file) {
    FILE * out;

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");

    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one
         * string per instance and never checked for NULL. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (escaped == NULL) {
            fclose(out);
            return 1;
        }

        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n", escaped);
        } else {
            fprintf(out, "'%s',%d\n", escaped, print_data->instances[i]->class);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}
|
|
|
|
/*
 * Write print_data to out_file as CSV: predicted label ("Pro"/"Con",
 * or '?' when the prediction is UNKNOWN) followed by the quoted
 * message text. The label is chosen by comparing the two prediction
 * probabilities.
 *
 * Returns 0 on success, 1 on any open, allocation, or close failure.
 */
int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    for (int i = 0; i < print_data->count; i++) {
        /* escape_single_quote() allocates; the original leaked one
         * string per instance and never checked for NULL. */
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (escaped == NULL) {
            fclose(out);
            return 1;
        }

        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n", escaped);
        } else if (print_data->instances[i]->prediction_probability[0]
                   > print_data->instances[i]->prediction_probability[1]) {
            fprintf(out, "%s,'%s'\n", con, escaped);
        } else {
            fprintf(out, "%s,'%s'\n", pro, escaped);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}
|
|
|
|
/*
 * Return a newly allocated copy of str with every single quote (')
 * replaced by the two-character sequence \' — suitable for embedding
 * in a single-quoted field.
 *
 * Returns NULL if memory cannot be allocated. The caller owns the
 * returned buffer.
 */
char * escape_single_quote(const char *str) {
    size_t quotes = 0;

    /* Count the quotes so the result can be sized in one allocation:
     * each quote grows the string by exactly one byte. */
    for (const char *scan = strchr(str, '\''); scan != NULL;
         scan = strchr(scan + 1, '\'')) {
        quotes++;
    }

    char *result = malloc(strlen(str) + quotes + 1);
    if (result == NULL) {
        return NULL;
    }

    /* Single pass: copy characters, inserting a backslash before each quote. */
    char *out = result;
    for (const char *in = str; *in != '\0'; in++) {
        if (*in == '\'') {
            *out++ = '\\';
        }
        *out++ = *in;
    }
    *out = '\0';

    return result;
}
|
|
|
|
char ** load_stop_words(char * filename, int * word_count) {
|
|
FILE * temp;
|
|
char * line;
|
|
int index = 0;
|
|
char ** words;
|
|
|
|
* word_count = 0;
|
|
|
|
if ((temp = fopen(filename, "r")) == NULL) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
|
|
return NULL;
|
|
}
|
|
while (fscanf(temp, "%s\n", line) == 1) {
|
|
(* word_count)++;
|
|
}
|
|
|
|
rewind(temp);
|
|
|
|
if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
|
|
fprintf(stderr, "Unable to allocate memory for stop words\n");
|
|
return NULL;
|
|
}
|
|
|
|
while (fscanf(temp, "%s", line) == 1) {
|
|
words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
|
|
index++;
|
|
}
|
|
|
|
free(line);
|
|
|
|
if (fclose(temp) == EOF) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return words;
|
|
}
|
|
|
|
/*
 * Randomly partition dataset into train and test sets.
 *
 * percent is the share (0-100) held out for testing: the train set
 * gets (100 - percent)% of the instances, the test set the remainder.
 * Each selected instance is deep-copied (text duplicated, class,
 * prediction, and text_vector copied) into the caller-provided
 * train/test structs, whose instance arrays are allocated here.
 *
 * Returns 0 on success, 1 on allocation failure.
 *
 * NOTE(review): if percent is 0, test_indexes becomes a zero-length
 * VLA, which is undefined behavior — confirm callers pass 1-99.
 * NOTE(review): arc4random_uniform is BSD/macOS-specific.
 */
int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    /* VLA recording which dataset indices were drawn for the test set. */
    int test_indexes[test_instances];

    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    /* Draw test_instances distinct random indices by rejection sampling:
     * redraw until the candidate is not among those already chosen.
     * (O(n^2) overall, acceptable for modest dataset sizes.) */
    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;

        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);
            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }
            if (!is_found) {
                new_index = 1;
            }
        }

        test_indexes[i] = random_index;
        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }
        /* Deep copy the selected instance into the test set. */
        test->instances[i]->text
            = strndup(dataset->instances[random_index]->text,
                      strlen(dataset->instances[random_index]->text));

        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        /* sizeof on the member works because text_vector is an inline
         * array in message — assumes that layout; confirm in process.h. */
        memcpy(test->instances[i]->text_vector,
               dataset->instances[random_index]->text_vector,
               sizeof(dataset->instances[random_index]->text_vector));
    }

    test->count = test_instances;

    /* Every instance not drawn for the test set goes to the train set. */
    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;

        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }

        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }
            train->instances[train_index]->text
                = strndup(dataset->instances[i]->text,
                          strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector,
                   dataset->instances[i]->text_vector,
                   sizeof(dataset->instances[i]->text_vector));

            train_index++;
        }
    }

    train->count = train_instances;

    return 0;
}
|
|
|
|
/*
 * Build a dictionary of terms across all messages in dataset (skipping
 * stop_words), sort it by descending frequency, and publish the top
 * `size` terms and their document counts into dataset->vector_terms /
 * dataset->vector_document_counts. Unused slots get "" and 0.
 *
 * Returns 0 on success, 1 on allocation failure.
 */
int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;

    /* 0 out the vector counts for dataset */
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }

    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }

    fprintf(stdout, "%d instances considered for vector\n", dataset->count);

    for (int i = 0; i < dataset->count; i++) {
        char * token, * string;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        /* strsep() advances 'string' to NULL; keep the original pointer
         * so the duplicate can be freed (the original code freed NULL
         * and leaked every duplicate). */
        char * string_start = string;
        int * terms_per_document;

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }

        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            free(string_start);
            return 1;
        }

        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;

            if (strcmp(token, "") != 0) {
                /* Known word: bump its corpus and per-document counts. */
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }

                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }

                /* New, non-stop word: grow the arrays if needed and add it. */
                if (!word_found && !is_stop_word) {
                    word_count++;

                    if (word_count > allocated) {
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary,
                                sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            free(terms_per_document);
                            free(string_start);
                            return 1;
                        }

                        dictionary = more_words;
                        allocated = 2 * word_count;

                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document,
                                sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            free(string_start);
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                    }

                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        free(terms_per_document);
                        free(string_start);
                        return 1;
                    }
                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        free(terms_per_document);
                        free(string_start);
                        return 1;
                    }

                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }

        /* A term counts toward document frequency once per message. */
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }

        /* The original leaked both per-document buffers here. */
        free(terms_per_document);
        free(string_start);
    }

    /* Sort by descending corpus frequency (see compare_strings). */
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);

    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }

    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }

    fprintf(stdout, "Found %d different words\n", word_count);

    /* Free the entries themselves, not just the pointer array
     * (the original leaked every dictionary_word and its string). */
    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);

    return 0;
}
|
|
|
|
int compare_strings(const void * a, const void * b) {
|
|
const dictionary_word * word_a = *(dictionary_word **) a;
|
|
const dictionary_word * word_b = *(dictionary_word **) b;
|
|
|
|
return (word_b->count - word_a->count);
|
|
}
|
|
|
|
/*
 * Fill each instance's text_vector with TF-IDF weights for the given
 * vector_terms: raw term frequency times log(N / document_count),
 * with a document count floor of 1.
 *
 * Processes instances in parallel, one per dispatch_apply iteration.
 * Always returns 0; a message whose text cannot be duplicated is
 * skipped with a diagnostic.
 */
int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string;
        string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
        /* strsep() advances 'string'; keep the original pointer so the
         * duplicate can be freed (the original never freed it). */
        char * string_start = string;

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return;  /* original fell through and passed NULL to strsep */
        }

        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }

        /* First accumulate raw term frequencies. */
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }

        /* Then convert each frequency to a TF-IDF weight. */
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;
            /* BUG FIX: the original read vector_document_counts[i]
             * (the instance index) instead of [index] (the term index),
             * computing IDF from the wrong count and reading out of
             * bounds once i >= size. */
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }
            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;

            dataset->instances[i]->text_vector[index] = tfidf;
        }

        free(string_start);
    });

    return 0;
}
|