/*
 * Author: James Griffin-Allwood
 * Date: March 4, 2014
 * Description: Implementations for reading in formatted messages, writing
 *              Weka/CSV output, and building tf-idf vector representations.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <stddef.h>
#include <math.h>
#include <dispatch/dispatch.h>

#include "process.h"

typedef struct dictionary_word {
    char * word;
    int count;
    int document_count;
} dictionary_word;

int free_data(data * to_free) {
    if (to_free == NULL) {
        return 1;
    }

    if (to_free->instances != NULL) {
        free(to_free->instances);
    }

    if (to_free->vector_terms != NULL) {
        free(to_free->vector_terms);
    }

    free(to_free);

    return 0;
}

struct data * read_data(char * file) {
    FILE * temp;
    data * data_buffer;
    char * line;
    char * message_buffer;
    int class = -1;
    // Markers that delimit a message and name its class. The exact tag
    // strings are an assumption; adjust them to match the input files.
    char * pro = "<pro>";
    char * pro_close = "</pro>";
    char * con = "<con>";
    char * con_close = "</con>";
    char * unknown = "<unknown>";
    char * unknown_close = "</unknown>";
    int lines = 0;
    int max_line_size = 0;
    int line_count = 0;
    int c;

    if ((temp = fopen(file, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }

    // First pass: count the lines and find the longest one.
    while ((c = fgetc(temp)) != EOF) {
        line_count++;
        if (c == '\n') {
            ++lines;
            if (line_count > max_line_size) {
                max_line_size = line_count;
            }
            line_count = 0;
        }
    }
    rewind(temp);

    if ((data_buffer = calloc(1, sizeof(data))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }

    if ((data_buffer->instances = calloc(lines, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        exit(EXIT_FAILURE);
    }

    if ((line = malloc(sizeof(char) * (max_line_size + 1))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be read\n");
        exit(EXIT_FAILURE);
    }

    data_buffer->count = lines;

    // Second pass: pull the message text out of its tags and record the class.
    for (int i = 0; i < lines; i++) {
        if (fgets(line, max_line_size + 1, temp) != NULL) {
            if (strstr(line, pro) != NULL) {
                char * start = strstr(line, pro) + strlen(pro);
                char * end = strstr(line, pro_close);
                message_buffer = strndup(start, end - start);
                class = PRO;
            } else if (strstr(line, con) != NULL) {
                char * start = strstr(line, con) + strlen(con);
                char * end = strstr(line, con_close);
                message_buffer = strndup(start, end - start);
                class = CON;
            } else if (strstr(line, unknown) != NULL) {
                char * start = strstr(line, unknown) + strlen(unknown);
                char * end = strstr(line, unknown_close);
                message_buffer = strndup(start, end - start);
                class = UNKNOWN;
            } else {
                message_buffer = strdup("");
            }

            data_buffer->instances[i] = calloc(1, sizeof(message));
            data_buffer->instances[i]->text = message_buffer;
            data_buffer->instances[i]->class = class;
        }
    }

    free(line);

    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }

    return data_buffer;
}
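/*
 * A minimal sketch of the input read_data() expects, assuming the tag
 * markers defined above (the marker strings themselves are an assumption):
 * one message per line, wrapped in an opening and closing tag that names
 * its class.
 *
 *     <pro>The battery lasts all day</pro>
 *     <con>The screen scratches easily</con>
 *     <unknown>Shipping took about a week</unknown>
 *
 * Lines that match none of the markers are stored with empty text.
 */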
int weka_output(data * print_data, char * out_file) {
    FILE * out;

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    fprintf(out, "@relation 'Pro/Con Message Classification'\n");
    fprintf(out, "@attribute 'message' string\n@attribute 'class' {0,1}\n\n@data\n\n");

    for (int i = 0; i < print_data->count; i++) {
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (print_data->instances[i]->class == UNKNOWN) {
            fprintf(out, "'%s',?\n", escaped);
        } else {
            fprintf(out, "'%s',%d\n", escaped, print_data->instances[i]->class);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}

int csv_output(data * print_data, char * out_file) {
    FILE * out;
    char * pro = "Pro";
    char * con = "Con";

    if ((out = fopen(out_file, "w")) == NULL) {
        return 1;
    }

    for (int i = 0; i < print_data->count; i++) {
        char * escaped = escape_single_quote(print_data->instances[i]->text);

        if (print_data->instances[i]->prediction == UNKNOWN) {
            fprintf(out, "?,'%s'\n", escaped);
        } else if (print_data->instances[i]->prediction_probability[0] > print_data->instances[i]->prediction_probability[1]) {
            fprintf(out, "%s,'%s'\n", con, escaped);
        } else {
            fprintf(out, "%s,'%s'\n", pro, escaped);
        }

        free(escaped);
    }

    if (fclose(out) == EOF) {
        return 1;
    }

    return 0;
}

char * escape_single_quote(const char * str) {
    char * ret, * r;
    const char * p, * q;
    size_t oldlen = strlen("'");
    size_t count, retlen, newlen = strlen("\\'");

    // Count the single quotes so the escaped copy can be sized exactly.
    for (count = 0, p = str; (q = strstr(p, "'")) != NULL; p = q + oldlen) {
        count++;
    }

    retlen = p - str + strlen(p) + count * (newlen - oldlen);

    if ((ret = malloc(retlen + 1)) == NULL) {
        return NULL;
    }

    // Copy the string, replacing each ' with \' along the way.
    for (r = ret, p = str; (q = strstr(p, "'")) != NULL; p = q + oldlen) {
        ptrdiff_t l = q - p;
        memcpy(r, p, l);
        r += l;
        memcpy(r, "\\'", newlen);
        r += newlen;
    }

    strcpy(r, p);

    return ret;
}
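/*
 * Usage sketch for escape_single_quote(): the returned buffer is heap
 * allocated and owned by the caller, who is expected to free it, as
 * weka_output() and csv_output() above do for each instance.
 *
 *     char * escaped = escape_single_quote("it's great");
 *     if (escaped != NULL) {
 *         printf("%s\n", escaped);   // prints: it\'s great
 *         free(escaped);
 *     }
 */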
char ** load_stop_words(char * filename, int * word_count) {
    FILE * temp;
    char * line;
    int index = 0;
    char ** words;

    * word_count = 0;

    if ((temp = fopen(filename, "r")) == NULL) {
        exit(EXIT_FAILURE);
    }

    if ((line = malloc(sizeof(char) * MAX_TERM_LENGTH)) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words to be read\n");
        return NULL;
    }

    // First pass: count the words so the array can be allocated in one shot.
    while (fscanf(temp, "%s\n", line) == 1) {
        (* word_count)++;
    }
    rewind(temp);

    if ((words = calloc(* word_count, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for stop words\n");
        return NULL;
    }

    // Second pass: copy each word into its own buffer.
    while (fscanf(temp, "%s", line) == 1) {
        words[index] = strndup(line, strnlen(line, MAX_TERM_LENGTH));
        index++;
    }

    free(line);

    if (fclose(temp) == EOF) {
        exit(EXIT_FAILURE);
    }

    return words;
}

int train_test_split(const data * dataset, const int percent, data * train, data * test) {
    int total_instances = dataset->count;
    double train_percent = (100 - percent) / 100.0;
    int train_instances = (int)(total_instances * train_percent);
    int test_instances = total_instances - train_instances;
    int train_index = 0;
    int test_indexes[test_instances];

    if ((train->instances = calloc(train_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    if ((test->instances = calloc(test_instances, sizeof(message *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
        return 1;
    }

    // Draw test_instances distinct random indexes for the test set.
    for (int i = 0; i < test_instances; i++) {
        int random_index = -1;
        int new_index = 0;

        while (!new_index) {
            int is_found = 0;
            random_index = arc4random_uniform(total_instances);

            for (int j = 0; j < i; j++) {
                if (test_indexes[j] == random_index) {
                    is_found = 1;
                }
            }

            if (!is_found) {
                new_index = 1;
            }
        }

        test_indexes[i] = random_index;

        if ((test->instances[i] = calloc(1, sizeof(message))) == NULL) {
            fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
            return 1;
        }

        test->instances[i]->text = strndup(dataset->instances[random_index]->text, strlen(dataset->instances[random_index]->text));
        test->instances[i]->class = dataset->instances[random_index]->class;
        test->instances[i]->prediction = dataset->instances[random_index]->prediction;
        memcpy(test->instances[i]->text_vector, dataset->instances[random_index]->text_vector, sizeof(dataset->instances[random_index]->text_vector));
    }
    test->count = test_instances;

    // Every instance not chosen for the test set goes to the training set.
    for (int i = 0; i < total_instances; i++) {
        int is_test = 0;

        for (int j = 0; j < test_instances; j++) {
            if (i == test_indexes[j]) {
                is_test = 1;
            }
        }

        if (!is_test) {
            if ((train->instances[train_index] = calloc(1, sizeof(message))) == NULL) {
                fprintf(stderr, "Unable to allocate memory for messages to be stored\n");
                return 1;
            }

            train->instances[train_index]->text = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));
            train->instances[train_index]->class = dataset->instances[i]->class;
            train->instances[train_index]->prediction = dataset->instances[i]->prediction;
            memcpy(train->instances[train_index]->text_vector, dataset->instances[i]->text_vector, sizeof(dataset->instances[i]->text_vector));
            train_index++;
        }
    }
    train->count = train_instances;

    return 0;
}

int create_vector_represntation(data * dataset, char ** stop_words, const int stop_word_count, const int size) {
    dictionary_word ** dictionary;
    int word_count = 0;
    int allocated = size;

    // 0 out the vector counts for the dataset.
    for (int i = 0; i < size; i++) {
        dataset->vector_document_counts[i] = 0;
    }

    if ((dictionary = calloc(size, sizeof(dictionary_word *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for dictionary\n");
        return 1;
    }

    fprintf(stdout, "%d instances considered for vector\n", dataset->count);

    for (int i = 0; i < dataset->count; i++) {
        char * token, * string, * string_start;
        int * terms_per_document;

        string_start = string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));

        if ((terms_per_document = calloc(allocated, sizeof(int))) == NULL) {
            fprintf(stderr, "Unable to create vector\n");
            return 1;
        }

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
            return 1;
        }

        while ((token = strsep(&string, ".,?! ")) != NULL) {
            int word_found = 0;
            int is_stop_word = 0;

            if (strcmp(token, "") != 0) {
                // Is this token already in the dictionary?
                for (int j = 0; j < word_count; j++) {
                    if (strcasecmp(token, dictionary[j]->word) == 0) {
                        dictionary[j]->count++;
                        terms_per_document[j]++;
                        word_found = 1;
                    }
                }

                if (stop_words != NULL) {
                    for (int j = 0; j < stop_word_count; j++) {
                        if (strcasecmp(token, stop_words[j]) == 0) {
                            is_stop_word = 1;
                        }
                    }
                }

                if (!word_found && !is_stop_word) {
                    word_count++;

                    // Grow the dictionary and the per-document counts if needed.
                    if (word_count > allocated) {
                        dictionary_word ** more_words;
                        if ((more_words = realloc(dictionary, sizeof(dictionary_word *) * (2 * word_count))) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for dictionary\n");
                            return 1;
                        }
                        dictionary = more_words;
                        allocated = 2 * word_count;

                        int * more_document_counts;
                        if ((more_document_counts = realloc(terms_per_document, sizeof(int) * allocated)) == NULL) {
                            fprintf(stderr, "Unable to allocate memory for term frequencies\n");
                            return 1;
                        }
                        terms_per_document = more_document_counts;
                    }

                    if ((dictionary[word_count - 1] = calloc(1, sizeof(dictionary_word))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }

                    if ((dictionary[word_count - 1]->word = calloc(strlen(token) + 1, sizeof(char))) == NULL) {
                        fprintf(stderr, "Unable to allocate memory for dictionary word\n");
                        return 1;
                    }

                    strlcpy(dictionary[word_count - 1]->word, token, strlen(token) + 1);
                    dictionary[word_count - 1]->count = 1;
                    terms_per_document[word_count - 1] = 1;
                }
            }
        }

        // A term's document count rises once per message it appears in.
        for (int j = 0; j < word_count; j++) {
            if (terms_per_document[j] > 0) {
                dictionary[j]->document_count++;
            }
        }

        free(string_start);
        free(terms_per_document);
    }

    // Keep the `size` most frequent terms as the model vector.
    qsort(dictionary, word_count, sizeof(dictionary_word *), compare_strings);

    if ((dataset->vector_terms = calloc(size, sizeof(char *))) == NULL) {
        fprintf(stderr, "Unable to allocate memory for Model Vector\n");
        return 1;
    }

    for (int i = 0; i < size; i++) {
        if (i < word_count) {
            dataset->vector_terms[i] = strndup(dictionary[i]->word, strlen(dictionary[i]->word));
            dataset->vector_document_counts[i] = dictionary[i]->document_count;
        } else {
            dataset->vector_terms[i] = strdup("");
            dataset->vector_document_counts[i] = 0;
        }
    }

    fprintf(stdout, "Found %d different words\n", word_count);

    for (int i = 0; i < word_count; i++) {
        free(dictionary[i]->word);
        free(dictionary[i]);
    }
    free(dictionary);

    return 0;
}

int compare_strings(const void * a, const void * b) {
    const dictionary_word * word_a = *(dictionary_word **) a;
    const dictionary_word * word_b = *(dictionary_word **) b;

    // Sorts dictionary entries by descending term count.
    return (word_b->count - word_a->count);
}

int vector_representation(struct data * dataset, char ** vector_terms, int * vector_document_counts, const int size) {
    // Use Grand Central Dispatch and Blocks to multithread this task for performance.
    dispatch_apply(dataset->count, dispatch_get_global_queue(0, 0), ^ (size_t i) {
        char * token, * string, * string_start;

        string_start = string = strndup(dataset->instances[i]->text, strlen(dataset->instances[i]->text));

        if (string == NULL) {
            fprintf(stderr, "Unable to parse message for terms\n");
        }

        for (int index = 0; index < size; index++) {
            dataset->instances[i]->text_vector[index] = 0;
        }

        // Raw term counts for this message.
        while ((token = strsep(&string, ".,?! ")) != NULL) {
            for (int index = 0; index < size; index++) {
                if (strcasecmp(token, vector_terms[index]) == 0) {
                    dataset->instances[i]->text_vector[index]++;
                }
            }
        }
        free(string_start);

        // Re-weight the raw counts as tf-idf.
        for (int index = 0; index < size; index++) {
            double tf = dataset->instances[i]->text_vector[index];
            double docs = 1;

            // Document frequency for this term; fall back to 1 to avoid dividing by zero.
            if (vector_document_counts[index] != 0) {
                docs = vector_document_counts[index];
            }

            double idf = log(dataset->count / docs);
            double tfidf = tf * idf;

            dataset->instances[i]->text_vector[index] = tfidf;
        }
    });

    return 0;
}
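/*
 * One possible calling sequence for the functions in this file (a sketch
 * only; the real driver lives elsewhere, and the file names and the
 * VECTOR_SIZE constant are assumptions):
 *
 *     data * all = read_data("messages.txt");
 *     int stop_count = 0;
 *     char ** stops = load_stop_words("stop_words.txt", &stop_count);
 *
 *     data train = {0}, test = {0};
 *     train_test_split(all, 20, &train, &test);   // hold out 20% for testing
 *
 *     create_vector_represntation(&train, stops, stop_count, VECTOR_SIZE);
 *     vector_representation(&train, train.vector_terms, train.vector_document_counts, VECTOR_SIZE);
 *     vector_representation(&test, train.vector_terms, train.vector_document_counts, VECTOR_SIZE);
 *
 *     weka_output(&train, "train.arff");
 *
 * For reference, vector_representation() weights each component of
 * text_vector as tf * ln(N / df), where tf is the term's count in the
 * message, N is the number of messages, and df is the number of messages
 * the term appears in. For example, with N = 100 messages, a term that
 * occurs 3 times in a message and appears in 10 messages overall is
 * weighted 3 * ln(100 / 10), about 6.91.
 */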