#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
#define LOG(...) fprintf(stderr,__VA_ARGS__);
typedef unsigned int uint;
typedef uint64_t ulongint;
enum settings_list
{
MARKOVLVL = 3, //level of markov chain
MAXTEXTSIZE = 10000, //maximal size of text
AVERAGESENTENCESIZE = 7, //average size of sentence
AVERAGELINESIZE = 15
};
/*
ulongint hash_coefficients[13]={1,37,1369,50653,1874161,69343957,2565726409,94931877133,3512479453921,
129961739795077,4808584372417849,177917621779460413,6582952005840035281};*/
ulongint hash_coefficients[13]={1, 31, 961, 29791, 923521, 28629151, 887503681, 27512614111, 852891037441,
26439622160671, 819628286980801, 25408476896404831, 787662783788549761};
//int settings[sizeof(settings_list)];
char* delimeter_list[9] = {" ", "...", "!", "?", ".", "?!", ":", ";", ","};
typedef struct suffix
{
char * text;
struct suffix * next; //pointer to the next suffix
uint suff_counter; //how many times this suffix appears
uint eos_counter; //how many times this prefix was an end of sentence
struct suffix * delimeter_list;
} suffix;
ulongint hash(char * str)
{
ulongint result = 0;
for(int i = 0; i < numlen; i++)
result += (str[i] + 1)*hash_coefficients[i%13];
return result;
}
ulongint prefix_hash(char * pref[MARKOVLVL])
{
ulongint result = 0;
for(int i = 0; i < MARKOVLVL; i++)
result += hash(pref[i])*(i+2);
return result;
}
void free_s_list(suffix ** suff)
{
if((*suff)->next == NULL)
{
(*suff)->text = NULL;
(*suff) = NULL;
return;
}
else
{
free_s_list(&((*suff)->next));
free_s_list(suff);
}
}
typedef struct prefix
{
ulongint key;
suffix * suffix_list;
uint sl_size; //size of suffix list
char * words[MARKOVLVL];
//uint counter; //how many times this prefix appears
}prefix ;
char * get_delimeter(char * str)
{
char * tmp;
for(int i = 1; i <= 8; i++)
{
tmp
= strstr(str
, delimeter_list
[i
]); return delimeter_list[i];
}
return delimeter_list[0]; //return space symbol
}
int get_delimeter_index(char * str)
{
char * tmp;
for(int i = 1; i <= 8; i++)
{
tmp
= strstr(str
, delimeter_list
[i
]); return i;
}
return 0; //return space symbol
}
char * remove_delimeter(char * str)
{
if(len == 0 || len == 1)
return str;
char * result = NULL;
char * gd = get_delimeter(str);
if(gd
!= NULL
&& strcmp(gd
," ") != 0) {
LOG
("Memory, allocated for result in REMOVE_DELIMETER: %d\n",len
-strlen(gd
)+1); /*for(int i = 0; i < len-strlen(gd); i++)
result[i] = str[i];*/
result
[len
-strlen(gd
)] = '\0'; }
else
{
if(str[0] == '\"' && str[len-1] == '\"')
{
result
= (char*) malloc(sizeof(char)*(len
-2+1)); for(int i = 1; i < len-1; i++)
result[i-1] = str[i];
result[len-2] = '\0';
}
else if(str[0] == '\"')
{
result
= (char*) malloc(sizeof(char)*(len
-1+1)); for(int i = 1; i <= len-1; i++)
result[i-1] = str[i];
result[len-1] = '\0';
}
else if(str[len-1] == '\"')
{
result
= (char*) malloc(sizeof(char)*(len
-1+1)); for(int i = 0; i < len-1; i++)
result[i] = str[i];
result[len-1] = '\0';
}
else
{
result
= (char*) malloc(sizeof(char)*(len
+1)); result[len] = '\0';
}
}
return result;
}
uint is_eos_delimeter(char * delim)
{
return 1;
else return 0;
}
char * lower(char * str) //locale-dependend function
{
if(str[0] <= 'Z' && str[0] >= 'A')
str[0] += 'a'-'A';
return str;
}
char * higher(char * str) //locale-dependend function
{
if(str[0] <= 'z' && str[0] >= 'A')
str[0] -= 'a'-'A';
return str;
}
prefix * create_prefix(char * pref[MARKOVLVL], char * suff)
{
LOG("IN CREATE_PREFIX FUNCTION\n");
LOG("Allocating memory for result\n");
prefix
* result
= (prefix
*) malloc(sizeof(prefix
)); char * temp;
temp = remove_delimeter(suff);
LOG("Allocating memory for suffix_list\n");
result
->suffix_list
= (suffix
*) malloc(sizeof(suffix
)); result->suffix_list->next = NULL;
//LOG("Allocating memory for suffix_list->text\n");
//result->suffix_list->text = (char*) malloc(strlen(temp)+1);
result->suffix_list->suff_counter = 1;
LOG("Allocating memory for suffix_list->delimeter_list\n");
result
->suffix_list
->delimeter_list
= (suffix
*) malloc(sizeof(suffix
)); result->suffix_list->delimeter_list->next = NULL;
result->suffix_list->delimeter_list->text = get_delimeter(suff);
result->suffix_list->delimeter_list->suff_counter = 1;
result->key = 0;
if(is_eos_delimeter(result->suffix_list->delimeter_list->text) == 1)
{
result->suffix_list->delimeter_list->eos_counter = 1;
result->suffix_list->eos_counter = 1;
}
/*if(is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])) == 0)
{
LOG("Copying LOWER suffix\n");
strcpy(result->suffix_list->text,lower(temp));
}
else
{*/
LOG("Copying suffix\n");
//strcpy(result->suffix_list->text,temp);
result->suffix_list->text = temp;
if(result->suffix_list->text[0] >= 'A' && result->suffix_list->text[0] <= 'Z')
result->suffix_list->text[0] += 'A'-'a';
//}
//LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
/*LOG("Freeing temp memory\n");
if(temp != NULL)
{
LOG("Temp memory address: %p\n", temp);
free(temp);
temp = NULL;
}
else
LOG("Problem with freeing\n");
LOG("Temp memory freed\n");*/
for(int i = 0; i < MARKOVLVL; i++)
{
//temp = remove_delimeter(lower(pref[i]));
temp = remove_delimeter(pref[i]);
//LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
LOG
("Allocating %d bytes of memory for result->words[%d]\n",strlen(temp
)+1, i
); if(temp != NULL && temp != pref[i])
{
result->words[i] = temp;
/*result->words[i] = (char*) malloc(strlen(temp)+1);
strcpy(result->words[i], temp);
LOG("Freeing temp memory\n");
free(temp);*/
}
else if(temp == NULL)
{
LOG("Something went wrong in CREATE_PREFIX\n");
}
else
{
strcpy(result
->words
[i
], pref
[i
]); }
if(result->words[i][0] >= 'A' && result->words[i][0] <= 'Z')
result->words[i][0] += 'A'-'a';
temp = NULL;
}
result->key = prefix_hash(result->words);
LOG("Prefix is:\n");
for(int i = 0; i < MARKOVLVL; i++)
LOG("%d: %s\n", i, result->words[i]);
LOG("Prefix key is:%llu\n", result->key);
//LOG("SUFFIX:%s\n",suff);
result->sl_size = 1;
LOG("Returning result\n");
return result;
}
void delete_prefix(prefix ** pref)
{
if((*pref)->suffix_list != NULL)
free_s_list(&((*pref)->suffix_list));
for(int i = 0; i < MARKOVLVL; i++)
{
if((*pref)->words[i] != NULL)
}
if((*pref) != NULL)
}
void add_delimeter(suffix * suff, int delim_index)
{
char * delim = delimeter_list[delim_index];
if(is_eos_delimeter(delim) == 1)
suff->eos_counter += 1;
suffix * dlist;
dlist = suff->delimeter_list;
if(dlist != NULL)
{
for(int i = 0; i < suff->suff_counter;)
{
if(strcmp(dlist
->text
, delim
) == 0) {
dlist->suff_counter +=1;
return;
}
i += dlist->suff_counter;
if(dlist->next == NULL)
{
dlist
->next
= (suffix
*) malloc(sizeof(suffix
)); dlist = dlist->next;
break;
}
}
}
else
{
dlist
= (suffix
*) malloc(sizeof(suffix
)); }
dlist->next = NULL;
suff->suff_counter += 1;
dlist->suff_counter = 1;
dlist->text = delim;
}
void add_suffix(prefix * pref, char * suff, int delim_index, int is_pref_eos)
{
suffix * suf = pref->suffix_list;
for(int i = 0; i < pref->sl_size;)
{
if(strcmp(suf
->text
, suff
) == 0) {
add_delimeter(suf, delim_index);
suf->suff_counter += 1;
pref->sl_size++;
return;
}
i+=suf->suff_counter;
if( suf->next == NULL)
{
suf
->next
=(suffix
*) malloc(sizeof(suffix
)); suf = suf->next;
break;
}
}
pref->sl_size++;
suf->next = NULL;
/*if(is_pref_eos == 1)
strcpy(suf->text, lower(suff));
else
strcpy(suf->text, suff);*/
if(suf->text[0] >= 'A' && suf->text[0] <= 'Z')
suf->text[0]+= 'A'-'a';
suf->suff_counter = 1;
}
suffix * get_random_suffix(suffix * suffix_list, int sl_size)
{
if(suffix_list == NULL ||sl_size == 0)
return NULL;
int randint
= rand() % sl_size
; suffix * suf = suffix_list;
for(int i = 0; i < sl_size;)
{
randint -= suf->suff_counter;
if(randint < 0 || suf->next == NULL)
break;
}
return suf;
}
prefix empty_prefix = {0,NULL,0,{NULL,NULL,NULL}};
typedef struct bt_node
{
struct bt_node * left;
struct bt_node * right;
//bt_node * parent;
//ulongint key;
prefix * value;
} bt_node;
bt_node * bt_add(bt_node * root, char * pref[MARKOVLVL], ulongint pref_hash, char * suff)
{
LOG("IN BT_ADD FUNCTION\n");
if(root == NULL)
{
root
= (bt_node
*) malloc(sizeof(bt_node
)); root->left = NULL;
root->right = NULL;
//root->parent = NULL;
//root->key = pref_hash;
LOG("Going to CREATE_PREFIX\n");
root->value = create_prefix(pref, suff);
}
else if(pref_hash < root->value->key)
{
root->left = bt_add(root->left, pref, pref_hash, suff);
}
else if(pref_hash > root->value->key)
{
root->right = bt_add(root->right, pref, pref_hash, suff);
}
else if(pref_hash == root->value->key)
{
LOG("Going to ADD_SUFFIX\n");
add_suffix(root->value, remove_delimeter(suff), get_delimeter_index(suff), is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])));
}
return root;
}
bt_node * bt_search(bt_node * root, ulongint key)
{
if(root!=NULL) LOG("inc:%llu cur:%llu >?:%d\n",key, root->value->key,key > root->value->key);
if(root == NULL)
return NULL;
else if(key < root->value->key)
return bt_search(root->left, key);
else if(key > root->value->key)
return bt_search(root->right,key);
else if(key == root->value->key)
return root;
return NULL;
}
void bt_delete(bt_node ** root)
{
if((*root)->left!=NULL)
bt_delete(&((*root)->left));
else if((*root)->right!=NULL)
bt_delete(&((*root)->right));
else
}
bool xor(bool a, bool b)
{
return (a&&!b)||(!a&&b);
}
prefix * bt_get_random_prefix(bt_node * root, ulongint rnd)
{
if(root->left == NULL && root->right == NULL)
return root->value;
//else if((root->key - rnd) == rnd || (root->key == rnd))
// return root->value;
else if(root
->right
!= NULL
&& (rnd
- rand() < rnd
|| root
->left
== NULL
) ) return bt_get_random_prefix(root->right, rnd);
else
return bt_get_random_prefix(root->left, rnd);
}
char * read_word(FILE * fd)
{
int max_size = 5, cur_size = 0;
char * result
=(char *) malloc(max_size
*sizeof(char)); while(ch != EOF && ch != '\n' && ch != ' ' && ch != '\t')
{
if(cur_size >= max_size)
{
max_size *= 2;
result
= (char * ) realloc(result
, max_size
*sizeof(char)); }
result[cur_size] = ch;
cur_size++;
}
if(cur_size == 0)
{
return NULL;
}
else
{
result
= (char*) realloc(result
, ((cur_size
+1)*sizeof(char))); result[cur_size] = '\0';
}
return result;
}
bt_node * generate_tree(FILE * fd)
{
bt_node * root = NULL;
char * pref[MARKOVLVL];
char * suf;
LOG("Initial fd reading\n");
for(int i = 0; i < MARKOVLVL; i++)
{
pref[i] = read_word(fd);
if(pref[i] == NULL)
{
LOG("Input file reading error. Exiting");
return NULL;
}
}
while(1)
{
LOG("Reading suffix\n");
suf = read_word(fd);
LOG("Suffix is %s\n", suf);
if(suf == NULL)
{
break;
else
continue;
}
LOG("Adding suffix to tree\n");
root = bt_add(root, pref, prefix_hash(pref), suf);
LOG("Suffix was succesfully added\n");
LOG("Freeing pref[0] memory\n");
pref[0] = NULL;
for(int i = 0; i < MARKOVLVL-1; i++)
{
pref[i] = pref[i+1];
}
/*for(int i = 0; i < MARKOVLVL; i++)
LOG("PREFIX[%d]: %s\n",i,pref[i]);*/
pref[MARKOVLVL-1] = suf;
}
for(int i = 0; i < MARKOVLVL; i++)
if(pref[i] != NULL)
return root;
}
void generate_text(bt_node * root, FILE * fd, uint text_size)
{
LOG("IN GENERATE_TEXT FUNCTION\n");
if(root == NULL)
return;
char * pref[MARKOVLVL];
LOG("Getting random prefix\n");
prefix
* cur_pref
= bt_get_random_prefix
(root
, rand()); pref[0] = cur_pref->words[0];
fprintf(fd
, "%c%s ", pref
[0][0]-'a'+'A', pref
[0]+1); for(int i = 1; i < MARKOVLVL; i++)
{
pref[i] = cur_pref->words[i];
}
LOG("Getting random suffix\n");
suffix * t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
LOG("Got random suffix\n");
char * suf = t_suf->text;
char * delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
uint ls_counter = MARKOVLVL+1; //current line size counter
uint txt_counter = MARKOVLVL+1; //current text size counter
uint sent_counter = MARKOVLVL+1;
uint eos_flag = 2;
ulongint key;
char t_char;
while(txt_counter < text_size && txt_counter < MAXTEXTSIZE)
{
if(ls_counter > AVERAGELINESIZE)
{
ls_counter = 1;
}
for(int i = 0; i < MARKOVLVL-1; i++)
{
pref[i] = pref[i+1];
}
pref[MARKOVLVL-1] = suf;
LOG("Searching for prefix\n");
key = prefix_hash(pref);
LOG("Prefix key is: %llu\n", key);
cur_pref = (bt_search(root, key))->value;
if(cur_pref == NULL)
{
LOG("Something went wrong");
}
LOG("Getting random suffix\n");
t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
suf = t_suf->text;
if(sent_counter > AVERAGESENTENCESIZE)
{
delim
= delimeter_list
[rand()%3 +2]; sent_counter = 0;
eos_flag = 0;
}
else
{
delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
if(is_eos_delimeter(delim) == 1)
{
sent_counter = 0;
eos_flag = 0;
}
else
eos_flag++;
}
t_char = suf[0];
if(eos_flag == 1)
t_char -= 'a'-'A';
fprintf(fd
, "%c%s ",t_char
, suf
+1); else
fprintf(fd
, "%c%s%s ",t_char
, suf
+1, delim
); txt_counter++;
ls_counter++;
sent_counter++;
LOG("Prefix was:\n");
for(int i = 0; i < MARKOVLVL; i++)
LOG("%d: %s\n", i, pref[i]);
LOG("Suffix was:\n%s\n",suf);
}
}
int main(void)
{
/*settings[MARKOVLVL] = 3;
settings[HASHSIZE] = 10000;
settings[MAXTEXTSIZE] = 10000;
settings[AVERAGESENTENCESIZE] = 7;*/
//char * somestr[MARKOVLVL] = {"some","shitty","string"};
//prefix * pref = create_prefix(somestr, "wow");
// add_suffix(pref, "wow");
// add_suffix(pref, "notwow");
// suffix* suff = pref->suffix_list;
// while(suff != NULL)
// {
// LOG("%s\n",suff->text);
// if(suff->next == NULL)
// break;
// else
// suff = suff->next;
// }
// delete_prefix(&pref);
/*FILE * fd = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\input_01.txt", "r");
FILE * fd1 = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\output_01.txt", "w");
if(fd == NULL)
{
LOG("Couldn\'t open file");
getchar();
exit(1);
}*/
LOG("Going to tree building\n");
bt_node * root = generate_tree(stdin);
LOG("Going to text generation\n");
FILE
* output
= fopen("./generated_text_02","w"); if(output == NULL)
{
LOG("Couldn\'t open file");
}
generate_text(root, output, 100);
bt_delete(&root);
/*char * str = (char *) malloc(7);
scanf("%s",str);
printf("%s\n",str);
str = lower(str);
printf("%s\n", str);
getchar();*/
return 0;
}