fork download
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <stdint.h>
  5. #include <stdbool.h>
  6. #include <time.h>
  7.  
  8. #define LOG(...) fprintf(stderr,__VA_ARGS__);
  9.  
  10. typedef unsigned int uint;
  11. typedef uint64_t ulongint;
  12.  
  13. enum settings_list
  14. {
  15. MARKOVLVL = 3, //level of markov chain
  16. MAXTEXTSIZE = 10000, //maximal size of text
  17. AVERAGESENTENCESIZE = 7, //average size of sentence
  18. AVERAGELINESIZE = 15
  19. };
  20. /*
  21. ulongint hash_coefficients[13]={1,37,1369,50653,1874161,69343957,2565726409,94931877133,3512479453921,
  22. 129961739795077,4808584372417849,177917621779460413,6582952005840035281};*/
  23.  
  24.  
  25. ulongint hash_coefficients[13]={1, 31, 961, 29791, 923521, 28629151, 887503681, 27512614111, 852891037441,
  26. 26439622160671, 819628286980801, 25408476896404831, 787662783788549761};
  27. //int settings[sizeof(settings_list)];
  28.  
  29. char* delimeter_list[9] = {" ", "...", "?!", "?", ".", "!", ":", ";", ","};
  30. typedef struct suffix
  31. {
  32. char * text;
  33. struct suffix * next; //pointer to the next suffix
  34. uint suff_counter; //how many times this suffix appears
  35. uint eos_counter; //how many times this prefix was an end of sentence
  36. struct suffix * delimeter_list;
  37. } suffix;
  38.  
  39. void free_s_list(suffix ** suff)
  40. {
  41. if((*suff)->next == NULL)
  42. {
  43. free((*suff)->text);
  44. (*suff)->text = NULL;
  45. free((*suff));
  46. (*suff) = NULL;
  47. return;
  48. }
  49. else
  50. {
  51. free_s_list(&((*suff)->next));
  52. free_s_list(suff);
  53. }
  54. }
  55.  
  56.  
  57. typedef struct prefix
  58. {
  59. suffix * suffix_list;
  60. uint sl_size; //size of suffix list
  61. char * words[MARKOVLVL];
  62. uint counter; //how many times this prefix appears
  63. }prefix ;
  64.  
  65. char * get_delimeter(char * str)
  66. {
  67. char * tmp;
  68. for(int i = 1; i <= 8; i++)
  69. {
  70. tmp = strstr(str, delimeter_list[i]);
  71. if(tmp!=NULL && strlen(tmp) == strlen(delimeter_list[i]))
  72. return delimeter_list[i];
  73. }
  74. return delimeter_list[0]; //return space symbol
  75. }
  76.  
  77. int get_delimeter_index(char * str)
  78. {
  79. char * tmp;
  80. for(int i = 1; i <= 8; i++)
  81. {
  82. tmp = strstr(str, delimeter_list[i]);
  83. if(tmp!=NULL && strlen(tmp) == strlen(delimeter_list[i]))
  84. return i;
  85. }
  86. return 0; //return space symbol
  87. }
  88.  
  89. char * remove_delimeter(char * str)
  90. {
  91. int len = strlen(str);
  92. if(len == 0 || len == 1)
  93. return str;
  94. char * result;
  95. char * gd = get_delimeter(str);
  96. if(gd != NULL && strcmp(gd," ") != 0)
  97. {
  98. result = (char*) malloc(sizeof(char)*(len-strlen(gd)+1));
  99. strncpy(result, str, len-strlen(gd));
  100. /*for(int i = 0; i < len-strlen(gd); i++)
  101. result[i] = str[i];*/
  102. result[len-strlen(gd)] = '\0';
  103. }
  104. else
  105. {
  106. if(str[0] == '\"' && str[len-1] == '\"')
  107. {
  108. result = (char*) malloc(sizeof(char)*(len-2+1));
  109. for(int i = 1; i < len-1; i++)
  110. result[i-1] = str[i];
  111. result[len-2] = '\0';
  112. }
  113. else if(str[0] == '\"')
  114. {
  115. result = (char*) malloc(sizeof(char)*(len-1+1));
  116. for(int i = 1; i <= len-1; i++)
  117. result[i-1] = str[i];
  118. result[len-1] = '\0';
  119. }
  120. else if(str[len-1] == '\"')
  121. {
  122. result = (char*) malloc(sizeof(char)*(len-1+1));
  123. for(int i = 0; i < len-1; i++)
  124. result[i] = str[i];
  125. result[len-1] = '\0';
  126. }
  127. else
  128. {
  129. result = (char*) malloc(sizeof(char)*(len+1));
  130. strcpy(result, str);
  131. result[len] = '\0';
  132. }
  133. }
  134. return result;
  135. }
  136.  
  137. uint is_eos_delimeter(char * delim)
  138. {
  139. if(strcmp(delim,"!") == 0 || strcmp(delim,"?") == 0 ||
  140. strcmp(delim,".") == 0 || strcmp(delim,"?!") == 0 ||
  141. strcmp(delim,"...") == 0)
  142. return 1;
  143. else return 0;
  144. }
  145.  
  146. char * lower(char * str) //locale-dependend function
  147. {
  148. if(str[0] <= 'Z' && str[0] >= 'A')
  149. str[0] += 'a'-'A';
  150. return str;
  151. }
  152.  
  153. char * higher(char * str) //locale-dependend function
  154. {
  155. if(str[0] <= 'z' && str[0] >= 'A')
  156. str[0] -= 'a'-'A';
  157. return str;
  158. }
  159.  
  160. prefix * create_prefix(char * pref[MARKOVLVL], char * suff)
  161. {
  162. LOG("IN CREATE_PREFIX FUNCTION\n");
  163. LOG("Allocating memory for result\n");
  164. prefix * result = (prefix*) malloc(sizeof(prefix));
  165. char * temp;
  166. temp = remove_delimeter(suff);
  167. LOG("Allocating memory for suffix_list\n");
  168. result->suffix_list = (suffix*) malloc(sizeof(suffix));
  169. result->suffix_list->next = NULL;
  170. LOG("Allocating memory for suffix_list->text\n");
  171. result->suffix_list->text = (char*) malloc(strlen(temp)+1);
  172. result->suffix_list->suff_counter = 1;
  173. LOG("Allocating memory for suffix_list->delimeter_list\n");
  174. result->suffix_list->delimeter_list = (suffix*) malloc(sizeof(suffix));
  175. result->suffix_list->delimeter_list->next = NULL;
  176. result->suffix_list->delimeter_list->text = get_delimeter(suff);
  177. result->suffix_list->delimeter_list->suff_counter = 1;
  178. if(is_eos_delimeter(result->suffix_list->delimeter_list->text) == 1)
  179. {
  180. result->suffix_list->delimeter_list->eos_counter = 1;
  181. result->suffix_list->eos_counter = 1;
  182. }
  183. //strcpy(result->suffix_list->text,lower(temp));
  184. strcpy(result->suffix_list->text,temp);
  185. /*if(is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])) == 0)
  186. {
  187. LOG("Copying LOWER suffix\n");
  188. strcpy(result->suffix_list->text,lower(temp));
  189. }
  190. else
  191. {
  192. LOG("Copying suffix\n");
  193. strcpy(result->suffix_list->text,temp);
  194. }*/
  195. //LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
  196. LOG("Freeing temp memory\n");
  197. if(temp != NULL)
  198. {
  199. LOG("Temp memory address: %p\n", temp);
  200. free(temp);
  201. temp = NULL;
  202. }
  203. else
  204. LOG("Problem with freeing\n");
  205. LOG("Temp memory freed\n");
  206. for(int i = 0; i < MARKOVLVL; i++)
  207. {
  208. //temp = remove_delimeter(lower(pref[i]));
  209. temp = remove_delimeter(pref[i]);
  210. //LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
  211. LOG("Allocating memory for result->words[%d]\n", i);
  212. result->words[i] = (char*) malloc(strlen(temp)+1);
  213. strcpy(result->words[i], temp);
  214. LOG("Freeing temp memory\n");
  215. free(temp);
  216. temp = NULL;
  217. }
  218. //LOG("SUFFIX:%s\n",suff);
  219. result->counter = 1;
  220. result->sl_size = 1;
  221. LOG("Returning result\n");
  222. return result;
  223. }
  224.  
  225. void delete_prefix(prefix ** pref)
  226. {
  227. if((*pref)->suffix_list != NULL)
  228. free_s_list(&((*pref)->suffix_list));
  229. for(int i = 0; i < MARKOVLVL; i++)
  230. {
  231. if((*pref)->words[i] != NULL)
  232. free((*pref)->words[i]);
  233. }
  234. if((*pref) != NULL)
  235. free((*pref));
  236. }
  237.  
  238. void add_delimeter(suffix * suff, int delim_index)
  239. {
  240. char * delim = delimeter_list[delim_index];
  241. if(is_eos_delimeter(delim) == 1)
  242. suff->eos_counter += 1;
  243. suffix * dlist;
  244. dlist = suff->delimeter_list;
  245. if(dlist != NULL)
  246. {
  247. for(int i = 0; i < suff->suff_counter;)
  248. {
  249. if(strcmp(dlist->text, delim) == 0)
  250. {
  251. dlist->suff_counter +=1;
  252. return;
  253. }
  254. i += dlist->suff_counter;
  255. if(dlist->next == NULL)
  256. {
  257. dlist->next = (suffix*) malloc(sizeof(suffix));
  258. dlist = dlist->next;
  259. break;
  260. }
  261. }
  262. }
  263. else
  264. {
  265. dlist = (suffix*) malloc(sizeof(suffix));
  266. }
  267. dlist->next = NULL;
  268. suff->suff_counter += 1;
  269. dlist->suff_counter = 1;
  270. dlist->text = delim;
  271. }
  272.  
  273. void add_suffix(prefix * pref, char * suff, int delim_index, int is_pref_eos)
  274. {
  275. suffix * suf = pref->suffix_list;
  276. for(int i = 0; i < pref->sl_size;)
  277. {
  278. if(strcmp(suf->text, suff) == 0)
  279. {
  280. add_delimeter(suf, delim_index);
  281. suf->suff_counter += 1;
  282. pref->sl_size++;
  283. return;
  284. }
  285. i+=suf->suff_counter;
  286. if( suf->next == NULL)
  287. {
  288. suf->next =(suffix*) malloc(sizeof(suffix));
  289. suf = suf->next;
  290. break;
  291. }
  292. }
  293. pref->sl_size++;
  294. suf->next = NULL;
  295. suf->text = (char*) malloc(strlen(suff)+1);
  296. /*if(is_pref_eos == 1)
  297.   strcpy(suf->text, lower(suff));
  298.   else
  299.   strcpy(suf->text, suff);*/
  300. strcpy(suf->text, suff);
  301. suf->suff_counter = 1;
  302. pref->counter += 1;
  303. }
  304.  
  305. suffix * get_random_suffix(suffix * suffix_list, int sl_size)
  306. {
  307. if(suffix_list == NULL ||sl_size == 0)
  308. return NULL;
  309. int randint = rand() % sl_size;
  310. suffix * suf = suffix_list;
  311. for(int i = 0; i < sl_size;)
  312. {
  313. randint -= suf->suff_counter;
  314. if(randint < 0 || suf->next == NULL)
  315. break;
  316. }
  317. return suf;
  318. }
  319.  
  320. prefix empty_prefix = {NULL,0,{NULL,NULL,NULL},0};
  321.  
  322. typedef struct bt_node
  323. {
  324. struct bt_node * left;
  325. struct bt_node * right;
  326. //bt_node * parent;
  327. ulongint key;
  328. prefix * value;
  329. } bt_node;
  330.  
  331. bt_node * bt_add(bt_node * root, char * pref[MARKOVLVL], ulongint pref_hash, char * suff)
  332. {
  333. LOG("IN BT_ADD FUNCTION\n");
  334. if(root == NULL)
  335. {
  336. root = (bt_node*) malloc(sizeof(bt_node));
  337. root->left = NULL;
  338. root->right = NULL;
  339. //root->parent = NULL;
  340. root->key = pref_hash;
  341. LOG("Going to CREATE_PREFIX\n");
  342. root->value = create_prefix(pref, suff);
  343. }
  344. else if(pref_hash < root->key)
  345. {
  346. root->left = bt_add(root->left, pref, pref_hash, suff);
  347. }
  348. else if(pref_hash > root->key)
  349. {
  350. root->right = bt_add(root->right, pref, pref_hash, suff);
  351. }
  352. else if(pref_hash == root->key)
  353. {
  354. LOG("Going to ADD_SUFFIX\n");
  355. add_suffix(root->value, remove_delimeter(suff), get_delimeter_index(suff), is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])));
  356. }
  357. return root;
  358. }
  359.  
  360. bt_node * bt_search(bt_node * root, ulongint key)
  361. {
  362. if(root == NULL)
  363. return NULL;
  364. else if(key < root->key)
  365. return bt_search(root->left, key);
  366. else if(key > root->key)
  367. return bt_search(root->right,key);
  368. else if(key == root->key)
  369. return root;
  370. return NULL;
  371. }
  372.  
  373. void bt_delete(bt_node ** root)
  374. {
  375. if((*root)->left!=NULL)
  376. bt_delete(&((*root)->left));
  377. else if((*root)->right!=NULL)
  378. bt_delete(&((*root)->right));
  379. else
  380. free(*root);
  381. }
  382.  
  383. bool xor(bool a, bool b)
  384. {
  385. return (a&&!b)||(!a&&b);
  386. }
  387.  
  388. prefix * bt_get_random_prefix(bt_node * root, ulongint rnd)
  389. {
  390. if(root->left == NULL && root->right == NULL)
  391. return root->value;
  392. //else if((root->key - rnd) == rnd || (root->key == rnd))
  393. // return root->value;
  394. else if(!xor(root->right != NULL , rnd - rand() < rnd) )
  395. return bt_get_random_prefix(root->right, rnd);
  396. else
  397. return bt_get_random_prefix(root->left, rnd);
  398. }
  399.  
  400. ulongint hash(char * str)
  401. {
  402. int numlen = strlen(str);
  403. ulongint result = 0;
  404. for(int i = 0; i < numlen; i++)
  405. result += (str[i] + 1)*hash_coefficients[i%13];
  406. return result;
  407. }
  408.  
  409. ulongint prefix_hash(char * pref[MARKOVLVL])
  410. {
  411. ulongint result = 0;
  412. for(int i = 0; i < MARKOVLVL; i++)
  413. result += hash(pref[i])*(i+2);
  414. return result;
  415. }
  416.  
  417. char * read_word(FILE * fd)
  418. {
  419. int max_size = 5, cur_size = 0;
  420. char * result =(char *) malloc(max_size*sizeof(char));
  421. char ch = fgetc(fd);
  422. while(ch != EOF && ch != '\n' && ch != ' ' && ch != '\t')
  423. {
  424. if(cur_size >= max_size)
  425. {
  426. max_size *= 2;
  427. result = (char * ) realloc(result, max_size*sizeof(char));
  428. }
  429. result[cur_size] = ch;
  430. cur_size++;
  431. ch = fgetc(fd);
  432. }
  433. if(cur_size == 0)
  434. {
  435. free(result);
  436. return NULL;
  437. }
  438. else
  439. {
  440. result = (char*) realloc(result, ((cur_size+1)*sizeof(char)));
  441. result[cur_size] = '\0';
  442. }
  443. return result;
  444. }
  445.  
  446. bt_node * generate_tree(FILE * fd)
  447. {
  448. bt_node * root = NULL;
  449. char * pref[MARKOVLVL];
  450. char * suf;
  451. char * temp;
  452. LOG("Initial fd reading\n");
  453. for(int i = 0; i < MARKOVLVL; i++)
  454. {
  455. pref[i] = read_word(fd);
  456. if(pref[i] == NULL)
  457. {
  458. LOG("Input file reading error. Exiting");
  459. exit(1);
  460. return NULL;
  461. }
  462. }
  463. while(1)
  464. {
  465. LOG("Reading suffix\n");
  466. suf = read_word(fd);
  467. LOG("Suffix is %s\n", suf);
  468. if(suf == NULL)
  469. {
  470. if(feof(fd))
  471. break;
  472. else
  473. continue;
  474. }
  475. LOG("Adding suffix to tree\n");
  476. root = bt_add(root, pref, prefix_hash(pref), suf);
  477. LOG("Suffix was succesfully added\n");
  478. LOG("Freeing pref[0] memory\n");
  479. free(pref[0]);
  480. for(int i = 0; i < MARKOVLVL-1; i++)
  481. {
  482. pref[i] = pref[i+1];
  483. }
  484. /*for(int i = 0; i < MARKOVLVL; i++)
  485. LOG("PREFIX[%d]: %s\n",i,pref[i]);*/
  486. LOG("Removing delimeter from suffix\n");
  487. temp = remove_delimeter(suf);
  488. pref[MARKOVLVL-1] = temp;
  489. LOG("Freeing suffix memory\n");
  490. free(suf);
  491. }
  492. for(int i = 0; i < MARKOVLVL; i++)
  493. if(pref[i] != NULL)
  494. free(pref[i]);
  495. free(suf);
  496. return root;
  497. }
  498.  
  499. void generate_text(bt_node * root, FILE * fd, uint text_size)
  500. {
  501. LOG("IN GENERATE_TEXT FUNCTION\n");
  502. if(root == NULL)
  503. return;
  504. char * pref[MARKOVLVL];
  505. LOG("Getting random prefix\n");
  506. prefix * cur_pref = bt_get_random_prefix(root, rand());
  507. for(int i = 0; i < MARKOVLVL; i++)
  508. {
  509. pref[i] = cur_pref->words[i];
  510. fprintf(fd, "%s ", pref[i]);
  511. }
  512. LOG("Getting random suffix\n");
  513. suffix * t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
  514. LOG("Got random suffix\n");
  515. char * suf = t_suf->text;
  516. char * delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
  517. uint ls_counter = MARKOVLVL+1; //current line size counter
  518. uint txt_counter = MARKOVLVL+1; //current text size counter
  519. uint sent_counter = MARKOVLVL+1;
  520. fprintf(fd, "%s ", suf);
  521. uint eos_flag = 0;
  522. while(txt_counter < text_size && txt_counter < MAXTEXTSIZE)
  523. {
  524. if(ls_counter > AVERAGELINESIZE)
  525. {
  526. fprintf(fd, "\b\n");
  527. ls_counter = 1;
  528. }
  529. for(int i = 0; i < MARKOVLVL-1; i++)
  530. {
  531. pref[i] = pref[i+1];
  532. }
  533. pref[MARKOVLVL-1] = suf;
  534. LOG("Searching for prefix\n");
  535. cur_pref = (bt_search(root, prefix_hash(pref)))->value;
  536. LOG("Getting random suffix\n");
  537. t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
  538. suf = t_suf->text;
  539. if(sent_counter > AVERAGESENTENCESIZE)
  540. {
  541. delim = delimeter_list[rand()%5 +1];
  542. sent_counter = 0;
  543. eos_flag = 0;
  544. }
  545. else
  546. {
  547. delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
  548. if(is_eos_delimeter(delim) == 1)
  549. {
  550. sent_counter = 0;
  551. eos_flag = 0;
  552. }
  553. else
  554. eos_flag++;
  555. }/*
  556. if(eos_flag == 1)
  557. {
  558. if(strcmp(delim, " ") == 0)
  559. fprintf(fd, "%s ", higher(suf));
  560. else
  561. fprintf(fd, "%s%s ", higher(suf), delim);
  562. }
  563. else
  564. {*/
  565. if(strcmp(delim, " ") == 0)
  566. fprintf(fd, "%s ", suf);
  567. else
  568. fprintf(fd, "%s%s ", suf, delim);
  569. //}
  570. txt_counter++;
  571. ls_counter++;
  572. sent_counter++;
  573. }
  574.  
  575. }
  576.  
  577. int main(void)
  578. {
  579. /*settings[MARKOVLVL] = 3;
  580.   settings[HASHSIZE] = 10000;
  581.   settings[MAXTEXTSIZE] = 10000;
  582.   settings[AVERAGESENTENCESIZE] = 7;*/
  583. //char * somestr[MARKOVLVL] = {"some","shitty","string"};
  584. //prefix * pref = create_prefix(somestr, "wow");
  585. // add_suffix(pref, "wow");
  586. // add_suffix(pref, "notwow");
  587. // suffix* suff = pref->suffix_list;
  588. // while(suff != NULL)
  589. // {
  590. // LOG("%s\n",suff->text);
  591. // if(suff->next == NULL)
  592. // break;
  593. // else
  594. // suff = suff->next;
  595. // }
  596. // delete_prefix(&pref);
  597. srand(time(NULL));
  598. /*FILE * fd = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\input_01.txt", "r");
  599.   FILE * fd1 = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\output_01.txt", "w");
  600.   if(fd == NULL)
  601.   {
  602.   LOG("Couldn\'t open file");
  603.   getchar();
  604.   exit(1);
  605.   }*/
  606. LOG("Going to tree building\n");
  607. bt_node * root = generate_tree(stdin);
  608. LOG("Going to text generation\n");
  609. FILE * output = fopen("./generated_text_02","w");
  610. if(output == NULL)
  611. {
  612. LOG("Couldn\'t open file");
  613. getchar();
  614. exit(1);
  615. }
  616. generate_text(root, output, 100);
  617. bt_delete(&root);
  618. fclose(output);
  619. /*char * str = (char *) malloc(7);
  620.   scanf("%s",str);
  621.   printf("%s\n",str);
  622.   str = lower(str);
  623.   printf("%s\n", str);
  624.   getchar();*/
  625. getchar();
  626. return 0;
  627. }
  628.  
Not running #stdin #stdout 0s 0KB
stdin
Standard input is empty
stdout
Standard output is empty