fork download
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <stdint.h>
  5. #include <stdbool.h>
  6. #include <time.h>
  7.  
  8. #define LOG(...) fprintf(stderr,__VA_ARGS__);
  9.  
  10. typedef unsigned int uint;
  11. typedef uint64_t ulongint;
  12.  
  13. enum settings_list
  14. {
  15. MARKOVLVL = 3, //level of markov chain
  16. MAXTEXTSIZE = 10000, //maximal size of text
  17. AVERAGESENTENCESIZE = 7, //average size of sentence
  18. AVERAGELINESIZE = 15
  19. };
  20. /*
  21. ulongint hash_coefficients[13]={1,37,1369,50653,1874161,69343957,2565726409,94931877133,3512479453921,
  22. 129961739795077,4808584372417849,177917621779460413,6582952005840035281};*/
  23.  
  24.  
  25. ulongint hash_coefficients[13]={1, 31, 961, 29791, 923521, 28629151, 887503681, 27512614111, 852891037441,
  26. 26439622160671, 819628286980801, 25408476896404831, 787662783788549761};
  27. //int settings[sizeof(settings_list)];
  28.  
  29. char* delimeter_list[9] = {" ", "...", "!", "?", ".", "?!", ":", ";", ","};
  30. typedef struct suffix
  31. {
  32. char * text;
  33. struct suffix * next; //pointer to the next suffix
  34. uint suff_counter; //how many times this suffix appears
  35. uint eos_counter; //how many times this prefix was an end of sentence
  36. struct suffix * delimeter_list;
  37. } suffix;
  38.  
  39. ulongint hash(char * str)
  40. {
  41. int numlen = strlen(str);
  42. ulongint result = 0;
  43. for(int i = 0; i < numlen; i++)
  44. result += (str[i] + 1)*hash_coefficients[i%13];
  45. return result;
  46. }
  47.  
  48. ulongint prefix_hash(char * pref[MARKOVLVL])
  49. {
  50. ulongint result = 0;
  51. for(int i = 0; i < MARKOVLVL; i++)
  52. result += hash(pref[i])*(i+2);
  53. return result;
  54. }
  55.  
  56. void free_s_list(suffix ** suff)
  57. {
  58. if((*suff)->next == NULL)
  59. {
  60. free((*suff)->text);
  61. (*suff)->text = NULL;
  62. free((*suff));
  63. (*suff) = NULL;
  64. return;
  65. }
  66. else
  67. {
  68. free_s_list(&((*suff)->next));
  69. free_s_list(suff);
  70. }
  71. }
  72.  
  73.  
  74. typedef struct prefix
  75. {
  76. ulongint key;
  77. suffix * suffix_list;
  78. uint sl_size; //size of suffix list
  79. char * words[MARKOVLVL];
  80. //uint counter; //how many times this prefix appears
  81. }prefix ;
  82.  
  83. char * get_delimeter(char * str)
  84. {
  85. char * tmp;
  86. for(int i = 1; i <= 8; i++)
  87. {
  88. tmp = strstr(str, delimeter_list[i]);
  89. if(tmp!=NULL && strlen(tmp) == strlen(delimeter_list[i]))
  90. return delimeter_list[i];
  91. }
  92. return delimeter_list[0]; //return space symbol
  93. }
  94.  
  95. int get_delimeter_index(char * str)
  96. {
  97. char * tmp;
  98. for(int i = 1; i <= 8; i++)
  99. {
  100. tmp = strstr(str, delimeter_list[i]);
  101. if(tmp!=NULL && strlen(tmp) == strlen(delimeter_list[i]))
  102. return i;
  103. }
  104. return 0; //return space symbol
  105. }
  106.  
  107. char * remove_delimeter(char * str)
  108. {
  109. int len = strlen(str);
  110. if(len == 0 || len == 1)
  111. return str;
  112. char * result = NULL;
  113. char * gd = get_delimeter(str);
  114. if(gd != NULL && strcmp(gd," ") != 0)
  115. {
  116. result = (char*) malloc(sizeof(char)*(len-strlen(gd)+1));
  117. LOG("Memory, allocated for result in REMOVE_DELIMETER: %d\n",len-strlen(gd)+1);
  118. strncpy(result, str, len-strlen(gd));
  119. /*for(int i = 0; i < len-strlen(gd); i++)
  120. result[i] = str[i];*/
  121. result[len-strlen(gd)] = '\0';
  122. }
  123. else
  124. {
  125. if(str[0] == '\"' && str[len-1] == '\"')
  126. {
  127. result = (char*) malloc(sizeof(char)*(len-2+1));
  128. for(int i = 1; i < len-1; i++)
  129. result[i-1] = str[i];
  130. result[len-2] = '\0';
  131. }
  132. else if(str[0] == '\"')
  133. {
  134. result = (char*) malloc(sizeof(char)*(len-1+1));
  135. for(int i = 1; i <= len-1; i++)
  136. result[i-1] = str[i];
  137. result[len-1] = '\0';
  138. }
  139. else if(str[len-1] == '\"')
  140. {
  141. result = (char*) malloc(sizeof(char)*(len-1+1));
  142. for(int i = 0; i < len-1; i++)
  143. result[i] = str[i];
  144. result[len-1] = '\0';
  145. }
  146. else
  147. {
  148. result = (char*) malloc(sizeof(char)*(len+1));
  149. strcpy(result, str);
  150. result[len] = '\0';
  151. }
  152. }
  153. return result;
  154. }
  155.  
  156. uint is_eos_delimeter(char * delim)
  157. {
  158. if(strcmp(delim,"!") == 0 || strcmp(delim,"?") == 0 ||
  159. strcmp(delim,".") == 0 || strcmp(delim,"?!") == 0 ||
  160. strcmp(delim,"...") == 0)
  161. return 1;
  162. else return 0;
  163. }
  164.  
  165. char * lower(char * str) //locale-dependend function
  166. {
  167. if(str[0] <= 'Z' && str[0] >= 'A')
  168. str[0] += 'a'-'A';
  169. return str;
  170. }
  171.  
  172. char * higher(char * str) //locale-dependend function
  173. {
  174. if(str[0] <= 'z' && str[0] >= 'A')
  175. str[0] -= 'a'-'A';
  176. return str;
  177. }
  178.  
  179. prefix * create_prefix(char * pref[MARKOVLVL], char * suff)
  180. {
  181. LOG("IN CREATE_PREFIX FUNCTION\n");
  182. LOG("Allocating memory for result\n");
  183. prefix * result = (prefix*) malloc(sizeof(prefix));
  184. char * temp;
  185. temp = remove_delimeter(suff);
  186. LOG("Allocating memory for suffix_list\n");
  187. result->suffix_list = (suffix*) malloc(sizeof(suffix));
  188. result->suffix_list->next = NULL;
  189. //LOG("Allocating memory for suffix_list->text\n");
  190. //result->suffix_list->text = (char*) malloc(strlen(temp)+1);
  191. result->suffix_list->suff_counter = 1;
  192. LOG("Allocating memory for suffix_list->delimeter_list\n");
  193. result->suffix_list->delimeter_list = (suffix*) malloc(sizeof(suffix));
  194. result->suffix_list->delimeter_list->next = NULL;
  195. result->suffix_list->delimeter_list->text = get_delimeter(suff);
  196. result->suffix_list->delimeter_list->suff_counter = 1;
  197. result->key = 0;
  198. if(is_eos_delimeter(result->suffix_list->delimeter_list->text) == 1)
  199. {
  200. result->suffix_list->delimeter_list->eos_counter = 1;
  201. result->suffix_list->eos_counter = 1;
  202. }
  203. /*if(is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])) == 0)
  204. {
  205. LOG("Copying LOWER suffix\n");
  206. strcpy(result->suffix_list->text,lower(temp));
  207. }
  208. else
  209. {*/
  210. LOG("Copying suffix\n");
  211. //strcpy(result->suffix_list->text,temp);
  212. result->suffix_list->text = temp;
  213. if(result->suffix_list->text[0] >= 'A' && result->suffix_list->text[0] <= 'Z')
  214. result->suffix_list->text[0] += 'A'-'a';
  215. //}
  216. //LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
  217. /*LOG("Freeing temp memory\n");
  218. if(temp != NULL)
  219. {
  220. LOG("Temp memory address: %p\n", temp);
  221. free(temp);
  222. temp = NULL;
  223. }
  224. else
  225. LOG("Problem with freeing\n");
  226. LOG("Temp memory freed\n");*/
  227. for(int i = 0; i < MARKOVLVL; i++)
  228. {
  229. //temp = remove_delimeter(lower(pref[i]));
  230. temp = remove_delimeter(pref[i]);
  231. //LOG("SUFFIX:%s\n TEMPORARY STRING:%s\n",suff,temp);
  232. LOG("Allocating %d bytes of memory for result->words[%d]\n",strlen(temp)+1, i);
  233. if(temp != NULL && temp != pref[i])
  234. {
  235. result->words[i] = temp;
  236. /*result->words[i] = (char*) malloc(strlen(temp)+1);
  237. strcpy(result->words[i], temp);
  238. LOG("Freeing temp memory\n");
  239. free(temp);*/
  240. }
  241. else if(temp == NULL)
  242. {
  243. LOG("Something went wrong in CREATE_PREFIX\n");
  244. exit(1);
  245. }
  246. else
  247. {
  248. result->words[i] = (char*) malloc(strlen(pref[i])+1);
  249. strcpy(result->words[i], pref[i]);
  250. }
  251. if(result->words[i][0] >= 'A' && result->words[i][0] <= 'Z')
  252. result->words[i][0] += 'A'-'a';
  253. temp = NULL;
  254. }
  255. result->key = prefix_hash(result->words);
  256. LOG("Prefix is:\n");
  257. for(int i = 0; i < MARKOVLVL; i++)
  258. LOG("%d: %s\n", i, result->words[i]);
  259. LOG("Prefix key is:%llu\n", result->key);
  260. //LOG("SUFFIX:%s\n",suff);
  261. result->sl_size = 1;
  262. LOG("Returning result\n");
  263. return result;
  264. }
  265.  
  266. void delete_prefix(prefix ** pref)
  267. {
  268. if((*pref)->suffix_list != NULL)
  269. free_s_list(&((*pref)->suffix_list));
  270. for(int i = 0; i < MARKOVLVL; i++)
  271. {
  272. if((*pref)->words[i] != NULL)
  273. free((*pref)->words[i]);
  274. }
  275. if((*pref) != NULL)
  276. free((*pref));
  277. }
  278.  
  279. void add_delimeter(suffix * suff, int delim_index)
  280. {
  281. char * delim = delimeter_list[delim_index];
  282. if(is_eos_delimeter(delim) == 1)
  283. suff->eos_counter += 1;
  284. suffix * dlist;
  285. dlist = suff->delimeter_list;
  286. if(dlist != NULL)
  287. {
  288. for(int i = 0; i < suff->suff_counter;)
  289. {
  290. if(strcmp(dlist->text, delim) == 0)
  291. {
  292. dlist->suff_counter +=1;
  293. return;
  294. }
  295. i += dlist->suff_counter;
  296. if(dlist->next == NULL)
  297. {
  298. dlist->next = (suffix*) malloc(sizeof(suffix));
  299. dlist = dlist->next;
  300. break;
  301. }
  302. }
  303. }
  304. else
  305. {
  306. dlist = (suffix*) malloc(sizeof(suffix));
  307. }
  308. dlist->next = NULL;
  309. suff->suff_counter += 1;
  310. dlist->suff_counter = 1;
  311. dlist->text = delim;
  312. }
  313.  
  314. void add_suffix(prefix * pref, char * suff, int delim_index, int is_pref_eos)
  315. {
  316. suffix * suf = pref->suffix_list;
  317. for(int i = 0; i < pref->sl_size;)
  318. {
  319. if(strcmp(suf->text, suff) == 0)
  320. {
  321. add_delimeter(suf, delim_index);
  322. suf->suff_counter += 1;
  323. pref->sl_size++;
  324. return;
  325. }
  326. i+=suf->suff_counter;
  327. if( suf->next == NULL)
  328. {
  329. suf->next =(suffix*) malloc(sizeof(suffix));
  330. suf = suf->next;
  331. break;
  332. }
  333. }
  334. pref->sl_size++;
  335. suf->next = NULL;
  336. suf->text = (char*) malloc(strlen(suff)+1);
  337. /*if(is_pref_eos == 1)
  338.   strcpy(suf->text, lower(suff));
  339.   else
  340.   strcpy(suf->text, suff);*/
  341. strcpy(suf->text, suff);
  342. if(suf->text[0] >= 'A' && suf->text[0] <= 'Z')
  343. suf->text[0]+= 'A'-'a';
  344. suf->suff_counter = 1;
  345. }
  346.  
  347. suffix * get_random_suffix(suffix * suffix_list, int sl_size)
  348. {
  349. if(suffix_list == NULL ||sl_size == 0)
  350. return NULL;
  351. int randint = rand() % sl_size;
  352. suffix * suf = suffix_list;
  353. for(int i = 0; i < sl_size;)
  354. {
  355. randint -= suf->suff_counter;
  356. if(randint < 0 || suf->next == NULL)
  357. break;
  358. }
  359. return suf;
  360. }
  361.  
  362. prefix empty_prefix = {0,NULL,0,{NULL,NULL,NULL}};
  363.  
  364. typedef struct bt_node
  365. {
  366. struct bt_node * left;
  367. struct bt_node * right;
  368. //bt_node * parent;
  369. //ulongint key;
  370. prefix * value;
  371. } bt_node;
  372.  
  373. bt_node * bt_add(bt_node * root, char * pref[MARKOVLVL], ulongint pref_hash, char * suff)
  374. {
  375. LOG("IN BT_ADD FUNCTION\n");
  376. if(root == NULL)
  377. {
  378. root = (bt_node*) malloc(sizeof(bt_node));
  379. root->left = NULL;
  380. root->right = NULL;
  381. //root->parent = NULL;
  382. //root->key = pref_hash;
  383. LOG("Going to CREATE_PREFIX\n");
  384. root->value = create_prefix(pref, suff);
  385. }
  386. else if(pref_hash < root->value->key)
  387. {
  388. root->left = bt_add(root->left, pref, pref_hash, suff);
  389. }
  390. else if(pref_hash > root->value->key)
  391. {
  392. root->right = bt_add(root->right, pref, pref_hash, suff);
  393. }
  394. else if(pref_hash == root->value->key)
  395. {
  396. LOG("Going to ADD_SUFFIX\n");
  397. add_suffix(root->value, remove_delimeter(suff), get_delimeter_index(suff), is_eos_delimeter(get_delimeter(pref[MARKOVLVL-1])));
  398. }
  399. return root;
  400. }
  401.  
  402. bt_node * bt_search(bt_node * root, ulongint key)
  403. {
  404. if(root!=NULL) LOG("inc:%llu cur:%llu >?:%d\n",key, root->value->key,key > root->value->key);
  405. if(root == NULL)
  406. return NULL;
  407. else if(key < root->value->key)
  408. return bt_search(root->left, key);
  409. else if(key > root->value->key)
  410. return bt_search(root->right,key);
  411. else if(key == root->value->key)
  412. return root;
  413. return NULL;
  414. }
  415.  
  416. void bt_delete(bt_node ** root)
  417. {
  418. if((*root)->left!=NULL)
  419. bt_delete(&((*root)->left));
  420. else if((*root)->right!=NULL)
  421. bt_delete(&((*root)->right));
  422. else
  423. free(*root);
  424. }
  425.  
  426. bool xor(bool a, bool b)
  427. {
  428. return (a&&!b)||(!a&&b);
  429. }
  430.  
  431. prefix * bt_get_random_prefix(bt_node * root, ulongint rnd)
  432. {
  433. if(root->left == NULL && root->right == NULL)
  434. return root->value;
  435. //else if((root->key - rnd) == rnd || (root->key == rnd))
  436. // return root->value;
  437. else if(root->right != NULL && (rnd - rand() < rnd || root->left == NULL) )
  438. return bt_get_random_prefix(root->right, rnd);
  439. else
  440. return bt_get_random_prefix(root->left, rnd);
  441. }
  442.  
  443. char * read_word(FILE * fd)
  444. {
  445. int max_size = 5, cur_size = 0;
  446. char * result =(char *) malloc(max_size*sizeof(char));
  447. char ch = fgetc(fd);
  448. while(ch != EOF && ch != '\n' && ch != ' ' && ch != '\t')
  449. {
  450. if(cur_size >= max_size)
  451. {
  452. max_size *= 2;
  453. result = (char * ) realloc(result, max_size*sizeof(char));
  454. }
  455. result[cur_size] = ch;
  456. cur_size++;
  457. ch = fgetc(fd);
  458. }
  459. if(cur_size == 0)
  460. {
  461. free(result);
  462. return NULL;
  463. }
  464. else
  465. {
  466. result = (char*) realloc(result, ((cur_size+1)*sizeof(char)));
  467. result[cur_size] = '\0';
  468. }
  469. return result;
  470. }
  471.  
  472. bt_node * generate_tree(FILE * fd)
  473. {
  474. bt_node * root = NULL;
  475. char * pref[MARKOVLVL];
  476. char * suf;
  477. LOG("Initial fd reading\n");
  478. for(int i = 0; i < MARKOVLVL; i++)
  479. {
  480. pref[i] = read_word(fd);
  481. if(pref[i] == NULL)
  482. {
  483. LOG("Input file reading error. Exiting");
  484. exit(1);
  485. return NULL;
  486. }
  487. }
  488. while(1)
  489. {
  490. LOG("Reading suffix\n");
  491. suf = read_word(fd);
  492. LOG("Suffix is %s\n", suf);
  493. if(suf == NULL)
  494. {
  495. if(feof(fd))
  496. break;
  497. else
  498. continue;
  499. }
  500. LOG("Adding suffix to tree\n");
  501. root = bt_add(root, pref, prefix_hash(pref), suf);
  502. LOG("Suffix was succesfully added\n");
  503. LOG("Freeing pref[0] memory\n");
  504. free(pref[0]);
  505. pref[0] = NULL;
  506. for(int i = 0; i < MARKOVLVL-1; i++)
  507. {
  508. pref[i] = pref[i+1];
  509. }
  510. /*for(int i = 0; i < MARKOVLVL; i++)
  511. LOG("PREFIX[%d]: %s\n",i,pref[i]);*/
  512. pref[MARKOVLVL-1] = suf;
  513. }
  514. for(int i = 0; i < MARKOVLVL; i++)
  515. if(pref[i] != NULL)
  516. free(pref[i]);
  517. free(suf);
  518. return root;
  519. }
  520.  
  521. void generate_text(bt_node * root, FILE * fd, uint text_size)
  522. {
  523. LOG("IN GENERATE_TEXT FUNCTION\n");
  524. if(root == NULL)
  525. return;
  526. char * pref[MARKOVLVL];
  527. LOG("Getting random prefix\n");
  528. prefix * cur_pref = bt_get_random_prefix(root, rand());
  529. pref[0] = cur_pref->words[0];
  530. fprintf(fd, "%c%s ", pref[0][0]-'a'+'A', pref[0]+1);
  531. for(int i = 1; i < MARKOVLVL; i++)
  532. {
  533. pref[i] = cur_pref->words[i];
  534. fprintf(fd, "%s ", pref[i]);
  535. }
  536. LOG("Getting random suffix\n");
  537. suffix * t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
  538. LOG("Got random suffix\n");
  539. char * suf = t_suf->text;
  540. char * delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
  541. uint ls_counter = MARKOVLVL+1; //current line size counter
  542. uint txt_counter = MARKOVLVL+1; //current text size counter
  543. uint sent_counter = MARKOVLVL+1;
  544. fprintf(fd, "%s ", suf);
  545. uint eos_flag = 2;
  546. ulongint key;
  547. char t_char;
  548. while(txt_counter < text_size && txt_counter < MAXTEXTSIZE)
  549. {
  550. if(ls_counter > AVERAGELINESIZE)
  551. {
  552. fprintf(fd, "\b\n");
  553. ls_counter = 1;
  554. }
  555. for(int i = 0; i < MARKOVLVL-1; i++)
  556. {
  557. pref[i] = pref[i+1];
  558. }
  559. pref[MARKOVLVL-1] = suf;
  560. LOG("Searching for prefix\n");
  561. key = prefix_hash(pref);
  562. LOG("Prefix key is: %llu\n", key);
  563. cur_pref = (bt_search(root, key))->value;
  564. if(cur_pref == NULL)
  565. {
  566. LOG("Something went wrong");
  567. exit(1);
  568. }
  569. LOG("Getting random suffix\n");
  570. t_suf = get_random_suffix(cur_pref->suffix_list, cur_pref->sl_size);
  571. suf = t_suf->text;
  572. if(sent_counter > AVERAGESENTENCESIZE)
  573. {
  574. delim = delimeter_list[rand()%3 +2];
  575. sent_counter = 0;
  576. eos_flag = 0;
  577. }
  578. else
  579. {
  580. delim = (get_random_suffix(t_suf->delimeter_list, t_suf->suff_counter))->text;
  581. if(is_eos_delimeter(delim) == 1)
  582. {
  583. sent_counter = 0;
  584. eos_flag = 0;
  585. }
  586. else
  587. eos_flag++;
  588. }
  589. t_char = suf[0];
  590. if(eos_flag == 1)
  591. t_char -= 'a'-'A';
  592. if(strcmp(delim, " ") == 0)
  593. fprintf(fd, "%c%s ",t_char, suf+1);
  594. else
  595. fprintf(fd, "%c%s%s ",t_char, suf+1, delim);
  596. txt_counter++;
  597. ls_counter++;
  598. sent_counter++;
  599. LOG("Prefix was:\n");
  600. for(int i = 0; i < MARKOVLVL; i++)
  601. LOG("%d: %s\n", i, pref[i]);
  602. LOG("Suffix was:\n%s\n",suf);
  603. }
  604.  
  605. }
  606.  
  607. int main(void)
  608. {
  609. /*settings[MARKOVLVL] = 3;
  610.   settings[HASHSIZE] = 10000;
  611.   settings[MAXTEXTSIZE] = 10000;
  612.   settings[AVERAGESENTENCESIZE] = 7;*/
  613. //char * somestr[MARKOVLVL] = {"some","shitty","string"};
  614. //prefix * pref = create_prefix(somestr, "wow");
  615. // add_suffix(pref, "wow");
  616. // add_suffix(pref, "notwow");
  617. // suffix* suff = pref->suffix_list;
  618. // while(suff != NULL)
  619. // {
  620. // LOG("%s\n",suff->text);
  621. // if(suff->next == NULL)
  622. // break;
  623. // else
  624. // suff = suff->next;
  625. // }
  626. // delete_prefix(&pref);
  627. srand(time(NULL));
  628. /*FILE * fd = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\input_01.txt", "r");
  629.   FILE * fd1 = fopen("C:\\Users\\sai\\Documents\\Visual Studio 2010\\Projects\\markov_chain\\markov_chain\\output_01.txt", "w");
  630.   if(fd == NULL)
  631.   {
  632.   LOG("Couldn\'t open file");
  633.   getchar();
  634.   exit(1);
  635.   }*/
  636. LOG("Going to tree building\n");
  637. bt_node * root = generate_tree(stdin);
  638. LOG("Going to text generation\n");
  639. FILE * output = fopen("./generated_text_02","w");
  640. if(output == NULL)
  641. {
  642. LOG("Couldn\'t open file");
  643. exit(1);
  644. }
  645. generate_text(root, output, 100);
  646. bt_delete(&root);
  647. fclose(output);
  648. /*char * str = (char *) malloc(7);
  649.   scanf("%s",str);
  650.   printf("%s\n",str);
  651.   str = lower(str);
  652.   printf("%s\n", str);
  653.   getchar();*/
  654. return 0;
  655. }
  656.  
Not running #stdin #stdout 0s 0KB
stdin
Standard input is empty
stdout
Standard output is empty