fork download
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
  11.  
  12. static const uint32_t small_rus_io = 0x451;
  13. static const uint32_t capital_rus_io = 0x401;
  14.  
  15. static const uint32_t small_rus_ie = 0x435;
  16. static const uint32_t capital_rus_ie = 0x415;
  17.  
  18. class utf8_iterator final
  19. {
  20. public:
  21. explicit utf8_iterator(const char* text)
  22. : text_(text)
  23. , current_(text)
  24. , bytes_count_(text ? get_bytes_count(*text) : 0)
  25. , code_point_(0)
  26. {
  27. }
  28.  
  29. const char* first_byte() const
  30. {
  31. return current_;
  32. }
  33.  
  34. size_t bytes_count() const
  35. {
  36. return bytes_count_;
  37. }
  38.  
  39. bool has_char() const
  40. {
  41. return current_ && current_[0] != 0;
  42. }
  43.  
  44. uint32_t code_point() const
  45. {
  46. if (code_point_ != 0)
  47. return code_point_;
  48.  
  49. switch (bytes_count_)
  50. {
  51. case 1:
  52. code_point_ = *current_;
  53. break;
  54. case 2:
  55. code_point_ = ((current_[0] & 0x1F) << 6) | (current_[1] & 0x3F);
  56. break;
  57. default:
  58. //TODO: not implemented yet
  59. ;
  60. }
  61.  
  62. return code_point_;
  63. }
  64.  
  65. void operator++()
  66. {
  67. if (!has_char())
  68. return;
  69.  
  70. current_ += bytes_count_;
  71. bytes_count_ = get_bytes_count(*current_);
  72. code_point_ = 0;
  73. }
  74.  
  75. private:
  76. static size_t get_bytes_count(char first_char)
  77. {
  78. auto c = *reinterpret_cast<unsigned char*>(&first_char);
  79. if (c == 0) return 0;
  80. if (c <= 0x7F) return 1;
  81. if ((c & 0xE0) == 0xC0) return 2;
  82. if ((c & 0xF0) == 0xE0) return 3;
  83. if ((c & 0xF8) == 0xF0) return 4;
  84. if ((c & 0xFC) == 0xF8) return 5;
  85. if ((c & 0xFE) == 0xFC) return 6;
  86. return 0;
  87. }
  88.  
  89. private:
  90. const char* text_;
  91. const char* current_;
  92. size_t bytes_count_;
  93. mutable uint32_t code_point_;
  94. };
  95.  
  96. bool is_english_char(const utf8_iterator& it)
  97. {
  98. return it.bytes_count() == 1 && std::isalpha(it.first_byte()[0]);
  99. }
  100.  
  101. bool is_russian_char(const utf8_iterator& it)
  102. {
  103. const auto code = it.code_point();
  104. return (code >= 0x410 && code <= 0x44F) || (code == capital_rus_io || code == small_rus_io);
  105. }
  106.  
  107. void english_to_lower_and_append(const utf8_iterator& it, std::string& out)
  108. {
  109. out += std::tolower(it.first_byte()[0]);
  110. }
  111.  
  112. void russian_to_lower_and_append(const utf8_iterator& it, std::string& out)
  113. {
  114. const auto code = it.code_point();
  115. auto new_code = code;
  116.  
  117. if (code >= 0x410 && code <= 0x42F)
  118. new_code += (0x42F - 0x410 + 1);
  119. else if (code == 0x401)
  120. new_code = 0x451;
  121.  
  122.  
  123. if (new_code != code)
  124. {
  125. out += (new_code >> 6) | 0xC0;
  126. out += (new_code & 0x3F) | 0x80;
  127. }
  128. else
  129. {
  130. out += it.first_byte()[0];
  131. out += it.first_byte()[1];
  132. }
  133. }
  134.  
  135. std::string get_word(utf8_iterator& current)
  136. {
  137. using char_detector_t = std::function<bool(const utf8_iterator&)>;
  138. char_detector_t is_allowable_char = nullptr;
  139.  
  140. using char_converter_t = std::function<void(const utf8_iterator&, std::string&)>;
  141. char_converter_t to_lower_and_append = nullptr;
  142.  
  143. while (current.has_char())
  144. {
  145. if (is_english_char(current))
  146. {
  147. is_allowable_char = is_english_char;
  148. to_lower_and_append = english_to_lower_and_append;
  149. break;
  150. }
  151. else if (is_russian_char(current))
  152. {
  153. is_allowable_char = is_russian_char;
  154. to_lower_and_append = russian_to_lower_and_append;
  155. break;
  156. }
  157.  
  158. // looking for char
  159. ++current;
  160. }
  161.  
  162. std::string word;
  163.  
  164. if (!is_allowable_char)
  165. return word;
  166.  
  167. do
  168. {
  169. to_lower_and_append(current, word);
  170. ++current;
  171. }
  172. while (is_allowable_char(current));
  173.  
  174. return word;
  175. }
  176.  
  177. struct utf8_comparer final
  178. {
  179. public:
  180. bool operator()(const char* left, const char* right) const
  181. {
  182. utf8_iterator left_iter(left);
  183. const bool left_is_russian = is_russian_char(left_iter);
  184.  
  185. utf8_iterator right_iter(right);
  186. const bool right_is_russian = is_russian_char(right_iter);
  187.  
  188. if (left_is_russian && right_is_russian)
  189. {
  190. while (left_iter.has_char() && right_iter.has_char())
  191. {
  192. const auto left_code = left_iter.code_point();
  193. const auto right_code = right_iter.code_point();
  194.  
  195. if (left_code == small_rus_io && right_code != left_code)
  196. return right_code >= small_rus_ie;
  197.  
  198. if (left_code == right_code)
  199. {
  200. ++left_iter;
  201. ++right_iter;
  202. }
  203. else
  204. {
  205. return left_code < right_code;
  206. }
  207. }
  208.  
  209. return !left_iter.has_char() && right_iter.has_char();
  210. }
  211.  
  212. return strcmp(left, right) < 0;
  213. }
  214. };
  215.  
  216. #ifdef TESTING_ENABLED
  217. void test_is_russian()
  218. {
  219. assert(is_russian_char(utf8_iterator("а")));
  220. assert(is_russian_char(utf8_iterator("А")));
  221. assert(is_russian_char(utf8_iterator("ё")));
  222. assert(is_russian_char(utf8_iterator("Ё")));
  223. assert(is_russian_char(utf8_iterator("я")));
  224. assert(is_russian_char(utf8_iterator("Я")));
  225. assert(!is_russian_char(utf8_iterator("q")));
  226. assert(!is_russian_char(utf8_iterator("1")));
  227. }
  228.  
  229. bool is_equal_as_lower(const char* left, const char* right)
  230. {
  231. std::string buf;
  232. russian_to_lower_and_append(utf8_iterator(left), buf);
  233. return strcmp(buf.c_str(), right) == 0;
  234. }
  235.  
  236. void test_russian_to_lower()
  237. {
  238. assert(is_equal_as_lower("А", "а"));
  239. assert(is_equal_as_lower("Я", "я"));
  240. assert(is_equal_as_lower("ф", "ф"));
  241. assert(is_equal_as_lower("Ё", "ё"));
  242. assert(is_equal_as_lower("ё", "ё"));
  243. }
  244.  
  245. void test_comparer()
  246. {
  247. assert(utf8_comparer()("a", "b"));
  248. assert(utf8_comparer()("aa", "ab"));
  249. assert(!utf8_comparer()("a", "a"));
  250. assert(utf8_comparer()("б", "в"));
  251. assert(utf8_comparer()("е", "ё"));
  252. assert(utf8_comparer()("ё", "ж"));
  253. assert(utf8_comparer()("b", "б"));
  254. }
  255. #endif
  256.  
  257. int main(int argc, char* argv[])
  258. {
  259. #ifdef TESTING_ENABLED
  260. test_is_russian();
  261. test_russian_to_lower();
  262. test_comparer();
  263. #endif
  264.  
  265. if (argc != 3)
  266. {
  267. std::cerr << "words counter\n";
  268. std::cerr << "usage:\n";
  269. std::cerr << " freqs in_file out_file\n";
  270. return EXIT_FAILURE;
  271. }
  272.  
  273. const auto in_file_name = argv[1];
  274.  
  275. std::ifstream in(in_file_name, std::ios::binary);
  276. if (!in)
  277. {
  278. std::cerr << "can't open " << in_file_name << '\n';
  279. return EXIT_FAILURE;
  280. }
  281.  
  282. std::unordered_map<std::string, size_t> count_by_word;
  283.  
  284. std::string line;
  285. while (std::getline(in, line))
  286. {
  287. auto it = utf8_iterator(line.c_str());
  288. while (it.has_char())
  289. {
  290. const auto word = get_word(it);
  291. if (!word.empty())
  292. ++count_by_word[word];
  293. }
  294. }
  295.  
  296. const auto out_file_name = argv[2];
  297.  
  298. std::ofstream out(out_file_name, std::ios::binary);
  299. if (!out)
  300. {
  301. std::cerr << "can't open " << out_file_name << '\n';
  302. return EXIT_FAILURE;
  303. }
  304.  
  305. using sorted_words_t = std::set<const char*, utf8_comparer>;
  306. std::map<size_t, sorted_words_t> sorted_words_by_count;
  307.  
  308. for (auto& it : count_by_word)
  309. {
  310. const auto count = it.second;
  311. const auto word = it.first.c_str();
  312. sorted_words_by_count[count].insert(word);
  313. }
  314.  
  315. for (auto it = sorted_words_by_count.rbegin(), end = sorted_words_by_count.rend(); it != end; ++it)
  316. {
  317. const auto count = it->first;
  318. for (auto word : it->second)
  319. out << count << ' ' << word << '\n';
  320. }
  321.  
  322. return EXIT_SUCCESS;
  323. }
  324.  
Runtime error #stdin #stdout #stderr 0s 16080KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
words counter
usage:
  freqs in_file out_file