#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
// Unicode code points of the Cyrillic letters IE and IO. IO lies outside
// the contiguous U+0410..U+044F block, so code that handles Russian letters
// has to treat it separately.
static constexpr uint32_t capital_rus_ie = 0x415; // U+0415 CYRILLIC CAPITAL LETTER IE
static constexpr uint32_t small_rus_ie = 0x435;   // U+0435 CYRILLIC SMALL LETTER IE
static constexpr uint32_t capital_rus_io = 0x401; // U+0401 CYRILLIC CAPITAL LETTER IO
static constexpr uint32_t small_rus_io = 0x451;   // U+0451 CYRILLIC SMALL LETTER IO
/// Forward iterator over the characters of a NUL-terminated UTF-8 string.
///
/// The iterator does not own the text: the pointer passed to the constructor
/// must stay valid while the iterator is used. Malformed input is tolerated:
/// such bytes decode to code point 0 and are skipped one lead byte at a time,
/// and the iterator never moves past the terminating NUL.
class utf8_iterator final
{
public:
    /// Positions the iterator on the first character of `text`.
    /// A null `text` yields an iterator for which has_char() is false.
    explicit utf8_iterator(const char* text)
        : current_(text)
        , bytes_count_(text ? get_bytes_count(*text) : 0)
        , code_point_(0)
    {
    }

    /// Pointer to the first byte of the current character.
    const char* first_byte() const
    {
        return current_;
    }

    /// Sequence length announced by the current lead byte, or 0 when the
    /// byte is NUL or cannot start a sequence.
    size_t bytes_count() const
    {
        return bytes_count_;
    }

    /// True while the iterator has not reached the terminating NUL.
    bool has_char() const
    {
        return current_ && current_[0] != 0;
    }

    /// Decoded code point of the current character.
    ///
    /// Returns 0 at the end of the text and for anything that is not a
    /// well-formed sequence of 1 to 4 bytes: a bad lead byte, a missing
    /// continuation byte, an overlong encoding, a surrogate or a value above
    /// U+10FFFF. The result is cached until the iterator is advanced.
    uint32_t code_point() const
    {
        if (code_point_ != 0)
            return code_point_;
        if (bytes_count_ == 0 || bytes_count_ > 4)
            return 0;

        const unsigned char lead = static_cast<unsigned char>(current_[0]);
        if (bytes_count_ == 1)
        {
            code_point_ = lead;
            return code_point_;
        }

        // The lead byte carries 5, 4 or 3 payload bits for 2-, 3- and 4-byte
        // sequences respectively.
        uint32_t code = lead & (0xFFu >> (bytes_count_ + 1));
        for (size_t i = 1; i < bytes_count_; ++i)
        {
            // Stopping at the first non-continuation byte also stops at the
            // terminating NUL, so nothing past the end of the text is read.
            if (!is_continuation(current_[i]))
                return 0;
            code = (code << 6) | (static_cast<unsigned char>(current_[i]) & 0x3Fu);
        }

        // Smallest value that genuinely needs this many bytes; anything
        // below it is an overlong encoding.
        const uint32_t min_code = bytes_count_ == 2 ? 0x80u
                                : bytes_count_ == 3 ? 0x800u
                                                    : 0x10000u;
        if (code < min_code || code > 0x10FFFFu)
            return 0;
        if (code >= 0xD800u && code <= 0xDFFFu)
            return 0;

        code_point_ = code;
        return code_point_;
    }

    /// Moves to the next character; does nothing at the end of the text.
    void operator++()
    {
        if (!has_char())
            return;
        // Always consume the lead byte, even an invalid one, so that callers
        // looping on has_char() are guaranteed to make progress.
        ++current_;
        // Consume only the continuation bytes that are really present; this
        // resynchronises after a truncated sequence and never skips the NUL.
        for (size_t i = 1; i < bytes_count_ && is_continuation(*current_); ++i)
            ++current_;
        bytes_count_ = get_bytes_count(*current_);
        code_point_ = 0;
    }

private:
    /// True for bytes of the form 10xxxxxx.
    static bool is_continuation(char byte)
    {
        return (static_cast<unsigned char>(byte) & 0xC0) == 0x80;
    }

    /// Sequence length encoded in a lead byte; 0 for NUL and for bytes that
    /// cannot start a sequence.
    static size_t get_bytes_count(char first_char)
    {
        const auto c = static_cast<unsigned char>(first_char);
        if (c == 0) return 0;
        if (c <= 0x7F) return 1;
        if ((c & 0xE0) == 0xC0) return 2;
        if ((c & 0xF0) == 0xE0) return 3;
        if ((c & 0xF8) == 0xF0) return 4;
        if ((c & 0xFC) == 0xF8) return 5;
        if ((c & 0xFE) == 0xFC) return 6;
        return 0;
    }

private:
    const char* current_;
    size_t bytes_count_;
    // Cache for code_point(); 0 means "not decoded yet".
    mutable uint32_t code_point_;
};
/// Tells whether the iterator's current character is a single-byte
/// character that std::isalpha classifies as a letter.
bool is_english_char(const utf8_iterator& it)
{
    // Multi-byte characters (and the end of the text) are never English.
    if (it.bytes_count() != 1)
        return false;
    const char symbol = *it.first_byte();
    return std::isalpha(symbol) != 0;
}
/// Tells whether the iterator's current character is a Russian letter:
/// any code point in U+0410..U+044F, or the letter IO in either case.
bool is_russian_char(const utf8_iterator& it)
{
    const uint32_t symbol = it.code_point();
    // IO is encoded outside the contiguous alphabet range.
    if (symbol == small_rus_io || symbol == capital_rus_io)
        return true;
    return 0x410 <= symbol && symbol <= 0x44F;
}
/// Appends the std::tolower form of the iterator's first byte to `out`.
void english_to_lower_and_append(const utf8_iterator& it, std::string& out)
{
    const char source = *it.first_byte();
    out += static_cast<char>(std::tolower(source));
}
void russian_to_lower_and_append(const utf8_iterator& it, std::string& out)
{
const auto code = it.code_point();
auto new_code = code;
if (code >= 0x410 && code <= 0x42F)
new_code += (0x42F - 0x410 + 1);
else if (code == 0x401)
new_code = 0x451;
if (new_code != code)
{
out += (new_code >> 6) | 0xC0;
out += (new_code & 0x3F) | 0x80;
}
else
{
out += it.first_byte()[0];
out += it.first_byte()[1];
}
}
/// Extracts the next word starting at `current` and returns it lower-cased.
///
/// Characters are skipped until the first English or Russian letter. The word
/// then continues for as long as the characters belong to the same alphabet
/// as that first letter. On return `current` points just past the word, or to
/// the end of the text when no letter was found (the result is then empty).
std::string get_word(utf8_iterator& current)
{
    // Plain function pointers are enough here: only free functions are ever
    // stored, so the type erasure of std::function would add an indirect call
    // per character for no benefit.
    using char_detector_t = bool (*)(const utf8_iterator&);
    using char_converter_t = void (*)(const utf8_iterator&, std::string&);

    char_detector_t is_allowable_char = nullptr;
    char_converter_t to_lower_and_append = nullptr;

    // Look for the first letter; it decides the alphabet of the whole word.
    for (; current.has_char(); ++current)
    {
        if (is_english_char(current))
        {
            is_allowable_char = is_english_char;
            to_lower_and_append = english_to_lower_and_append;
            break;
        }
        if (is_russian_char(current))
        {
            is_allowable_char = is_russian_char;
            to_lower_and_append = russian_to_lower_and_append;
            break;
        }
    }

    std::string word;
    if (!is_allowable_char)
        return word; // reached the end of the text without finding a letter

    // The current character is already known to be a letter, hence do/while.
    do
    {
        to_lower_and_append(current, word);
        ++current;
    }
    while (is_allowable_char(current));
    return word;
}
struct utf8_comparer final
{
public:
bool operator()(const char* left, const char* right) const
{
utf8_iterator left_iter(left);
const bool left_is_russian = is_russian_char(left_iter);
utf8_iterator right_iter(right);
const bool right_is_russian = is_russian_char(right_iter);
if (left_is_russian && right_is_russian)
{
while (left_iter.has_char() && right_iter.has_char())
{
const auto left_code = left_iter.code_point();
const auto right_code = right_iter.code_point();
if (left_code == small_rus_io && right_code != left_code)
return right_code >= small_rus_ie;
if (left_code == right_code)
{
++left_iter;
++right_iter;
}
else
{
return left_code < right_code;
}
}
return !left_iter.has_char() && right_iter.has_char();
}
return strcmp(left, right) < 0;
}
};
#ifdef TESTING_ENABLED
// Checks is_russian_char() on the first and last letters of the alphabet,
// on IO in both cases, and on two characters that are not Russian letters.
void test_is_russian()
{
    const char* const russian_samples[] = { "а", "А", "ё", "Ё", "я", "Я" };
    for (const char* sample : russian_samples)
        assert(is_russian_char(utf8_iterator(sample)));

    const char* const other_samples[] = { "q", "1" };
    for (const char* sample : other_samples)
        assert(!is_russian_char(utf8_iterator(sample)));
}
bool is_equal_as_lower(const char* left, const char* right)
{
std::string buf;
russian_to_lower_and_append(utf8_iterator(left), buf);
return strcmp(buf.c_str(), right) == 0;
}
// Checks lower-casing of capital letters, of IO, and that letters which are
// already lower-case come out unchanged.
void test_russian_to_lower()
{
    // Each row is { input, expected lower-case form }.
    static const char* const cases[][2] = {
        { "А", "а" },
        { "Я", "я" },
        { "ф", "ф" },
        { "Ё", "ё" },
        { "ё", "ё" },
    };
    for (const auto& row : cases)
        assert(is_equal_as_lower(row[0], row[1]));
}
void test_comparer()
{
assert(utf8_comparer()("a", "b"));
assert(utf8_comparer()("aa", "ab"));
assert(!utf8_comparer()("a", "a"));
assert(utf8_comparer()("б", "в"));
assert(utf8_comparer()("е", "ё"));
assert(utf8_comparer()("ё", "ж"));
assert(utf8_comparer()("b", "б"));
}
#endif
/// Counts the words of the input file and writes "count word" lines to the
/// output file, highest count first; words with the same count are ordered
/// by utf8_comparer.
///
/// Usage: freqs in_file out_file
/// Returns EXIT_FAILURE on wrong usage, when a file cannot be opened, or when
/// writing the result fails; EXIT_SUCCESS otherwise.
int main(int argc, char* argv[])
{
#ifdef TESTING_ENABLED
    test_is_russian();
    test_russian_to_lower();
    test_comparer();
#endif
    if (argc != 3)
    {
        std::cerr << "words counter\n";
        std::cerr << "usage:\n";
        std::cerr << " freqs in_file out_file\n";
        return EXIT_FAILURE;
    }

    const auto in_file_name = argv[1];
    std::ifstream in(in_file_name, std::ios::binary);
    if (!in)
    {
        std::cerr << "can't open " << in_file_name << '\n';
        return EXIT_FAILURE;
    }

    // Pass 1: count the occurrences of every lower-cased word.
    std::unordered_map<std::string, size_t> count_by_word;
    std::string line;
    while (std::getline(in, line))
    {
        auto it = utf8_iterator(line.c_str());
        while (it.has_char())
        {
            const auto word = get_word(it);
            if (!word.empty())
                ++count_by_word[word];
        }
    }

    const auto out_file_name = argv[2];
    std::ofstream out(out_file_name, std::ios::binary);
    if (!out)
    {
        std::cerr << "can't open " << out_file_name << '\n';
        return EXIT_FAILURE;
    }

    // Pass 2: group the words by count. The sets hold pointers into the keys
    // of count_by_word, which is not modified from here on, so the pointers
    // stay valid.
    using sorted_words_t = std::set<const char*, utf8_comparer>;
    std::map<size_t, sorted_words_t> sorted_words_by_count;
    for (auto& it : count_by_word)
    {
        const auto count = it.second;
        const auto word = it.first.c_str();
        sorted_words_by_count[count].insert(word);
    }

    // Reverse iteration over the map gives the highest counts first.
    for (auto it = sorted_words_by_count.rbegin(), end = sorted_words_by_count.rend(); it != end; ++it)
    {
        const auto count = it->first;
        for (auto word : it->second)
            out << count << ' ' << word << '\n';
    }

    // Flush explicitly so that a write failure (full disk, I/O error) is
    // reported instead of being silently lost when the stream is destroyed.
    out.flush();
    if (!out)
    {
        std::cerr << "can't write " << out_file_name << '\n';
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}