// ExternalSorting.cpp : Defines the entry point for the console application.
//
//#include "stdafx.h"
#include <string>
#include <sstream>
#include <iostream>
#include <fstream>
#include <list>
#include <algorithm>
#include <iterator>
#include <vector>
#include <chrono>
using StringContainer = std::list<std::string>;
struct TBuffer
{
private:
void _reload()
{
bEOF = !static_cast<bool>(std::getline(_input_stream, _curString, '\n'));
}
public:
template <class T>
TBuffer(T&& fileName)
:_input_stream(std::forward<T>(fileName))
{
_reload();
}
inline void operator++(int)
{
_reload();
}
~TBuffer()
{
_input_stream.close();
}
std::ifstream _input_stream;
std::string _curString;
bool bEOF = false;
};
using BufferContainer = std::list<TBuffer>;
std::string __merge2Files(const std::string& filename1, const std::string& filename2)
{
std::string output = filename1 + ".0";
if (true)
{
TBuffer buf1(filename1);
TBuffer buf2(filename2);
std::ofstream ofs(output);
while (!buf1.bEOF && !buf2.bEOF)
{
if (buf1._curString < buf2._curString)
{
ofs << buf1._curString << '\n';
buf1++;
}
else
{
ofs << buf2._curString << '\n';
buf2++;
}
}
// write any tail
while (!buf1.bEOF)
{
ofs << buf1._curString;
buf1++;
if (!buf1.bEOF)
ofs << '\n';
}
while (!buf2.bEOF)
{
ofs << buf2._curString;
buf2++;
if (!buf2.bEOF)
ofs << '\n';
}
ofs.close();
}
// delete old files
std::remove(filename1.c_str());
std::remove(filename2.c_str());
return output;
}
std::string __mergeFiles(StringContainer filelist)
{
if (filelist.size() == 2)
{
return __merge2Files(filelist.front(), filelist.back());
}
if (filelist.size() == 1)
{
return filelist.front(); //already merged
}
while (filelist.size() > 1)
{
std::string fileName1 = filelist.front();
filelist.pop_front();
std::string fileName2 = filelist.front();
filelist.pop_front();
filelist.push_back(__merge2Files(fileName1, fileName2));
}
return filelist.front();
}
std::string MergeFiles(StringContainer filelist)
{
return __mergeFiles(std::move(filelist));
}
template<typename Function, typename... Args>
int64_t do_profiling(Function&& _func, const Args&...args)
{
using namespace std::chrono;
auto start = high_resolution_clock::now();
_func(args...);
return duration_cast<milliseconds>(high_resolution_clock::now() - start).count();
}
template <class T>
auto GetFileSize(T&& filename)
{
std::fstream ifs(std::forward<T>(filename), std::fstream::ios_base::in | std::ios::binary | std::fstream::ios_base::ate);
auto size = ifs.tellg();
ifs.close();
return size;
}
int main(int argc, char* argv[])
{
try
{
// Check command line arguments.
if (argc != 5)
{
std::cerr << "Usage: extsort.exe <input file name> <output file name> <memory size> <temp folder>\n";
std::cerr << " For example, try:\n";
std::cerr << " extsort.exe \"c:\\in.txt\" \"c:\\out.txt\" 1000000 \"c:\\temp\\\"\n";
return 1;
}
std::string inFileName = argv[1];
std::string outFilename = argv[2];
unsigned long memorySize = std::stoul(std::string(argv[3]), nullptr, 10);
std::string tempFolder = argv[4];
auto fileSize = GetFileSize(inFileName);
if (!fileSize)
{
std::cout << "Input file empty" << std::endl;
return 0;
}
std::ifstream ifs(inFileName);
StringContainer strList;
StringContainer tempFileNames;
unsigned int bufferLen = 0;
unsigned int tempFileCount = 0;
std::string line;
std::cout << "Start reading from: " << inFileName << std::endl;
int64_t filTime = do_profiling([&]()
{
while (true)
{
auto bEOF = !static_cast<bool>(std::getline(ifs, line, '\n'));
if (bEOF || (bufferLen + line.size() > memorySize))
{
strList.sort();
std::ofstream ofs(*tempFileNames.emplace(tempFileNames.end(), tempFolder + "temp." + std::to_string(tempFileCount++)));
std::copy(strList.begin(), strList.end(), std::ostream_iterator<std::string>(ofs, "\n"));
ofs.close();
strList.clear();
bufferLen = 0;
}
if (bEOF)
break;
bufferLen += line.size();
strList.emplace_back(std::move(line));
}
ifs.close();
});
std::cout << "Completed in: " << filTime << " milliseconds" << std::endl;
if (tempFileNames.size() == 1)
{
std::remove(outFilename.c_str());
std::rename(tempFileNames.front().c_str(), outFilename.c_str());
return 0;
}
std::cout << "Start merging and write to: " << outFilename << std::endl;
filTime = do_profiling([&]()
{
std::string mergedfilename = MergeFiles(tempFileNames);
std::remove(outFilename.c_str());
std::rename(mergedfilename.c_str(), outFilename.c_str());
});
std::cout << "Completed in: " << filTime << " milliseconds" << std::endl;
}
catch (std::exception& e)
{
std::cerr << "exception: " << e.what() << "\n";
}
return 0;
}