#include <iostream>
#include <fstream>
#include <vector>
#include <iomanip>
#include <cstring> // memset()
extern "C"{
#include "f_size/f_size.h" // x-platform routine for getting a filesize
}
using namespace std;
// Parsing strategy: 1 = count the lines first, then fill pre-sized line
// vectors (two passes); 0 = single pass with totally dynamic insertions.
#define PARSE_TWICE 1
// 1 = time each of the two passes individually, 0 = time them as one step
// (only consulted when PARSE_TWICE is enabled).
#define STEPPED_TIMING 1
// Name of the csv file that main() loads and parses.
const char* FNAME = "test1.csv";
/**
 * State shared between main() and the timing callbacks; it is handed to
 * them through their void* argument.
 *
 * The struct contains a std::vector, so it must never be cleared with
 * memset(). The default constructor puts every scalar member into a
 * well-defined "nothing loaded yet" state instead.
 */
struct CallbackData
{
    // mostly used for loading a csv file in mem
    char *buf;          // NUL-terminated file contents (allocated with new[]), or NULL
    const char *fname;  // name of the csv file to load
    int64_t fsize;      // size of the file as reported by f_size()
    int64_t n;          // count of chars actually read into buf
    // mostly used for parsing loaded data
    std::vector< std::vector<std::string> > lines;  // parsed lines, each one a vector of fields
    size_t nlines;      // count of lines found in buf

    CallbackData()
        : buf( NULL ), fname( NULL ), fsize( 0 ), n( 0 ), lines(), nlines( 0 )
    {
    }
};
/*********************************************************//**
 * Report the size of the file named fname, in bytes, taken as
 * the read position of a stream opened in binary mode and
 * positioned at end-of-file. Yields -1 when the file cannot
 * be opened.
 * N O T E : This is a fallback for compilers on which the more
 * sophisticated f_size() does not work
 * (see: #include "f_size/f_size.h").
 *************************************************************
 */
std::streampos file_size( const char* fname )
{
    // binary-mode, already positioned at the end
    std::ifstream in( fname, std::ios::ate | std::ios::binary );
    if ( !in.is_open() ) {
        return std::streampos( -1 );
    }
    return in.tellg();  // the stream is closed by its destructor
}
/*********************************************************//**
 * Run callback(userdata) and measure, by means of gettimeofday(),
 * how many wall-clock seconds the call took.
 * A callback result of 0 is treated as failure, in which case
 * -1.0 is returned instead of a duration.
 *************************************************************
 */
#include <sys/time.h>
double time_it( void *userdata, int (*callback)(void *userdata) )
{
    struct timeval before;
    struct timeval after;

    gettimeofday( &before, NULL );
    /* Code we want timed here */
    const int succeeded = (*callback)( userdata );
    if ( !succeeded ) {
        return -1.0;
    }
    gettimeofday( &after, NULL );

    // seconds plus microseconds, folded into fractional seconds
    const double tstart = before.tv_sec + before.tv_usec / 1000000.0;
    const double tend = after.tv_sec + after.tv_usec / 1000000.0;
    return tend - tstart;
}
/*********************************************************//**
 * Run callback(userdata) and measure, by means of clock(), how
 * many seconds of cpu-time the call took.
 * A callback result of 0 is treated as failure, in which case
 * -1.0 is returned instead of a duration.
 * N O T E : Depending on the implementation, on Windows clock()
 * MOST PROBABLY measures wall-clock time rather than cpu-time.
 *************************************************************
 */
#include <ctime>
double clock_it( void *userdata, int (*callback)(void *userdata) )
{
    const clock_t started = clock();
    /* Code we want timed here */
    const int succeeded = (*callback)( userdata );
    if ( !succeeded ) {
        return -1.0;
    }
    const clock_t spent = clock() - started;
    return ((double) spent) / CLOCKS_PER_SEC;
}
/*********************************************************//**
 * x-platform alternative to Windows system( "pause" );
 * Prints a prompt, then swallows characters from standard input
 * up to and including the next newline (or until end-of-file).
 *************************************************************
 */
void press_enter( void )
{
    std::cout << "press ENTER... ";
    std::cout.flush();

    std::cin.clear();   // drop any sticky error/eof state before reading
    for (;;) {
        const int ch = std::cin.get();
        if ( '\n' == ch || EOF == ch ) {
            break;
        }
    }
}
/*********************************************************//**
* Timing callback function for reading a file into a buffer.
* All required variables are passed and/or get modified via
* the data pointer.
*************************************************************
*/
int cb_read_file_to_buf( void *data )
{
CallbackData *d = (CallbackData *) data;
ifstream f;
// get filesize
d->fsize = f_size( d->fname );
if ( d->fsize < 1 ) {
return 0;
}
f.open( d->fname ); // text-mode
if ( !f.is_open() ) {
return 0;
}
d->buf = new char[ 1 + d->fsize ];
f.read( d->buf, d->fsize );
d->n = f.gcount(); // actual bytes read
f.close();
d->buf[d->n] = d->buf[d->fsize] = '\0'; // NUL terminte buf
return 1;
}
/*********************************************************//**
* Timing callback function for parsing the csv buf once, in
* order to convert it into a vector of lines, where each line
* is a vector of fields (strings).
* All required variables are passed and/or get modified via
* the data pointer.
* N O T E : This function is used when !PARSED_TWICE
*************************************************************
*/
int cb_parse_csvbuf_once( void *data )
{
CallbackData *d = (CallbackData *) data;
// if ( !d->buf ) {
// return 0;
// }
size_t l = 0;
char* cp = d->buf;
char* pre = d->buf;
vector<string> field;
while ( *cp )
{
if ( ',' == *cp ) {
*cp = '\0';
field.push_back( pre );
pre = cp + 1;
}
else if ( '\n' == *cp ) {
*cp = '\0';
field.push_back( pre );
d->lines.push_back( field );
field.clear();
pre = cp + 1;
l++;
}
cp++;
}
d->nlines = l;
return 1;
}
/*********************************************************//**
* Timing callback function for counting lines in the csv buf.
* All required variables are passed and/or get modified via
* the data pointer.
* N O T E : This function is used when PARSED_TWICE && STEPPED_TIMING
*************************************************************
*/
static inline int cb_parse_csvbuf_pass1( void *data )
{
CallbackData *d = (CallbackData *) data;
// if ( !d->buf ) {
// return 0;
// }
d->nlines = 0;
for (char* cp = d->buf; *cp; cp++) {
if ( '\n' == *cp )
(d->nlines)++;
}
return 1;
}
/*********************************************************//**
* Timing callback function for converting the csv buf into a
* vector of lines, where each line is a vector of fields (strings).
* Contrary to the function cb_parse_csvbuf_once() which allocates
* all vectors progressively, this one assumes that the function
* cb_parse_csvbuf_pass1() has been already called (and thus has
* already set data->nlines), in order to pre-allocate the line
* vectors.
* All required variables are passed and/or get modified via
* the data pointer.
* N O T E : This function is used when PARSED_TWICE && STEPPED_TIMING
*************************************************************
*/
static inline int cb_parse_csvbuf_pass2( void *data )
{
CallbackData *d = (CallbackData *) data;
// if ( !d->buf || d->nlines < 1 ) {
// return 0;
// }
d->lines.resize( d->nlines ); // pre-allocate the line vectors
size_t l = 0; // lines counter
char* cp = d->buf; // bytes counter
char* pre = d->buf; // start of current cstring in buf
while ( *cp )
{
if ( ',' == *cp ) {
*cp = '\0';
d->lines[l].push_back( pre );
pre = cp + 1;
}
else if ( '\n' == *cp ) {
*cp = '\0';
d->lines[l].push_back( pre );
pre = cp + 1;
l++;
}
cp++;
}
return 1;
}
/*********************************************************//**
 * Timing callback that runs both parsing passes back to back:
 * cb_parse_csvbuf_pass1() to count the lines in the csv buf,
 * then cb_parse_csvbuf_pass2() to convert the buf into a vector
 * of lines, where each line is a vector of fields (strings).
 * The second pass is skipped when the first one fails.
 * Returns 1 when both passes succeed, 0 otherwise.
 * N O T E : This function is used when PARSE_TWICE && !STEPPED_TIMING
 *************************************************************
 */
int cb_parse_csvbuf_twice( void *data )
{
    if ( !cb_parse_csvbuf_pass1( data ) ) {
        return 0;
    }
    return cb_parse_csvbuf_pass2( data ) ? 1 : 0;
}
/*********************************************************//**
 * Print the given line vectors to standard output: first the
 * total count of lines, then for every line a header showing
 * its 1-based number and its count of fields, followed by the
 * fields themselves, one per output line.
 * The stream is flushed once, after everything has been
 * written, instead of after every single field.
 *************************************************************
 */
void lines_print( const std::vector< std::vector<std::string> >& lines )
{
    typedef std::vector<std::string> Row;

    std::cout << "lines: " << lines.size() << '\n';
    for ( size_t l = 0; l < lines.size(); l++ ) {
        const Row& row = lines[l];
        std::cout << "--- line: " << l + 1 << " (" << row.size() << " fields) ----\n";
        for ( size_t f = 0; f < row.size(); f++ ) {
            std::cout << row[f] << '\n';
        }
    }
    std::cout.flush();
}
/*********************************************************//**
 * Load the csv file named by FNAME into memory, parse it into
 * lines & fields using the strategy selected at compile time
 * via PARSE_TWICE / STEPPED_TIMING, and report the wall-clock
 * time spent on every step.
 * Returns 0 on success, or 1 when loading or parsing fails.
 *************************************************************
 */
int main()
{
    // CallbackData holds a std::vector, so it must not be wiped with
    // memset(); the scalar members are initialised one by one instead.
    CallbackData cd;
    cd.buf = NULL;
    cd.fname = FNAME;
    cd.fsize = 0;
    cd.n = 0;
    cd.nlines = 0;

    double elapsed0 = 0.0;  // loading
    double elapsed1 = 0.0;  // pass-1, or the whole parse when it is timed as one step
    double elapsed2 = 0.0;  // pass-2 (stays 0.0 unless passes are timed individually)

    std::cout.precision(5);
    std::cout << std::fixed;

    /* Read csv file into memory buffer. */
    std::cout << "*** " << cd.fname << " ***\n\n";
    std::cout << "Loading... ";
    std::cout.flush();
    elapsed0 = time_it( (void *)&cd, cb_read_file_to_buf );
    if ( -1.0 == elapsed0 ) {
        goto exit_failure;
    }
    std::cout << elapsed0 << " secs\n";
    std::cout << "(" << cd.n << " text-bytes | " << cd.fsize << " binary-bytes)\n\n";

    /* Parse buf into lines & fields. */
    std::cout << "Parsing... " << std::endl;
    {
#if PARSE_TWICE
#if STEPPED_TIMING
        std::cout << "pass-1... ";
        std::cout.flush();
        elapsed1 = time_it( (void *)&cd, cb_parse_csvbuf_pass1 );
        if ( -1.0 == elapsed1 ) {
            goto exit_failure;
        }
        std::cout << elapsed1 << " secs\n";

        std::cout << "pass-2... ";
        std::cout.flush();
        elapsed2 = time_it( (void *)&cd, cb_parse_csvbuf_pass2 );
        if ( -1.0 == elapsed2 ) {
            goto exit_failure;
        }
        std::cout << elapsed2 << " secs" << std::endl;
        std::cout << "Parsed in: " << elapsed1 + elapsed2 << " secs (" << cd.nlines << " lines)\n\n";
#else
        std::cout << "2 passes...";
        std::cout.flush();
        elapsed1 = time_it( (void *)&cd, cb_parse_csvbuf_twice );
        if ( -1.0 == elapsed1 ) {
            goto exit_failure;
        }
        std::cout << elapsed1 << " secs (" << cd.nlines << " lines)\n\n";
#endif
#else
        std::cout << "1 pass... ";
        std::cout.flush();  // show the prompt before the (possibly long) parse starts
        elapsed1 = time_it( (void *)&cd, cb_parse_csvbuf_once );
        if ( -1.0 == elapsed1 ) {
            goto exit_failure;
        }
        std::cout << elapsed1 << " secs (" << cd.nlines << " lines)\n\n";
#endif
    }
    std::cout << "Total: " << elapsed0 + elapsed1 + elapsed2 << " secs (loading + parsing)\n\n";
    //lines_print( cd.lines );

    /* Cleanup and exit. */
    delete[] cd.buf;
    press_enter();
    // cd.lines is destroyed on return, hence the message
    std::cout << "cleaning up, please wait..." << std::endl;
    return 0;

exit_failure:
    std::cerr << "*** error ***" << std::endl;
    std::cout << "(cleaning up, please wait...)" << std::endl;
    delete[] cd.buf;    // no-op when nothing has been loaded (buf is NULL)
    press_enter();
    return 1;
}