/**********************************************************
* CSV.C
* A simple csv-parser.
*
* It was written due to a post at a popular Greek forum:
* http://w...content-available-to-author-only...a.gr/topic/530662-coding-challenge/
*
* The parser supports two parsing-models:
* a) FLAT : Tokens are stored in memory as consecutive fields
* b) LINED: Tokens are stored in memory as lines of fields
*
* Limitations:
*
* a) Only single-byte strings (i.e. no Unicode support)
* b) Only commas and newlines are recognised as delimiters
* c) No handling of quoted tokens (they are stored unchanged)
* d) No handling of blanks among tokens (they are embedded in tokens)
* e) In LINED model, all lines MUST have the same number of tokens
* f) CSV-files larger than the available memory are not handled.
*
* Implementation Notes:
*
* 1. Most (if not all) utility functions perform NO sanity-checks
* on their arguments. Constructors, however, return NULL on error.
*
* 2. CSV files are read at once into a memory buffer, via fread(),
* and then the buffer is parsed in memory. This is usually faster
* than reading & parsing blocks of the file, unless the write-cache
* of the OS fills up. Also, files larger than the available
* memory will not be loaded.
*
* 3. The Csv constructors build their parsing-model mostly by setting
* pointers on the loaded buffer. Thus, the parsing does not involve
* memory copying. It only involves minimal memory allocation for
* the internal structure of the parsing-model, along with at least
* one traversal of the buffered data.
*
* 4. The LINED parsing-model represents more accurately the logical
* structure of the csv-file, but the FLAT parsing-model takes
* less time to get initialized and populated.
*
* 5. For both models, a specialized _foreach() constructor is provided
* accepting a callback function and userdata as the last parameters.
* During the construction of the parsing-model, this function is
* called upon each parsing-unit (CsvString units for the FLAT model,
* CsvLine units for the LINED model).
*
* For complex calculations on all the data, this MAY decrease the
* overall time needed. However, for simple calculations it MAY
* increase the overall time (due to the overhead involved for the
* function calls and the extra pointers dereferencing).
*
* So, DO NOT use the _foreach() constructors blindly!
*
* For the LINED model, you can use the function csvline_field()
* inside your callback function, in order to get a pointer to
* any field of the current line. For example, to print the cstring
* and the length of the 4th field of EACH line during construction,
* you can use a callback function like the following:
*
* int callback_lined_foreach_print_4th_token(CsvLine *csvline, void *dummy)
* {
* CsvString *field = csvline_field(csvline, 3); // 0-based index
* printf( "%s (len: %zu)\n", field->cstr, field->len );
* return 1;
* }
*
* Now, your main() function may be something like the following:
*
* int main( void )
* {
* Csv *csv = make_lined_csv_foreach(
* "test.csv",
* callback_lined_foreach_print_4th_token,
* NULL
* );
* if ( !csv ) {
* return 1;
* }
* csv_free( csv );
* return 0;
* }
**********************************************************
*/
#ifndef CSV_H
#define CSV_H
#include <stddef.h>
/* ---------------------------
* Constants & Macros
* ---------------------------
*/
/*
* Supported Parsing Models
*/
enum {
CSVNONE = 0,
CSVFLAT,
CSVLINED
};
/* ---------------------------
* Data Types
* ---------------------------
*/
/*
 * Csv is the main data-type of the parser. It is declared here as an
 * incomplete (opaque) type: clients only ever hold a Csv pointer obtained
 * from one of the constructors and manipulate it via the functions
 * declared in this header.
 */
typedef struct Csv Csv;
/* CsvString represents a single parsed field (token).
 *
 * It is exposed so that clients can directly access the len and cstr
 * members of the pointers returned by csv_flat_field() & csv_lined_field().
 * Clients may also pass CsvString pointers to the csv_eql_cstr_field() function.
 */
typedef struct CsvString CsvString;
struct CsvString {
size_t len;  /* length of the field in bytes; presumably excludes any trailing NUL -- TODO confirm */
char *cstr;  /* the field's text; per the file-header notes the parser sets pointers
              * into the loaded file buffer rather than copying, so this memory is
              * presumably owned by the Csv object and must not be freed by clients */
};
/* CsvLine is an opaque type (its layout is not defined in this header).
 * Pointers to it are handed to the callback of make_lined_csv_foreach();
 * use csvline_field() to reach the individual fields of a line.
 *
 * NOTE(review): the struct tag _CsvLine (underscore followed by an uppercase
 * letter) lies in the namespace the C standard reserves for the
 * implementation. It is left unchanged here because the struct is defined
 * elsewhere under this tag.
 */
typedef struct _CsvLine CsvLine;
/* ---------------------------
* Function Prototypes
* ---------------------------
*/
#ifndef CSV_C
/*
* constructors & destructors
*/
/**
@brief Construct a Csv object by loading and parsing a csv-file.
@param fname: The name of the csv-file to load.
@param typid: The id of the desired parsing model; presumably CSVFLAT or
CSVLINED (how CSVNONE or other values are treated is not
visible here -- TODO confirm).
@return A pointer to the new Csv object. According to the file-header
notes, constructors return NULL on error.
@note The file-header example releases constructed objects with csv_free().
*/
extern Csv *make_csv( const char *fname, int typid );
/**
@brief Construct a Csv object in the FLAT parsing model, invoking a
callback on each field while the model is being built.
@param fname: The name of the csv-file to load.
@param foreach: Callback called upon each parsed CsvString unit during
construction. The meaning of its int return value is not
visible in this header (the file-header example returns 1)
-- TODO confirm against the implementation.
@param userdata: Caller-supplied pointer; presumably forwarded unchanged
as the callback's second argument.
@return A pointer to the new Csv object. According to the file-header
notes, constructors return NULL on error.
*/
extern Csv *make_flat_csv_foreach(
const char *fname,
int (*foreach)(CsvString *field, void *userdata),
void *userdata
);
/**
@brief Construct a Csv object in the LINED parsing model, invoking a
callback on each line while the model is being built.
@param fname: The name of the csv-file to load.
@param foreach: Callback called upon each parsed CsvLine unit during
construction; it may use csvline_field() to reach any
field of the current line. The meaning of its int return
value is not visible in this header (the file-header
example returns 1) -- TODO confirm against the implementation.
@param userdata: Caller-supplied pointer; presumably forwarded unchanged
as the callback's second argument.
@return A pointer to the new Csv object. According to the file-header
notes, constructors return NULL on error.
*/
extern Csv *make_lined_csv_foreach(
const char *fname,
int (*foreach)(CsvLine *csvline, void *userdata),
void *userdata
);
/**
@brief Destroy a Csv object obtained from one of the constructors.
@param csv: The Csv object to be released.
@return A Csv pointer; presumably NULL, so callers can reset their
handle with csv = csv_free(csv) -- TODO confirm. The
file-header example ignores the return value.
*/
extern Csv *csv_free( Csv *csv );
/*
* for data parsed in FLAT model
*/
/**
@brief Get the number of fields of a Csv parsed in the FLAT model.
@param csv: The Csv object; presumably must have been built in the FLAT
model (the file-header notes say most utility functions
perform no sanity-checks on their arguments).
@return The number of fields, presumably the total across the whole file.
*/
extern size_t csv_flat_nfields( Csv *csv );
/**
@brief Get a pointer to a field of a Csv parsed in the FLAT model.
@param csv: The Csv object; presumably must have been built in the FLAT model.
@param idx: The index of the wanted field; presumably 0-based and expected
to be less than csv_flat_nfields(csv) -- TODO confirm (the
file-header notes say utility functions do no sanity-checks).
@return A pointer to the field, whose len and cstr members may be
accessed directly by the caller.
*/
extern CsvString *csv_flat_field( Csv *csv, size_t idx );
/*
* for data parsed in LINED model
*/
/**
@brief Get the number of lines of a Csv parsed in the LINED model.
@param csv: The Csv object; presumably must have been built in the LINED model.
@return The number of lines.
*/
extern size_t csv_lined_nlines( Csv *csv );
/**
@brief Get the number of fields of a Csv parsed in the LINED model.
@param csv: The Csv object; presumably must have been built in the LINED model.
@return The number of fields as an int; presumably the per-line count,
since in the LINED model all lines must have the same number
of tokens -- TODO confirm.
*/
extern int csv_lined_nfields( Csv *csv );
/**
@brief Get a pointer to a field of a Csv parsed in the LINED model.
@param csv: The Csv object; presumably must have been built in the LINED model.
@param ln: The index of the line holding the wanted field (presumably 0-based).
@param idx: The index of the wanted field within that line (presumably 0-based).
@return A pointer to the field, whose len and cstr members may be
accessed directly by the caller.
@note Per the file-header notes, utility functions generally perform no
sanity-checks, so out-of-range indices are presumably not detected.
*/
extern CsvString *csv_lined_field( Csv *csv, size_t ln, int idx );
/**
@brief Get a pointer to a field of a single parsed line.
@param csvline: The line, e.g. the one passed to the callback of
make_lined_csv_foreach().
@param idx: The 0-based index of the wanted field within the line.
@return A pointer to the field, whose len and cstr members may be
accessed directly by the caller.
*/
extern CsvString *csvline_field( CsvLine *csvline, int idx );
/*
* misc functions
*/
/**
@brief Print the parsed data of a Csv object.
@param csv: The Csv object to be printed (not modified: taken as const).
@note The output format and destination are determined by the
implementation, which is not visible in this header.
*/
extern void csv_print( const Csv *csv );
/**
@brief Check byte equality between a plain cstring and a CsvString->cstr.
@param cstrlen: The length of the plain cstring, w/o the trailing NUL byte.
@param cstring: The plain cstring to be compared.
@param *field: A pointer to an already csv parsed field, whose .cstr
member is to be checked against cstring.
@return An int telling whether the two are byte-equal; presumably non-zero
when equal and 0 otherwise -- TODO confirm against the implementation.
@note Depending on the nature of the parsed data, this function may be
faster than strcmp/memcmp because it presumably calls memcmp only if
cstrlen equals field->len (strings of different lengths can never
be equal, so no byte comparison is needed for them).
NOTE(review): this note previously read "only if cstrlen differs
from field->len", which looks inverted -- confirm against the
implementation.
*/
extern int csv_eql_cstr_field( size_t cstrlen, char *cstring, CsvString *field);
#endif
#endif