#!/bin/bash

# Greeting printed to standard output before anything else runs.
# The delimiter is quoted so the text is emitted literally.
cat <<'BANNER'
Welcome to this timing experiment.
Scroll down to the end of the page to see timings in the standard error box.

Standard output demonstrates how we create a file with 100 columns
(the output sample shows just the first five columns from the first three lines)
and split it into 100 files in each iteration.

Background: https://stackoverflow.com/questions/57942940

BANNER

# Ideone boilerplate; can't create files in $HOME
# All work happens inside a private scratch directory which is removed when
# the script exits.
t=$(mktemp -t -d 100cols.XXXXXXXX) || exit
trap 'rm -rf -- "$t"' EXIT
# A caught signal must still terminate the script; exiting from the handler
# fires the EXIT trap above, so cleanup happens exactly once and the script
# does not carry on inside a directory that was just deleted.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM
# Bail out if we cannot enter the scratch directory: the script later runs
# "rm -f" with globs and must never do so in the caller's directory.
cd "$t" || exit
# Disabled section: everything up to the "=ignore" line is fed as a quoted
# here-document to the no-op ":" command, so the nested "cat" never runs and
# tripleee.py is never written.
# NOTE(review): the embedded Python looks unfinished (for example "handle" is
# a list but handle.write() is called on it, and the files are opened without
# a write mode) -- presumably why it is disabled; confirm before re-enabling.
: <<\=ignore
cat <<\____ >tripleee.py
import csv
from collections import defaultdict

with open('CROWN.csv', 'r') as input:
    r = csv.reader(input)
    columns = r.next()
    seen = [defaultdict(int)] * len(columns)
    # FIXME: runs out of file handles wnth too many columns
    handle = [open(columns[x] + '.csv') for x in len(columns)]
    for row in r:
        for field in len(row):
            value = row[field]
            if seen[field][value] == 0:
                handle.write(value + '\n')
            seen[field][value] += 1
    for h in handle:
         h.close()
____
=ignore

# Build the test input: random bytes are base64-encoded and every 100
# consecutive lines of base64 output are joined with commas into one CSV row.
# "/" is replaced by "_" in every field; the fields of the first row are
# later used as output file names by the functions below.
dd if=/dev/urandom bs=65536 count=24 2>/dev/null \
    | base64 \
    | awk '{ gsub("/", "_"); printf "%s%s", $0, (NR % 100 ? "," : "\n") }' \
    >CROWN.csv

# Show what was generated: size, line/word/byte counts, and a small sample
# (first five columns of the first three rows).
printf '%s\n' '*** Input ***'
ls -l CROWN.csv
wc CROWN.csv
head -n 3 <CROWN.csv | cut -d , -f 1-5

# baseline FILE...
# Copies the concatenated contents of all arguments into a single file named
# "reallylongsinglefilename" in the current directory.  It does no column
# splitting at all -- presumably the reference point the other timings are
# compared against.
baseline () {
    local sink=reallylongsinglefilename
    cat "$@" >"$sink"
}

# mihir FILE...
# For every column named in the header (first line) of the input, writes the
# distinct values of that column, in order of first appearance, to a file
# named exactly after the header field (no extension) in the current
# directory.  The input is re-read by a separate awk process once per column.
mihir () {
    local col=1
    local file
    # Emit the header fields one per line, then stop reading.
    awk '{ gsub(/,/,"\n"); print; exit }' "$@" |
    while read -r file
    do
        # Test NR first so the header line is never recorded in x[]; otherwise
        # a data value identical to its own column header would be treated as
        # already seen and silently dropped.
        awk -F, -v col="$col" 'NR > 1 && !x[$col]++ { print $col }' "$@" > "$file"
        ((col++))
    done
}

# tripleee_py ARG...
# Runs "python tripleee.py" with the given arguments.  tripleee.py is looked
# up in the current directory and must already exist there.
tripleee_py () {
    local script=tripleee.py
    python "$script" "$@"
}

# tripleee FILE...
# Single-pass splitter: reads the header (first line) to learn the column
# names, then appends each value that has not yet occurred in its column to
# "<column name>.csv" in the current directory, in order of first appearance.
# All output files are held open by one awk process at the same time.
#
# seen[] is keyed by "<column index>:<value>" and is incremented on every
# lookup, so only the first occurrence per column passes the test.
# The output file name is parenthesized because an unparenthesized
# concatenation after a redirection is parsed differently by different awk
# implementations.
tripleee () {
    awk -F , 'NR==1 { ncols = split($0, cols, /,/); next }
    { for(i=1; i<=ncols; ++i)
        if (!seen[i ":" $i]++)
            print $i >>(cols[i] ".csv") }' "$@"
}

# sorin FILE...
# For every column named in the header (first line) of the input, writes the
# sorted, de-duplicated values of that column to "<name>.csv" in the current
# directory, where <name> is the header field with spaces replaced by "_".
# The input is re-read by a cut | tail | sort pipeline once per column.
sorin () {
    local field
    local name
    head -n 1 "$@" |
    tr ',' '\n' |
    sed 's/ /_/g' |
    nl -ba -s$'\t' |
    # -r keeps backslashes in header names intact.
    while IFS=$'\t' read -r field name; do
        # nl pads the line number with leading blanks; strip them explicitly
        # instead of depending on word splitting of an unquoted expansion.
        cut -d ',' -f "${field// /}" "$@" |
        tail -n +2 | sort -u >"${name}.csv"
    done
}

# carmen FILE
# Perl implementation: loads the whole CSV into memory with Text::CSV, collects
# the distinct values of each column, and writes them sorted to
# "<heading>.pl.csv" in the current directory, where <heading> is the header
# field with each run of whitespace replaced by "_".
# Returns non-zero if the input cannot be read or if any output file cannot be
# written; the remaining columns are still written in the latter case.
# Requires the Text::CSV module, which is not part of core Perl.
carmen () {
    perl -e '
    use strict;
    #use Text::CSV_XS qw(csv);
    use Text::CSV qw(csv);

    # Whole file as an array of rows; each row is an array of fields.
    my $data = csv(in => $ARGV[0])
        or die "carmen: cannot read $ARGV[0]\n";

    # First row holds the column names; an empty input yields no columns.
    my @headings = map {s/\s+/_/gr} @{ shift(@{$data}) // [] };

    # One hash per column; its keys are the distinct values of that column.
    my @udata = ();

    for my $row (@{$data}) {
        for (my $c = 0; $c <= $#headings; $c++) {
            $udata[$c]{$row->[$c]} = 1;
        }
    }

    my $status = 0;

    for (my $c = 0; $c <= $#headings; $c++) {
        local $\ = "\n";
        my $file = $headings[$c] . ".pl.csv";
        my $out;
        # Report a column that cannot be written instead of silently
        # printing to an unopened handle, then carry on with the rest.
        unless (open($out, ">", $file)) {
            warn "carmen: cannot write $file: $!\n";
            $status = 1;
            next;
        }
        print {$out} $_ for sort keys %{$udata[$c]};
        unless (close($out)) {
            warn "carmen: error closing $file: $!\n";
            $status = 1;
        }
    }

    exit $status;' "$@"
}

# Populate cache
# (read the input once so that the first contender is not the only one
# paying for a cold read)
cat <CROWN.csv >/dev/null

# Run each contender in turn.  The heading goes to both streams so the output
# listing on stdout and the timing report on stderr can be matched up.
# tripleee_py is deliberately left out of the list.
for algo in baseline carmen mihir sorin tripleee
do
    printf '*** %s ***\n' "$algo" >&2
    printf '*** %s ***\n' "$algo"
    time "$algo" CROWN.csv
    # Show what this contender produced ...
    ls -l | nl
    # ... then delete it again: any name of 12 or more characters, plus
    # single-character *.csv names, so the next run starts from a clean slate.
    rm -f ????????????* ?.csv
done