<?php

/**
 * Word-dictionary compressor.
 *
 * Reads the file 'i', replaces every word that is worth substituting with its
 * numeric index into a dictionary, and writes the result to the file 'o' as:
 *
 *     word0 NUL word1 NUL ... wordN NUL NUL <compressed data>
 *
 * A "word" is any run matched by \b\w+\b. Tokens made up only of digits are
 * always moved into the dictionary, so that every all-digit token left in the
 * compressed data is a dictionary index and never a literal number.
 */

$null = chr(0);

$data = file_get_contents('i');
if ($data === false) {
    throw new RuntimeException("Unable to read input file 'i'");
}

// Tokenize once. With PREG_SPLIT_DELIM_CAPTURE the result alternates strictly:
// even offsets hold the text between words (possibly empty), odd offsets hold
// the words themselves.
$tokens = preg_split('|\b(\w+)\b|', $data, -1, PREG_SPLIT_DELIM_CAPTURE);
if ($tokens === false) {
    throw new RuntimeException('Unable to tokenize the input data');
}
$count = count($tokens);

// Count how many times each word occurs in the data
$freq = array();
for ($i = 1; $i < $count; $i += 2) {
    $word = $tokens[$i];

    if (isset($freq[$word])) {  // Seen before, just add to it
        $freq[$word]++;
    } else {                    // Not seen yet, initialize it
        $freq[$word] = 1;
    }
}

// The first substituted word is replaced with '0', so start there.
$num = 0;

// Dictionary entries (the index of a word is its substitution) ...
$dict = array();

// ... and the reverse lookup: word => substitution
$code = array();

foreach ($freq as $word => $seen) {
    // PHP turns keys such as "123" into integers; work on the string form.
    $word = (string) $word;
    $len  = strlen($word);

    // Total number of bytes this word currently occupies in the data
    $size = $len * $seen;

    // Purely numeric tokens must always be substituted, otherwise they could
    // not be told apart from dictionary indices when decompressing. This also
    // covers tokens like "007" that PHP does not cast to an integer key.
    $isNumber = strspn($word, '0123456789') === $len;

    // Substitute if (dictionary entry + substitution size * occurrences)
    // is smaller than the bytes the word takes now, or if it is a number.
    if ($len + $seen * strlen((string) $num) < $size || $isNumber) {
        $code[$word] = (string) $num;
        $dict[]      = $word;

        // The next substitution gets the subsequent number
        $num++;
    }
}

// Substitute in a single pass over the ORIGINAL tokens. Rewriting the data
// once per word would let a numeric word handled later also match indices
// that were inserted for earlier words, corrupting the output.
for ($i = 1; $i < $count; $i += 2) {
    $word = $tokens[$i];

    if (isset($code[$word])) {
        $tokens[$i] = $code[$word];
    }
}
$data = implode('', $tokens);

// Glue the dictionary together with nulls, then append the compressed data
// after two nulls
$comp = implode($null, $dict) . $null . $null . $data;

// Write the whole thing to file
if (file_put_contents('o', $comp) === false) {
    throw new RuntimeException("Unable to write output file 'o'");
}

?>