#!/bin/bash
#
# Deduplicate the resource files of saved web pages.
#
# Usage: <script> START_DIR
#
# START_DIR holds saved pages ("page.html" / "page.htm") next to their
# companion resource directories ("page_files"). The script creates
# "START_DIR_deduped" containing a copy of every regular file found directly
# in START_DIR, plus one shared "files" directory into which each distinct
# resource (compared by SHA-1 of its content) is copied exactly once, named
# "<page name><resource name>". References of the form
# "page_files/<resource>" inside the copied pages are rewritten to
# "files/<stored name>".
#
# Requires bash 4+ (associative arrays), sha1sum and a sed supporting -i.
#
# Limitation: references are matched as literal text, so references stored
# URL-encoded in the HTML (e.g. "%20" for a space) are not rewritten.

# Print the SHA-1 hex digest of the content of file $1.
# Returns non-zero if the file cannot be read or hashed.
file_sha1() {
  local digest
  digest=$(sha1sum < "$1") || return 1
  # sha1sum prints "<digest>  -"; keep only the digest.
  printf '%s\n' "${digest%% *}"
}

# Print $1 escaped so that sed treats it as a literal basic regular
# expression inside an s,...,..., command (',' is the delimiter).
sed_escape_pattern() {
  printf '%s' "$1" | sed -e 's/[\\.*^$,[]/\\&/g'
}

# Print $1 escaped so that sed treats it as literal replacement text inside
# an s,...,..., command.
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&,]/\\&/g'
}

# Entry point.
# Arguments: $1 - directory with the saved pages (relative or absolute;
#                 trailing slashes are ignored).
# Outputs:   progress messages on stdout, errors on stderr.
# Returns:   0 on success or when no argument was given, 1 when the input
#            directory is missing or the output directories cannot be created.
main() {
  if [[ -z "${1:-}" ]]; then
    echo "Nothing to do, exiting now..."
    return 0
  fi

  # Drop trailing slashes so "dir/" yields "dir_deduped" next to the input
  # instead of a "_deduped" directory inside it.
  local start_dir=$1
  while [[ "$start_dir" == */ && "$start_dir" != / ]]; do
    start_dir=${start_dir%/}
  done

  if [[ ! -d "$start_dir" ]]; then
    printf '%s is not a directory\n' "$start_dir" >&2
    return 1
  fi

  local new_dir="${start_dir}_deduped"
  local new_dir_files="$new_dir/files"
  mkdir -v -p -- "$new_dir_files" || return 1

  # SHA-1 digest -> name of the copy stored in $new_dir_files. Kept in memory
  # so a duplicate is found with one lookup and nothing is written into the
  # source directory.
  local -A known_files=()

  local page_path webpage webpage_name new_webpage html_dir
  local file old_filename old_path new_filename new_path
  local hash deduped dest stale processed pattern replacement

  for page_path in "$start_dir"/*; do
    # Only regular files are pages; this skips the *_files directories and
    # the literal pattern left behind when the directory is empty.
    [[ -f "$page_path" ]] || continue

    webpage=${page_path##*/}
    printf '%s\n' "$webpage"

    # Copy the page itself.
    new_webpage="$new_dir/$webpage"
    cp -- "$page_path" "$new_webpage" || continue

    # Derive the companion directory name from the real suffix only.
    case "$webpage" in
      *.html) webpage_name=${webpage%.html} ;;
      *.htm) webpage_name=${webpage%.htm} ;;
      *) continue ;; # not a web page: copied as-is, nothing to rewrite
    esac
    html_dir="${webpage_name}_files"

    if [[ ! -d "$start_dir/$html_dir" ]]; then
      echo "$html_dir is not found..."
      continue
    fi

    processed=0
    for file in "$start_dir/$html_dir"/*; do
      # Skip sub-directories and the unexpanded glob of an empty directory.
      [[ -f "$file" ]] || continue
      processed=$((processed + 1))

      old_filename=${file##*/}
      # Path as it is referenced from inside the page.
      old_path="$html_dir/$old_filename"

      if ! hash=$(file_sha1 "$file"); then
        printf 'cannot hash %s, skipping\n' "$file" >&2
        continue
      fi

      deduped=${known_files[$hash]:-}
      if [[ -z "$deduped" ]]; then
        # Content not seen yet: store a new copy.
        new_filename="$webpage_name$old_filename"
        dest="$new_dir_files/$new_filename"
        # If a stored file of this run is about to be overwritten by a name
        # collision, forget its digest so it is never offered as a duplicate
        # with the wrong content.
        if [[ -f "$dest" ]] && stale=$(file_sha1 "$dest"); then
          if [[ "${known_files[$stale]:-}" == "$new_filename" ]]; then
            known_files[$stale]=''
          fi
        fi
        cp -- "$file" "$dest" || continue
        known_files[$hash]=$new_filename
      else
        # Identical content already stored: reuse it.
        echo "-------------- deduplicated $old_path"
        new_filename=$deduped
      fi
      new_path="files/$new_filename"

      # Rewrite every reference in the copied page. Both sides are escaped so
      # names containing regex or sed metacharacters are handled literally.
      pattern=$(sed_escape_pattern "$old_path")
      replacement=$(sed_escape_replacement "$new_path")
      sed -i -e "s,${pattern},${replacement},g" -- "$new_webpage"
    done

    if ((processed == 0)); then
      echo "$html_dir is empty..."
    fi
  done

  return 0
}

main "$@"