#!/bin/bash
#
# Deduplicate the support files of saved web pages.
#
# Usage: dedupe.sh <dir>
#   <dir> holds pages saved by a browser: "page.html" (or .htm) next to a
#   "page_files/" directory. Output goes to "<dir>_deduped/": each page is
#   copied, every support file is copied once into "<dir>_deduped/files/"
#   (identical content — by sha1 — is stored only once), and references
#   inside the copied pages are rewritten to the shared "files/" location.

set -euo pipefail
shopt -s nullglob   # unmatched globs expand to nothing instead of themselves

# escape_pattern STRING
#   Escape BRE metacharacters and the ',' delimiter so STRING can be used
#   as the pattern half of a "s,…,…," sed expression.
escape_pattern() {
  printf '%s' "$1" | sed 's/[][\.*^$,]/\\&/g'
}

# escape_replacement STRING
#   Escape '&', '\' and the ',' delimiter so STRING can be used as the
#   replacement half of a "s,…,…," sed expression.
escape_replacement() {
  printf '%s' "$1" | sed 's/[&,\]/\\&/g'
}

main() {
  if [[ -z "${1-}" ]]; then
    echo "Nothing to do, exiting now..."
    return 0
  fi

  local start_dir="$1"
  local new_dir="${start_dir}_deduped"
  local new_dir_files="$new_dir/files"

  # -p makes reruns idempotent (the original aborted noisily on rerun).
  mkdir -v -p "$new_dir" "$new_dir_files"

  # Cache: sha1 of content -> filename already stored in $new_dir_files.
  # Seeding it from an existing deduped dir preserves the original
  # behavior on reruns, while turning the per-file linear rescan
  # (O(n^2) hashing) into a single lookup.
  declare -A seen=()
  local f h
  for f in "$new_dir_files"/*; do
    [[ -f "$f" ]] || continue
    h=$(sha1sum < "$f")
    seen[$h]=${f##*/}
  done

  cd "$start_dir" || { echo "cannot cd to $start_dir" >&2; return 1; }

  local webpage stem html_dir page_copy file old_name new_name
  for webpage in *; do
    # Skip the support directories themselves.
    if [[ "$webpage" == *"_files"* ]]; then
      continue
    fi
    # Only regular files can be pages.
    [[ -f "$webpage" ]] || continue

    printf '%s\n' "$webpage"

    # Copy the page itself; references inside it are rewritten below.
    page_copy="../$new_dir/$webpage"
    cp -- "$webpage" "$page_copy"

    # "page.html" / "page.htm" -> stem "page", support dir "page_files".
    stem=${webpage%.htm}
    stem=${stem%.html}
    html_dir="${stem}_files"

    if [[ ! -d "$html_dir" ]]; then
      echo "$html_dir is not found..."
      continue # move to the next page
    fi
    if [ -z "$(ls -A "$html_dir")" ]; then
      echo "$html_dir is empty..."
      continue # move to the next page
    fi

    # Process the page's support files.
    for file in "$html_dir"/*; do
      [[ -f "$file" ]] || continue
      h=$(sha1sum < "$file")
      old_name=${file##*/}

      if [[ -n "${seen[$h]-}" ]]; then
        # Identical content already stored — reuse it.
        new_name=${seen[$h]}
      else
        # New content: store it under a page-prefixed name and remember it.
        new_name="$stem$old_name"
        cp -- "$file" "../$new_dir_files/$new_name"
        seen[$h]=$new_name
      fi

      # Rewrite every reference to this file in the copied page
      # ('g' flag: a line may reference the same file more than once).
      sed -i "s,$(escape_pattern "$html_dir/$old_name"),$(escape_replacement "files/$new_name"),g" "$page_copy"
    done
  done
}

main "$@"