fork download
  1. #!/bin/bash
  2.  
  3. if [[ ! $1 ]] ; then
  4. echo -e "Nothing to do, exiting now..."
  5. exit
  6. fi
  7.  
  8. start_dir="$1"
  9. new_dir="$start_dir"_deduped
  10. new_dir_files="$new_dir"/files
  11.  
  12. mkdir -v "$new_dir"
  13. mkdir -v -p "$new_dir_files"
  14.  
  15. cd "$start_dir"
  16.  
  17. touch known_files_list.sha1
  18.  
  19. for webpage in * ; do
  20. # Skip directories
  21. if [[ $webpage == *"_files"* ]]; then
  22. continue
  23. fi
  24.  
  25. echo $webpage
  26.  
  27. # copy .html
  28. new_webpage="../$new_dir/$webpage"
  29. cp "$webpage" "$new_webpage"
  30.  
  31. # select matching directory
  32. html_dir=$(echo $webpage | sed -e "s/.html/_files/" -e "s/.htm/_files/" )
  33.  
  34. # check if matching directory exists
  35. if [ ! -d "$html_dir" ]; then
  36. echo "$html_dir is not found..."
  37. continue # move to the next page
  38. fi
  39.  
  40. # check if matching directory is not empty
  41. if [ -z "$(ls -A $html_dir)" ]; then
  42. echo "$html_dir is empty..."
  43. continue # move to the next page
  44. fi
  45.  
  46. # process files for webpage
  47. for file in $html_dir/* ; do
  48. # make hash of current file to compare
  49. new_file_hash=$(cat "$file" | sha1sum)
  50.  
  51. # try to find matching file from files already copied
  52. deduped_file_found=""
  53.  
  54. # fast check if file hash is in the list
  55. if grep -Fxq "$new_file_hash" known_files_list.sha1
  56. then
  57. # search for file with matching hash
  58. for deduped_file in ../$new_dir_files/* ; do
  59. deduped_file_hash=$(cat "$deduped_file" | sha1sum)
  60.  
  61. if [[ "$new_file_hash" == "$deduped_file_hash" ]]; then
  62. # found matching pair of files
  63. deduped_file_found="$deduped_file"
  64. break
  65. fi
  66. done
  67. fi
  68.  
  69. # extract current filename
  70. webpage_name=$(echo "$webpage" | sed -e "s/.htm$//" -e "s/.html$//")
  71. old_filename=$(echo "$file" | sed "s/.*\///")
  72.  
  73. if [ -z "$deduped_file_found" ]; then
  74. # duplicate file is not found, copy new one
  75. new_filename="$webpage_name$old_filename"
  76. old_path="$html_dir/$old_filename"
  77. new_path="files/$new_filename"
  78. cp $file ../$new_dir_files/$new_filename
  79. # Append file hash to optimize skipping unique files
  80. echo "$new_file_hash" >> known_files_list.sha1
  81. else
  82. #reuse existing file
  83. echo "-------------- deduplicated $file"
  84. new_filename=$(echo "$deduped_file_found" | sed "s/.*\///")
  85. old_path="$html_dir/$old_filename"
  86. new_path="files/$new_filename"
  87. fi
  88.  
  89. # replace all paths for file
  90. sed -i 's,'"$old_path,$new_path," $new_webpage
  91. done
  92. done
Success #stdin #stdout 0s 4288KB
stdin
Standard input is empty
stdout
Nothing to do, exiting now...