fork download
  1. #!/bin/bash
  2. # your code goes here
  3.  
  4. #!/bin/bash
  5.  
  6. if [[ ! $1 ]] ; then
  7. echo -e "Nothing to do, exiting now..."
  8. exit
  9. fi
  10.  
  11. start_dir="$1"
  12. new_dir="$start_dir"_deduped
  13. new_dir_files="$new_dir"/files
  14.  
  15. mkdir -v "$new_dir"
  16. mkdir -v -p "$new_dir_files"
  17.  
  18. cd "$start_dir"
  19.  
  20. for webpage in * ; do
  21. # Skip directories
  22. if [[ $webpage == *"_files"* ]]; then
  23. continue
  24. fi
  25.  
  26. echo $webpage
  27.  
  28. # copy .html
  29. new_webpage="../$new_dir/$webpage"
  30. cp "$webpage" "$new_webpage"
  31.  
  32. # select matching directory
  33. html_dir=$(echo $webpage | sed -e "s/.html/_files/" -e "s/.htm/_files/" )
  34.  
  35. # check if matching directory exists
  36. if [ ! -d "$html_dir" ]; then
  37. echo "$html_dir is not found..."
  38. continue # move to the next page
  39. fi
  40.  
  41. # check if matching directory is not empty
  42. if [ -z "$(ls -A $html_dir)" ]; then
  43. echo "$html_dir is empty..."
  44. continue # move to the next page
  45. fi
  46.  
  47. # process files for webpage
  48. for file in $html_dir/* ; do
  49. # make hash of current file to compare
  50. new_file_hash=$(cat "$file" | sha1sum)
  51.  
  52. # try to find matching file from files already copied
  53. deduped_file_found=""
  54.  
  55. # search for file with matching hash
  56. for deduped_file in ../$new_dir_files/* ; do
  57. deduped_file_hash=$(cat "$deduped_file" | sha1sum)
  58.  
  59. if [[ "$new_file_hash" == "$deduped_file_hash" ]]; then
  60. # found matching pair of files
  61. deduped_file_found="$deduped_file"
  62. break
  63. fi
  64. done
  65.  
  66. # extract current filename
  67. webpage_name=$(echo "$webpage" | sed -e "s/.htm$//" -e "s/.html$//")
  68. old_filename=$(echo "$file" | sed "s/.*\///")
  69.  
  70. if [ -z "$deduped_file_found" ]; then
  71. # duplicate file is not found, copy new one
  72. new_filename="$webpage_name$old_filename"
  73. old_path="$html_dir/$old_filename"
  74. new_path="files/$new_filename"
  75. cp $file ../$new_dir_files/$new_filename
  76. # Append file hash to optimize skipping unique files
  77. else
  78. #reuse existing file
  79. new_filename=$(echo "$deduped_file_found" | sed "s/.*\///")
  80. old_path="$html_dir/$old_filename"
  81. new_path="files/$new_filename"
  82. fi
  83.  
  84. # replace all paths for file
  85. sed -i 's,'"$old_path,$new_path," $new_webpage
  86. done
  87. done
Success #stdin #stdout 0s 4512KB
stdin
Standard input is empty
stdout
Nothing to do, exiting now...