fork(1) download
  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileReader;
  4. import java.io.InputStreamReader;
  5. import java.util.Locale;
  6. import java.util.regex.Matcher;
  7. import java.util.regex.Pattern;
  8.  
  9. // TODO: add MoviePathMatcher
  10. // TODO: add TvShowPathMatcher
  11. // TODO: add other tvShow
  12.  
  /**
   * Command line harness for the "strip the junk from a video file name" heuristics.
   * <p>
   * Reads one path per line from stdin, reduces it to the file name without extension and
   * prints every intermediate cleaning step followed by the resulting title and year.
   * <p>
   * The class is intentionally not {@code public}: it lives in the default package, so it is
   * only reachable from that package anyway, and a public top level class fails to compile
   * when the source file is not named {@code TestScraper.java}.
   */
  class TestScraper {

      // ( whitespace | punctuation), matches dots, spaces, brackets etc
      private static final String NON_CHARACTER = "[\\s\\p{Punct}]";

      // matches "19XX and 20XX" - capture group
      private static final String YEAR_GROUP = "((?:19|20)\\d{2})";

      // matches "[space or punctuation/brackets etc]year", year is group 1
      private static final Pattern YEAR_PATTERN = Pattern.compile(NON_CHARACTER + YEAR_GROUP + "(?!\\d)");

      // Separators allowed after a leading number. Opening brackets are excluded on purpose:
      // otherwise "100. [DVD]Foo" loses its "[" here and the bracket step can no longer
      // remove "[DVD]".
      private static final String NUMBERING_SEPARATOR = "[\\s\\p{Punct}&&[^<({\\[]]";
      private static final String NUMBERING_PUNCT = "[\\p{Punct}&&[^<({\\[]]";

      // Matches "1. ", "1) ", "1 - ", "1.-.", "1._"... but not "1.Foo" or "1-Foo" ..
      private static final Pattern LEADING_NUMBERING = Pattern.compile(
              "^(\\d+([.)]" + NUMBERING_SEPARATOR + "+|\\s+" + NUMBERING_PUNCT + NUMBERING_SEPARATOR + "*))*");

      // Everything in brackets <[{( .. )}]>, most of the time team names, etc
      private static final Pattern BRACKETS = Pattern.compile("[<({\\[].+?[>)}\\]]");

      // Besides the plain ' there are the typographic right (U+2019) and left (U+2018) single
      // quotes; the latter is actually not an apostrophe. Written as unicode escapes so the
      // source does not depend on the file encoding.
      private static final char[] ALTERNATE_APOSTROPHES = {'\u2019', '\u2018'};

      // Matches dots in between Uppercase letters e.g. in "E.T.", "S.H.I.E.L.D." but not a "a.b.c."
      // Last dot is kept "a.F.O.O.is.foo" => "a.FOO.is.foo"
      private static final Pattern ACRONYM_DOTS = Pattern.compile("(?<=(\\b|[._])\\p{Lu})[.](?=\\p{Lu}([.]|$))");

      // ( whitespace | punctuation)+ except the apostrophe, matches dots, spaces, brackets etc
      private static final Pattern MULTI_NON_CHARACTER_PATTERN = Pattern.compile("[\\s\\p{Punct}&&[^']]+");

      /**
       * Reads paths line by line from stdin and prints the cleaning steps for each of them.
       *
       * @param args unused
       * @throws Exception if reading stdin fails
       */
      public static void main(String[] args) throws Exception {
          // for file input replace the InputStreamReader with: new FileReader("input.txt")
          try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  getMatch(getFileNameWithoutExtension(line));
                  println();
              }
          }
      }

      /**
       * Matches everything. Tries to strip away all junk, not very reliable.
       * <p>
       * Process is as follows (each step is printed):
       * <ul>
       * <li> Start with filename without extension: "100. [DVD]Starship_Troopers_1995.-HDrip--IT"
       * <li> Extract last year if any: "100. [DVD]Starship_Troopers_.-HDrip--IT"
       * <li> Remove potential starting numbering of collections: "[DVD]Starship_Troopers_.-HDrip--IT"
       * <li> Remove anything in brackets: "Starship_Troopers_.-HDrip--IT"
       * <li> Assume from here on that the title is first followed by junk
       * <li> Trim CasE sensitive junk: "Starship_Troopers_.-HDrip-" ("it" could be part of the movie name, "IT" probably not)
       * <li> Unify apostrophes and collapse dotted acronyms ("S.H.I.E.L.D." becomes "SHIELD.")
       * <li> Remove separators: "Starship Troopers HDrip"
       * <li> Trim junk case insensitive: "Starship Troopers"
       * </ul>
       *
       * @param input file name without extension
       */
      private static void getMatch(String input) {
          // TODO test 3rd party denoise pattern
          // denoise filter Default = @"(([\(\{\[]|\b)((576|720|1080)[pi]|dir(ectors )?cut|dvd([r59]|rip|scr(eener)?)|(avc)?hd|wmv|ntsc|pal|mpeg|dsr|r[1-5]|bd[59]|dts|ac3|blu(-)?ray|[hp]dtv|stv|hddvd|xvid|divx|x264|dxva|(?-i)FEST[Ii]VAL|L[iI]M[iI]TED|[WF]S|PROPER|REPACK|RER[Ii]P|REAL|RETA[Ii]L|EXTENDED|REMASTERED|UNRATED|CHRONO|THEATR[Ii]CAL|DC|SE|UNCUT|[Ii]NTERNAL|[DS]UBBED)([\]\)\}]|\b)(-[^\s]+$)?)")]

          String name = input;
          println("input : " + name);

          // extract the last year from the string: remember the span of the last match
          String year = null;
          Matcher matcher = YEAR_PATTERN.matcher(name);
          int start = -1;
          int stop = -1;
          while (matcher.find()) {
              start = matcher.start(1);
              stop = matcher.end(1);
          }
          if (start >= 0) {
              year = name.substring(start, stop);
              name = name.substring(0, start) + name.substring(stop);
          }
          println("release year : %s year:%s", name, year);

          // Strip out starting numbering for collections
          name = replaceAll(name, "", LEADING_NUMBERING);
          println("remove numbering : " + name);

          // Strip out everything else in brackets <[{( .. )})>, most of the time teams names, etc
          name = replaceAll(name, "", BRACKETS);
          println("brackets : " + name);

          // strip away known case sensitive garbage
          name = cutOffBeforeFirstMatch(name, GARBAGE_CASESENSITIVE_PATTERNS);
          println("CaSe junk : " + name);

          // replaces alternative apostrophes with a simple '
          name = replaceAllChars(name, ALTERNATE_APOSTROPHES, '\'');
          println("apostrophes : " + name);

          // replaces "S.H.I.E.L.D." with "SHIELD.", only uppercase letters
          name = replaceAll(name, "", ACRONYM_DOTS);
          println("acronyms : " + name);

          // replace all remaining whitespace & punctuation (besides ') with a single space
          name = replaceAll(name, " ", MULTI_NON_CHARACTER_PATTERN).trim();
          println("separators : " + name);

          // append a " " to aid next step
          // > "Foo bar 1080p AC3 " to find e.g. " AC3 "
          name = name + " ";

          // try to remove more garbage, this time " garbage " syntax
          // method will compare with lowercase name automatically
          name = cutOffBeforeFirstMatch(name, GARBAGE_LOWERCASE);
          println("lowercase junk : " + name);

          name = name.trim();
          println("RESULT : %s year:%s", name, year);
      }

      // Most of the common garbage in movies name we want to strip out
      // (they can be part of the name or correspond to extensions as well).
      // Every entry is wrapped in spaces so that it only matches whole words.
      private static final String[] GARBAGE_LOWERCASE = {
              " dvdrip ", " dvd rip ", " dvdscreener ", " dvdscr ", " dvd scr ",
              " brrip ", " br rip ", " bdrip ", " bd rip ", " blu ray ", " bluray ",
              " hddvd ", " hd dvd ", " hdrip ", " hd rip ", " hdlight ", " minibdrip ",
              " webrip ", " web rip ",
              " 720p ", " 1080p ", " 1080i ", " 720 ", " 1080 ", " 480i ", " 2160p ", " 4k ", " 480p ", " 576p ", " 576i ", " 240p ", " 360p ", " 4320p ", " 8k ",
              " hdtv ", " sdtv ", " m hd ", " ultrahd ", " mhd ",
              " h264 ", " x264 ", " aac ", " ac3 ", " ogm ", " dts ", " hevc ", " x265 ", " av1 ",
              " avi ", " mkv ", " xvid ", " divx ", " wmv ", " mpg ", " mpeg ", " flv ", " f4v ",
              " asf ", " vob ", " mp4 ", " mov ",
              " directors cut ", " dircut ", " readnfo ", " read nfo ", " repack ", " rerip ", " multi ", " remastered ",
              " truefrench ", " srt ", " extended cut ",
              " sbs ", " hsbs ", " side by side ", " sidebyside ", /* Side-By-Side 3d stuff */
              " 3d ", " h sbs ", " h tb ", " tb ", " htb ", " top bot ", " topbot ", " top bottom ", " topbottom ", " tab ", " htab ", /* Top-Bottom 3d stuff */
              " anaglyph ", " anaglyphe ", /* Anaglyph 3d stuff */
              " truehd ", " atmos ", " uhd ", " hdr10+ ", " hdr10 ", " hdr ", " dolby ", " dts-x ", " dts-hd.ma ",
              " hfr ",
      };

      // stuff that could be present in real names is matched with tight case sensitive syntax
      // strings here will only match if separated by any of " .-_"
      private static final String[] GARBAGE_CASESENSITIVE = {
              "FRENCH", "TRUEFRENCH", "DUAL", "MULTISUBS", "MULTI", "MULTi", "SUBFORCED", "SUBFORCES", "UNRATED", "UNRATED[ ._-]DC", "EXTENDED", "IMAX",
              "COMPLETE", "PROPER", "iNTERNAL", "INTERNAL",
              "SUBBED", "ANiME", "LIMITED", "REMUX", "DCPRip",
              "TS", "TC", "REAL", "HD", "DDR", "WEB",
              "EN", "ENG", "FR", "ES", "IT", "NL", "VFQ", "VF", "VO", "VOSTFR", "Eng",
              "VOST", "VFF", "VF2", "VFI", "VFSTFR",
      };

      private static final Pattern[] GARBAGE_CASESENSITIVE_PATTERNS = new Pattern[GARBAGE_CASESENSITIVE.length];

      static {
          for (int i = 0; i < GARBAGE_CASESENSITIVE.length; i++) {
              // case sensitive string wrapped in "space or . or _ or -", in the end either separator or end of line
              // end of line is important since .foo.bar. could be stripped to .foo and that would no longer match .foo.
              GARBAGE_CASESENSITIVE_PATTERNS[i] = Pattern.compile("[ ._-]" + GARBAGE_CASESENSITIVE[i] + "(?:[ ._-]|$)");
          }
      }

      /**
       * Cuts the input at the start of every pattern that still matches what is left,
       * applying the patterns in array order. Assumes the title is always first.
       *
       * @param input    string to shorten
       * @param patterns garbage patterns
       * @return substring from start to first finding of any garbage pattern
       */
      private static String cutOffBeforeFirstMatch(String input, Pattern[] patterns) {
          String remaining = input;
          for (Pattern pattern : patterns) {
              if (remaining.isEmpty()) return "";

              Matcher matcher = pattern.matcher(remaining);
              if (matcher.find()) {
                  remaining = remaining.substring(0, matcher.start());
              }
          }
          return remaining;
      }

      /**
       * Cuts the input at the earliest occurrence of any garbage string, ignoring the case of
       * the input. Assumes the title is always first.
       *
       * @param input          string to shorten, its case is preserved in the result
       * @param garbageStrings lower case strings
       * @return substring from start to first finding of any garbage string
       */
      public static final String cutOffBeforeFirstMatch(String input, String[] garbageStrings) {
          // lower case input to test against lowercase strings
          String inputLowerCased = input.toLowerCase(Locale.US);
          // Lower-casing may lengthen the string (U+0130 turns into two chars); indices found
          // in the lower-cased copy are then shifted and must not be applied to the input.
          boolean indicesAligned = inputLowerCased.length() == input.length();

          int firstGarbage = input.length();

          for (String garbage : garbageStrings) {
              int garbageIndex = indicesAligned
                      ? inputLowerCased.indexOf(garbage)
                      : indexOfIgnoreCase(input, garbage);
              // if found, shrink to 0..index
              if (garbageIndex > -1 && garbageIndex < firstGarbage) {
                  firstGarbage = garbageIndex;
              }
          }

          // return substring from input -> keep case
          return input.substring(0, firstGarbage);
      }

      /** Case-insensitive indexOf working directly on the original string, -1 if absent. */
      private static int indexOfIgnoreCase(String haystack, String needle) {
          int needleLength = needle.length();
          int lastStart = haystack.length() - needleLength;
          for (int i = 0; i <= lastStart; i++) {
              if (haystack.regionMatches(true, i, needle, 0, needleLength)) {
                  return i;
              }
          }
          return -1;
      }

      /**
       * Replaces every occurrence of any of the given characters with a single replacement.
       *
       * @param input    string to process, may be null
       * @param badChars characters to replace, null or empty means nothing to do
       * @param newChar  replacement character
       * @return the input itself when nothing had to be replaced, otherwise a new string
       */
      public static String replaceAllChars(String input, char[] badChars, char newChar) {
          if (input == null || badChars == null || badChars.length == 0) {
              return input;
          }
          char[] buffer = input.toCharArray();
          boolean modified = false;
          for (int i = 0; i < buffer.length; i++) {
              for (char bad : badChars) {
                  if (buffer[i] == bad) {
                      buffer[i] = newChar;
                      modified = true;
                      break;
                  }
              }
          }
          return modified ? new String(buffer) : input;
      }

      /**
       * Replaces every match of the pattern in the input.
       *
       * @param input       string to process
       * @param replacement replacement, interpreted like in {@link Matcher#replaceAll(String)}
       * @param pattern     pattern to look for
       * @return the input with all matches replaced
       */
      public static String replaceAll(String input, String replacement, Pattern pattern) {
          return pattern.matcher(input).replaceAll(replacement);
      }

      /** Prints an empty line to stdout. */
      private static void println() {
          System.out.println();
      }

      /** Prints the text as is (no format processing) to stdout. */
      private static void println(String in) {
          System.out.println(in);
      }

      /** Prints the {@link String#format(String, Object...)} result to stdout. */
      private static void println(String in, Object... args) {
          System.out.println(String.format(in, args));
      }

      /**
       * Returns the last path segment without its extension.
       *
       * @param input path or plain file name
       * @return file name up to (excluding) the last dot; a name without dot, or one whose
       *         only dot is the first character, is returned unchanged
       */
      private static String getFileNameWithoutExtension(String input) {
          String name = new File(input).getName();
          int dotPos = name.lastIndexOf('.');
          // > 0: keep names like ".hidden" intact
          return dotPos > 0 ? name.substring(0, dotPos) : name;
      }
  }
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
./Furia a Bahia pour OSS 117 1965 1080 FR X264 AAC-mHDgz.mkv
./Oss 117 Se Dechaine 1963 1080P FR X264 AAC-mHDgz.mkv
./Banco a Bangkok pour OSS 117 1964 1080p FR X264 AAC-mHDgz.mkv
./Pas De Roses Pour Oss 117 (1968) HDTV - 1080p FR x264 ac3 mHDgz.mkv
./L'arme fatale - X264 - HD 720p - FR - ENG - SRT FR/L'arme fatale - X264 - HD 720p - FR - ENG - SRT FR.mkv
./L'arme fatale 2 - X264 - HD 720p - FR - ENG - SRT FR/L'arme fatale 2 - X264 - HD 720p - FR - ENG - SRT FR.mkv
./L'arme fatale 3 - X264 - HD 720p - FR - ENG - SRT FR/L'arme fatale 3 - X264 - HD 720p - FR - ENG - SRT FR.mkv
./L'arme fatale 4 - X264 - HD 720p - FR - ENG - SRT FR/L'arme fatale 4 - X264 - HD 720p - FR - ENG - SRT FR.mkv
100. Toy Story 3 (2010).mp4
101. Raiponce (2010).mp4
104. Rebelle (2012).mp4
./il_était_une_fois_les_découvreurs/19 - Marconi et les ondes.avi
./il_était_une_fois_les_découvreurs/06 - Gutenberg.avi
./20_courts_metrages_pixar
./Les_fous_du_volant/19 - Coupons les ponts.avi
./Les_fous_du_volant/14 - Quel cirque.avi
./Les_fous_du_volant/16 - Scions scions du bois.avi
./Tintin/16 - 17 - Les 7 boules de cristal - Tintin et le temple du soleil.avi
./Tintin/18 - Vol 714 pour Sidney.avi
./Tintin/11 - 12 - L'affaire Tournesol - Tintin et les Picaros.avi
./Tintin/05 - 06 - L'Ile noire - Le sceptre d'Ottokar.avi
./Tintin/09 - L'étoile Mysterieuse.avi
./Tintin/10 - Tintin Au Tibet.avi
./Pingu/16-Pingu's_Outing.avi
./Pingu/08-Pingu-Runs_Away.avi
./Pingu/06-Pingu-And_The_Snowball_Fight.avi
./Pingu/03-Pingu-Goes_Fishing.avi
./15_Films_(Georges_melies,1898-1909).avi
./Maryne_Monroe/7_ans_de_reflexion_(1955)-FRENCH-DVDrip-panisa.mkv
./36_quais_des_orpheves.avi
./Starship_Troopers_(Verhoeven_1997)-HDrip-panisa.mkv
./Mad Max 3 (1985) Au dela du dome du tonnerre VF+VOstfr HDrip.mkv
./foo/Green.Lantern.Emerald.Knights.2011.1080p.AC3.DTS.NL.Subs.DMT.mkv
./foo/Ultraviolet.REAL.PROPER.DVDSCR.XviD-PUKKA.avi
./foo/Underworld.avi
./foo/The.Incredibles.DVDSCR.XviD.cd1.avi
./foo/The.Adventures.of.Tintin.2011.720p.BluRay.x264-MHD.mkv
./foo/The.Chronicles.of.Riddick.DirCut.2004.720p.HDDVD.DTS.x264.Sample-ESiR.mkv
./Harry.Potter.And.The.Goblet.Of.Fire.2005.HDRip.XviD-TLF-CD2.srt
./TheAnimatrix.avi
./Harry.Potter.And.The.Goblet.Of.Fire.2005.HDRip.XviD-TLF-CD2.avi
compilation info
Main.java:13: error: class TestScraper is public, should be declared in a file named TestScraper.java
public class TestScraper {
       ^
1 error
stdout
Standard output is empty