fork(22) download
  1. //printenv
  2. // JAVA_HOME=/usr/java/jdk1.7.0_67-cloudera
  3. // PATH=/usr/java/jdk1.7.0_67-cloudera/bin
  4. // export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
  5.  
  6. // hadoop com.sun.tools.javac.Main WordCount2.java
  7. // jar cf WordCount2.jar WordCount2*.class
  8. // hadoop jar WordCount2.jar WordCount2 big.txt outWCBig
  9. // hadoop jar WordCount2.jar WordCount2 t8.shakespeare.txt outWCShake
  10.  
  11. // t8.shakespeare
  12.  
  13. // Remove the previous results.
  14. // $ hadoop fs -rm -r -f /user/cloudera/wordcount/output
  15.  
  16.  
import java.io.*;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
// import org.apache.hadoop.mapreduce.jobWordCount;  // no such class exists — Job (above) is the real type
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.log4j.Logger;
  44.  
  45. public class WordCount2 extends Configured implements Tool
  46. {
  47. private static final Logger LOG = Logger.getLogger(WordCount2.class);
  48.  
  49. public static void main(String[] args) throws Exception {
  50. int exitCode = ToolRunner.run(new WordCount2(), args);
  51. System.exit(exitCode);
  52. }
  53.  
  54. public static boolean isNullOrEmpty(String str) {
  55. if(str != null && !str.trim().isEmpty())
  56. return false;
  57. return true;
  58. }
  59.  
  60.  
  61. public static class PunctuationMapper
  62. extends Mapper<Object, Text, NullWritable, Text> {
  63.  
  64. private Text punctd = new Text();
  65. //private static fin//al String PunctuationMarks="\"\'\\[\\]\\\\!$&@~#%:;`<>(){}/!|?*-+=^,.";
  66. private static final String PunctuationMarks="\"\\[\\]\\\\!$&@~#%:;`<>(){}/!|?*-+=^,.";
  67. //DON'T
  68. public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  69. String word= value.toString();
  70. word=word.replaceAll("-", "");
  71. //word=word.replaceAll("'", "");
  72. word=word.replaceAll("["+PunctuationMarks+"]", " ");
  73. //word = word.replaceAll("[^a-zA-Z\\s+]", " ").toLowerCase();
  74. //word = curr.trim();
  75. //punctd.set(Regex.Replace(word.toString(),[^\w\d\s-]," "));
  76. //String word_punc=word.replaceAll("[^\\w\\d\\s-]", " ");
  77. //word=word.replace( "/\s\s+/g", "" );// -> 390ms
  78. punctd.set(word);
  79. context.write(new IntWritable(1), punctd);
  80. }
  81. }
  82.  
  83. public static class TrimMapper
  84. extends Mapper<Object, Text, NullWritable, Text> {
  85.  
  86. private Text trimd = new Text();
  87. public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  88. String word= value.toString().trim();
  89. //word=word.replaceAll( "/\s\s+/g", "" );// -> 390ms
  90. word=word.replaceAll("^ +| +$|( )+", "$1");
  91. trimd.set(word);
  92. context.write(new IntWritable(1), trimd);
  93. }
  94. }
  95. public static class LowerCaseMapper
  96. extends Mapper<Object, Text, NullWritable, Text> {
  97.  
  98. private Text lowercased = new Text();
  99. public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  100. lowercased.set(value.toString().toLowerCase());
  101. context.write(new IntWritable(1), lowercased);
  102. }
  103. }
  104.  
  105. public static class TokenizerMapper
  106. extends Mapper<IntWritable, Text, Text, IntWritable>{
  107.  
  108. private static final java.util.regex.Pattern WORD_BOUNDARY = java.util.regex.Pattern.compile("\\s");
  109.  
  110. private final static IntWritable one = new IntWritable(1);
  111. private Text word = new Text();
  112.  
  113. public void map(IntWritable key, Text lineText, Context context
  114. ) throws IOException, InterruptedException {
  115. String line = lineText.toString();
  116. Text currentWord = new Text();
  117. for (String word : WORD_BOUNDARY.split(line)) {
  118. if (WordCount2.isNullOrEmpty(word)) {
  119. continue;
  120. }
  121. currentWord = new Text(word);
  122. context.write(currentWord,one);
  123. }
  124. }
  125. }
  126.  
  127. public static class IntSumReducer
  128. extends Reducer<Text,IntWritable,Text,IntWritable> {
  129. private IntWritable result = new IntWritable();
  130.  
  131. public void reduce(Text key_word, Iterable<IntWritable> counts,
  132. Context context
  133. ) throws IOException, InterruptedException {
  134. int sum = 0;
  135. for (IntWritable count : counts) {
  136. sum += count.get();
  137. }
  138. result.set(sum);
  139. context.write(key_word, result);
  140. }
  141. }
  142. //////////////////////////////////////
  143. public static class TopTenMapper extends Mapper<Object, Text, NullWritable, Text>
  144. {
  145. private TreeMap<Integer, Text> topN = new TreeMap<Integer, Text>(); //Collections.reverseOrder()
  146.  
  147. private final static IntWritable one = new IntWritable(1);
  148. private Text word = new Text();
  149. public void map(Object key, Text value, Context context)
  150. throws IOException, InterruptedException {
  151. // (word, count) tuple
  152. String[] words = value.toString().split("\t") ;
  153. if (words.length < 2) {
  154. return;
  155. }
  156.  
  157. topN.put(Integer.parseInt(words[1]), new Text(value));
  158.  
  159. if (topN.size() > 10) {
  160. topN.remove(topN.firstKey());
  161. //topN.remove(topN.lastKey());
  162. }
  163. }
  164.  
  165.  
  166. @Override
  167. protected void cleanup(Context context) throws IOException,
  168. InterruptedException {
  169. for (Text t : topN.values()) {
  170. context.write(NullWritable.get(), t);
  171. }
  172. }
  173. }
  174.  
  175. public static class TopTenReducer extends
  176. Reducer<NullWritable, Text, NullWritable, Text> {
  177.  
  178. private TreeMap<Integer, Text> topN = new TreeMap<Integer, Text>();
  179.  
  180. @Override
  181. public void reduce(NullWritable key, Iterable<Text> values,
  182. Context context) throws IOException, InterruptedException {
  183. for (Text value : values) {
  184. String[] words = value.toString().split("\t") ;
  185.  
  186. topN.put(Integer.parseInt(words[1]), new Text(value));
  187.  
  188. if (topN.size() > 10) {
  189. topN.remove(topN.firstKey());
  190. }
  191. }
  192.  
  193. for (Text word : topN.descendingMap().values()) {
  194. context.write(NullWritable.get(), word);
  195. }
  196. }
  197. }
  198. //////////////////////////////////////
  199.  
  200. public int run(String[] args) throws Exception {
  201. Configuration conf = getConf();
  202. //////////////
  203. FileSystem fs = FileSystem.get(conf);
  204. Path tmpPath = new Path("/w1/tmp");
  205. fs.delete(tmpPath, true);
  206.  
  207. Path inputPath = new Path(args[0]);
  208. //Path partitionFile = new Path(args[1] + "_partitions.lst");
  209. //Path outputStage = new Path(args[1] + "_staging");
  210. Path outputStage = new Path("/w1/tmp");
  211. Path outputOrder = new Path(args[1]);
  212. //////////////
  213.  
  214. args = new GenericOptionsParser(conf, args).getRemainingArgs();
  215.  
  216. // creating a word count jobWordCount
  217. jobWordCount jobWordCount = jobWordCount.getInstance(conf,"wordcount");
  218. //jobWordCount jobWordCount = jobWordCount.getInstance(getConf(), "wordcount");
  219. //jobWordCount.setJarByClass(WordCount2.class);
  220. jobWordCount.setJarByClass(this.getClass());
  221. // Use TextInputFormat, the default unless jobWordCount.setInputFormatClass is used
  222.  
  223. //public static class PunctuationMapper
  224. //extends Mapper<Object, Text, NullWritable, Text>
  225. Configuration punctuationMapperConf = new Configuration(false);
  226. ChainMapper.addMapper(jobWordCount,
  227. PunctuationMapper.class,
  228. Object.class, Text.class,
  229. IntWritable.class, Text.class,
  230. punctuationMapperConf);
  231.  
  232. //public static class TrimMapper
  233. //extends Mapper<Object, Text, NullWritable, Text>
  234. Configuration trimMapperConf = new Configuration(false);
  235. ChainMapper.addMapper(jobWordCount,
  236. TrimMapper.class,
  237. Object.class, Text.class,
  238. IntWritable.class, Text.class,
  239. trimMapperConf);
  240.  
  241. //public static class LowerCaseMapper
  242. //extends Mapper<Object, Text, NullWritable, Text>
  243. Configuration lowerCaseMapperConf = new Configuration(false);
  244. ChainMapper.addMapper(jobWordCount,
  245. LowerCaseMapper.class,
  246. Object.class, Text.class,
  247. //IntWritable.class, Text.class,
  248. IntWritable.class, Text.class,
  249. lowerCaseMapperConf);
  250.  
  251. //public static class TokenizerMapper
  252. // extends Mapper<IntWritable, Text, Text, IntWritable>
  253. Configuration tokenizerConf = new Configuration(false);
  254. ChainMapper.addMapper(jobWordCount,
  255. TokenizerMapper.class,
  256. IntWritable.class,Text.class,
  257. Text.class, IntWritable.class,
  258. tokenizerConf);
  259.  
  260. //public static class IntSumReducer
  261. //extends Reducer<Text,IntWritable,Text,IntWritable>
  262. jobWordCount.setReducerClass(IntSumReducer.class);
  263. jobWordCount.setOutputKeyClass(Text.class);
  264. jobWordCount.setOutputValueClass(IntWritable.class);
  265.  
  266. //FileInputFormat.addInputPath(jobWordCount, new Path(args[0]));
  267. TextInputFormat.setInputPaths(jobWordCount, inputPath);
  268.  
  269. //FileOutputFormat.setOutputPath(jobWordCount, new Path(args[1]));
  270. FileOutputFormat.setOutputPath(jobWordCount, tmpPath);
  271. // Set the output format to a sequence file
  272. //jobWordCount.setOutputFormatClass(SequenceFileOutputFormat.class);
  273. //SequenceFileOutputFormat.setOutputPath(jobWordCount, outputStage);
  274.  
  275. int code = jobWordCount.waitForCompletion(true) ? 0 : 1;
  276.  
  277. if (code == 0) {
  278.  
  279. //Now that we have extracted column to sort
  280.  
  281. Job orderJob = new Job(conf, "TopWords");
  282. orderJob.setJarByClass(WordCount2.class);
  283.  
  284. // Here, use the identity mapper to output the key/value pairs in
  285. // the SequenceFile
  286. orderJob.setMapperClass(TopTenMapper.class);
  287. orderJob.setReducerClass(TopTenReducer.class);
  288. //********************
  289. //public static class TopTenMapper
  290. //extends Mapper<Object, Text, NullWritable, Text>
  291.  
  292. jobB.setMapOutputKeyClass(NullWritable.class);
  293. jobB.setMapOutputValueClass(Text.class);
  294. //********************
  295. // Set the number of reduce tasks to an appropriate number for the
  296. // amount of data being sorted
  297. orderJob.setNumReduceTasks(10);
  298. // Use Hadoop's TotalOrderPartitioner class
  299. //orderJob.setPartitionerClass(TotalOrderPartitioner.class);
  300. // Set the partition file
  301. //TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(),
  302. // partitionFile);
  303. //********************
  304. //public static class TopTenReducer
  305. //extends Reducer<NullWritable, Text, NullWritable, Text>
  306.  
  307. //orderJob.setOutputKeyClass(Text.class);
  308. //orderJob.setOutputValueClass(IntWritable.class);
  309. //********************
  310. orderJob.setOutputKeyClass(NullWritable.class);
  311. orderJob.setOutputValueClass(Text.class);
  312. //********************
  313. // Set the input to the previous job's output
  314. //orderJob.setInputFormatClass(SequenceFileInputFormat.class);
  315. orderJob.setInputFormatClass(KeyValueTextInputFormat.class);
  316. orderJob.setOutputFormatClass(TextOutputFormat.class);
  317.  
  318. //SequenceFileInputFormat.setInputPaths(orderJob, outputStage);
  319. // Set the output path to the command line parameter
  320. TextOutputFormat.setOutputPath(orderJob, outputOrder);
  321. // Set the separator to an empty string
  322. //orderJob.getConfiguration().set(
  323. // "mapred.textoutputformat.separator", "");
  324. // Use the InputSampler to go through the output of the previous
  325. // job, sample it, and create the partition file
  326. //InputSampler.writePartitionFile(orderJob,
  327. // new InputSampler.RandomSampler(.1, 10000));
  328. FileInputFormat.setInputPaths(orderJob, tmpPath);
  329. FileOutputFormat.setOutputPath(orderJob, new Path(args[1]));
  330.  
  331. // Submit the job
  332. code = orderJob.waitForCompletion(true) ? 0 : 2;
  333. }
  334.  
  335. // Clean up the partition file and the staging directory
  336. // FileSystem.get(new Configuration()).delete(partitionFile, false);
  337. // FileSystem.get(new Configuration()).delete(outputStage, true);
  338. System.exit(code);
  339.  
  340. return (jobWordCount.waitForCompletion(true) ? 0 : 1);
  341. }
  342. }
  343.  
Success #stdin #stdout #stderr 0.01s 5552KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Error: near line 1: near "/": syntax error
Error: near line 18: near "import": syntax error
Error: near line 20: near "import": syntax error
Error: near line 21: near "import": syntax error
Error: near line 22: near "import": syntax error
Error: near line 23: near "import": syntax error
Error: near line 24: near "import": syntax error
Error: near line 25: near "import": syntax error
Error: near line 26: near "import": syntax error
Error: near line 27: near "import": syntax error
Error: near line 28: near "import": syntax error
Error: near line 29: near "import": syntax error
Error: near line 30: near "import": syntax error
Error: near line 31: near "import": syntax error
Error: near line 32: near "import": syntax error
Error: near line 33: near "import": syntax error
Error: near line 34: near "import": syntax error
Error: near line 37: near "import": syntax error
Error: near line 38: near "import": syntax error
Error: near line 39: near "import": syntax error
Error: near line 40: near "import": syntax error
Error: near line 41: near "import": syntax error
Error: near line 42: near "import": syntax error
Error: near line 43: near "import": syntax error
Error: near line 45: near "public": syntax error
Error: near line 49: near "public": syntax error
Error: near line 51: near "System": syntax error
Error: near line 52: unrecognized token: "}"
Error: near line 57: near "return": syntax error
Error: near line 58: unrecognized token: "}"
Error: near line 65: near "/": syntax error
Error: near line 70: near "word": syntax error
Error: near line 71: near "/": syntax error
Error: near line 72: near "word": syntax error
Error: near line 73: near "/": syntax error
Error: near line 74: near "/": syntax error
Error: near line 75: near "/": syntax error
Error: near line 76: near "/": syntax error
Error: near line 77: near "/": syntax error
Error: near line 79: near "context": syntax error
Error: near line 80: unrecognized token: "}"
Error: near line 87: near "public": syntax error
Error: near line 89: near "/": syntax error
Error: near line 91: near "trimd": syntax error
Error: near line 92: near "context": syntax error
Error: near line 93: unrecognized token: "}"
Error: near line 99: near "public": syntax error
Error: near line 101: near "context": syntax error
Error: near line 102: unrecognized token: "}"
Error: near line 110: near "private": syntax error
Error: near line 111: near "private": syntax error
Error: near line 113: near "public": syntax error
Error: near line 116: near "Text": syntax error
Error: near line 117: near "for": syntax error
Error: near line 120: unrecognized token: "}"
Error: near line 122: near "context": syntax error
Error: near line 123: unrecognized token: "}"
Error: near line 131: near "public": syntax error
Error: near line 135: near "for": syntax error
Error: near line 137: unrecognized token: "}"
Error: near line 139: near "context": syntax error
Error: near line 140: unrecognized token: "}"
Error: near line 148: near "private": syntax error
Error: near line 149: near "public": syntax error
Error: near line 153: near "if": syntax error
Error: near line 155: unrecognized token: "}"
Error: near line 159: near "if": syntax error
Error: near line 161: near "/": syntax error
Error: near line 162: unrecognized token: "}"
Error: near line 171: unrecognized token: "}"
Error: near line 180: near "@Override": syntax error
Error: near line 186: near "topN": syntax error
Error: near line 188: near "if": syntax error
Error: near line 190: unrecognized token: "}"
Error: near line 195: unrecognized token: "}"
Error: near line 202: near "/": syntax error
Error: near line 204: near "Path": syntax error
Error: near line 205: near "fs": syntax error
Error: near line 207: near "Path": syntax error
Error: near line 208: near "/": syntax error
Error: near line 209: near "/": syntax error
Error: near line 210: near "Path": syntax error
Error: near line 211: near "Path": syntax error
Error: near line 212: near "/": syntax error
Error: near line 216: near "/": syntax error
Error: near line 218: near "/": syntax error
Error: near line 219: near "/": syntax error
Error: near line 220: near "jobWordCount": syntax error
Error: near line 221: near "/": syntax error
Error: near line 226: near "ChainMapper": syntax error
Error: near line 232: near "/": syntax error
Error: near line 235: near "ChainMapper": syntax error
Error: near line 241: near "/": syntax error
Error: near line 244: near "ChainMapper": syntax error
Error: near line 251: near "/": syntax error
Error: near line 254: near "ChainMapper": syntax error
Error: near line 260: near "/": syntax error
Error: near line 263: near "jobWordCount": syntax error
Error: near line 264: near "jobWordCount": syntax error
Error: near line 266: near "/": syntax error
Error: near line 267: near "TextInputFormat": syntax error
Error: near line 269: near "/": syntax error
Error: near line 270: near "FileOutputFormat": syntax error
Error: near line 271: near "/": syntax error
Error: near line 273: near "/": syntax error
Error: near line 275: near "int": syntax error
Error: near line 277: near "if": syntax error
Error: near line 282: near "orderJob": syntax error
Error: near line 284: near "/": syntax error
Error: near line 287: near "orderJob": syntax error
Error: near line 288: near "/": syntax error