Listing 6. Sample MapReduce Program to Test LZO in a Hadoop Cluster

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.hadoop.mapreduce.LzoTextInputFormat;

/**
 * MapReduce word count sample.
 * Input: LZO compressed file.
 * Run com.hadoop.compression.lzo.LzoIndexer or
 * com.hadoop.compression.lzo.DistributedLzoIndexer to create a .lzo.index
 * file and further improve the read speed of LZO compressed files.
 * If the input LZO files are indexed, the input format will take
 * advantage of it. The input file/directory is taken as the first
 * argument. The output directory is taken as the second argument.
 * Reuses a single LongWritable and Text instance in the mapper for efficiency.
 *
 * Usage: hadoop jar path/to/this.jar
 */
public class SimpleLZOWC extends Configured implements Tool {

    private SimpleLZOWC() {}

    public static class LzoWordCountMapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {

        private final LongWritable one = new LongWritable(1L);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJobName("Simple LZO Word Count");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setJarByClass(getClass());
        job.setMapperClass(LzoWordCountMapper.class);
        job.setCombinerClass(LongSumReducer.class);
        job.setReducerClass(LongSumReducer.class);

        // Use the custom LzoTextInputFormat class.
        job.setInputFormatClass(LzoTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SimpleLZOWC(), args);
        System.exit(exitCode);
    }
}
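To try the listing end to end, the LZO input can be indexed first and the job then submitted. A minimal sketch, assuming the hadoop-lzo jar sits at /path/to/hadoop-lzo.jar, the compiled sample is packaged as simple-lzo-wc.jar, and the compressed input lives under /data/input (all of these names are placeholders, not values from the article):

# Index a single LZO file so LzoTextInputFormat can split it
hadoop jar /path/to/hadoop-lzo.jar com.hadoop.compression.lzo.LzoIndexer /data/input/big_file.lzo

# Or index a directory of LZO files with a MapReduce job
hadoop jar /path/to/hadoop-lzo.jar com.hadoop.compression.lzo.DistributedLzoIndexer /data/input

# Run the word count: first argument is the input file/directory, second the output directory
hadoop jar simple-lzo-wc.jar SimpleLZOWC /data/input /data/output

If the .lzo.index files are present, LzoTextInputFormat generates one split per index block instead of one split per file, so the map phase parallelizes across the compressed data.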