Listing 6. Sample MapReduce Program to Test LZO in a Hadoop Cluster

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.hadoop.mapreduce.LzoTextInputFormat;

/**
 * MapReduce word count sample.
 * Input: LZO compressed file.
 * Run com.hadoop.compression.lzo.LzoIndexer or
 * com.hadoop.compression.lzo.DistributedLzoIndexer to create a .lzo.index
 * file and further improve the read speed of LZO compressed files.
 * If the input LZO files are indexed, the input format will take
 * advantage of it. The input file/directory is taken as the first
 * argument. The output directory is taken as the second argument.
 * Reuses a single LongWritable and Text instance in the mapper for efficiency.
 *
 * Usage: hadoop jar path/to/this.jar
 */
public class SimpleLZOWC extends Configured implements Tool {

    private SimpleLZOWC() {}

    public static class LzoWordCountMapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {

        private final LongWritable one = new LongWritable(1L);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJobName("Simple LZO Word Count");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setJarByClass(getClass());
        job.setMapperClass(LzoWordCountMapper.class);
        job.setCombinerClass(LongSumReducer.class);
        job.setReducerClass(LongSumReducer.class);

        // Use the custom LzoTextInputFormat class.
        job.setInputFormatClass(LzoTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SimpleLZOWC(), args);
        System.exit(exitCode);
    }
}
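To try the listing end to end, the LZO input can be indexed first and the job then submitted. A minimal sketch, assuming the hadoop-lzo jar sits at /path/to/hadoop-lzo.jar, the compiled sample is packaged as simple-lzo-wc.jar, and the compressed input lives under /data/input (all of these names are placeholders, not values from the article):

# Index a single LZO file so LzoTextInputFormat can split it
hadoop jar /path/to/hadoop-lzo.jar com.hadoop.compression.lzo.LzoIndexer /data/input/big_file.lzo

# Or index a directory of LZO files with a MapReduce job
hadoop jar /path/to/hadoop-lzo.jar com.hadoop.compression.lzo.DistributedLzoIndexer /data/input

# Run the word count: first argument is the input file/directory, second the output directory
hadoop jar simple-lzo-wc.jar SimpleLZOWC /data/input /data/output

If the .lzo.index files are present, LzoTextInputFormat generates one split per index block instead of one split per file, so the map phase parallelizes across the compressed data.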