Listing 7. Sample Program to Test LZO in Pig

/**
 * Pig Word count Sample
 * Input File - LZO compressed file
 * Run com.hadoop.compression.lzo.LzoIndexer or
 * com.hadoop.compression.lzo.DistributedLzoIndexer
 * to create a .lzo.index file to further
 * improve the read speed of LZO compressed files.
 * Output - output directory is taken as the second argument.
 *
 * To generate data for use with this word counter, run lzop
 * on a data file
 * Usage: PigLzoTest <input-file> <output-folder>
 */
public class PigLzoTest {

    /** Command-line synopsis printed when required arguments are missing. */
    private static final String USAGE = "Usage: PigLzoTest <input-file> <output-folder>";

    /**
     * Entry point: starts a PigServer in "mapreduce" mode, registers the
     * required jars and runs the word count query.
     *
     * <p>Exits with status 1 when fewer than two arguments are supplied or
     * when the Pig job fails with an {@link IOException}.
     *
     * @param args {@code args[0]} is the input file, {@code args[1]} is the
     *             output folder; any further arguments are ignored
     */
    public static void main(String[] args) {
        // Fail fast with a readable message instead of an
        // ArrayIndexOutOfBoundsException further down.
        if (args == null || args.length < 2) {
            System.err.println(USAGE);
            System.exit(1);
            return;
        }

        PigServer pigServer = null;
        boolean failed = false;
        try {
            pigServer = new PigServer("mapreduce");
            pigServer.registerJar("lib/elephant-bird-2.0.jar");
            pigServer.registerJar("lzotest.jar");

            runWordCountQuery(pigServer, args[0], args[1]);
        } catch (IOException e) {
            e.printStackTrace();
            failed = true;
        } finally {
            // Release whatever the server acquired, on success and failure alike.
            if (pigServer != null) {
                pigServer.shutdown();
            }
        }

        // Report the failure to the caller (shell, scheduler) via the exit status.
        if (failed) {
            System.exit(1);
        }
    }

    /**
     * Registers the Pig Latin statements of a word count: load the input,
     * split each record into words, keep only tokens matching {@code \w+},
     * group and count them, order by count descending and store the result.
     *
     * <p>{@code inputFile} and {@code outputFile} are inserted verbatim
     * between single quotes in the Pig statements; they are not escaped.
     *
     * @param pigServer  server on which the statements are registered
     * @param inputFile  path passed to the Pig {@code load} statement
     * @param outputFile path passed to the Pig {@code store} statement
     * @throws IOException if registering any of the statements fails
     */
    public static void runWordCountQuery(PigServer pigServer, String inputFile, String outputFile) throws IOException {
        pigServer.registerQuery("A = load '" + inputFile + "';");
        pigServer.registerQuery("B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word;");
        // The Java literal "\\w+" is handed to Pig as \w+.
        pigServer.registerQuery("C = filter B by word matches '\\w+';");
        pigServer.registerQuery("D = group C by word;");
        pigServer.registerQuery("E = foreach D generate group as word, COUNT(C) as count;");
        pigServer.registerQuery("F = order E by count desc;");

        // NOTE(review): the storer class is presumably supplied by one of the
        // jars registered in main - confirm it exists under this package name.
        pigServer.registerQuery("store F into '" + outputFile + "' using com.hadoop.compression.lzo.LzoTextStorer();");
    }
}