Listing 7. Sample Program to Test LZO in Pig /** * Pig Word count Sample * Input File ~V LZO compressed file * Run com.hadoop.compression.lzo.LZOIndexer / * com.hadoop.compression.lzo. * DistributedLZOIndexer to create .lzo.index file to further * improve the read speed of LZO compressed files. * Output - output directory is taken as the second argument. * * To generate data for use with this word counter, run lzop * on a data file * Usage: PigLzoTest */ public class PigLzoTest { /** * @param args */ public static void main(String[] args) { try { PigServer pigServer = new PigServer("mapreduce"); pigServer.registerJar("lib/elephant-bird-2.0.jar"); pigServer.registerJar("lzotest.jar"); runWordCountQuery(pigServer, args[0], args[1]); } catch (IOException e) { e.printStackTrace(); } } /** * Pig Script for Word Count * @param pigServer * @throws IOException */ public static void runWordCountQuery(PigServer pigServer, String inputFile, String outputFile) throws IOException { pigServer.registerQuery("A = load '" + inputFile + "';"); pigServer.registerQuery("B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word;"); pigServer.registerQuery("C = filter B by word matches '\\w+';"); pigServer.registerQuery("D = group C by word;"); pigServer.registerQuery("E = foreach D generate group as word, COUNT(C) as count;"); pigServer.registerQuery("F = order E by count desc;"); pigServer.registerQuery("store F into '" + outputFile + "' using com.hadoop.compression.lzo.LzoTextStorer();"); } }