BAEL-1829 - Overview of Apache Crunch

This commit is contained in:
chandra
2018-08-26 22:07:55 -04:00
parent 858ce0ec47
commit 813bc3249f
14 changed files with 466 additions and 1 deletions
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
<id>job</id>
<formats>
<format>jar</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<unpack>false</unpack>
<scope>runtime</scope>
<outputDirectory>lib</outputDirectory>
<excludes>
<exclude>${groupId}:${artifactId}</exclude>
</excludes>
</dependencySet>
<dependencySet>
<unpack>true</unpack>
<includes>
<include>${groupId}:${artifactId}</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>
@@ -0,0 +1,25 @@
package com.baeldung.crunch;
import java.util.Set;
import org.apache.crunch.FilterFn;
import com.google.common.collect.ImmutableSet;
/**
* A filter that removes known stop words.
*/
public class StopWordFilter extends FilterFn<String> {
// English stop words, borrowed from Lucene.
private static final Set<String> STOP_WORDS = ImmutableSet
.copyOf(new String[] { "a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it", "no", "not", "of", "on",
"or", "s", "such", "t", "that", "the", "their", "then", "there",
"these", "they", "this", "to", "was", "will", "with" });
@Override
public boolean accept(String word) {
return !STOP_WORDS.contains(word);
}
}
@@ -0,0 +1,11 @@
package com.baeldung.crunch;
import org.apache.crunch.MapFn;
public class ToUpperCaseFn extends MapFn<String, String> {
@Override
public String map(String input) {
return input != null ? input.toUpperCase() : input;
}
}
@@ -0,0 +1,20 @@
package com.baeldung.crunch;
import org.apache.crunch.MapFn;
@SuppressWarnings("serial")
public class ToUpperCaseWithCounterFn extends MapFn<String, String> {
@Override
public String map(String input) {
if (input == null) {
return input;
} else {
String output = input.toUpperCase();
if (!input.equals(output)) {
increment("UpperCase", "modified");
}
return output;
}
}
}
@@ -0,0 +1,23 @@
package com.baeldung.crunch;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import com.google.common.base.Splitter;
/**
* Splits a line of text, filtering known stop words.
*/
public class Tokenizer extends DoFn<String, String> {
private static final Splitter SPLITTER = Splitter
.onPattern("\\s+")
.omitEmptyStrings();
@Override
public void process(String line,
Emitter<String> emitter) {
for (String word : SPLITTER.split(line)) {
emitter.emit(word);
}
}
}
@@ -0,0 +1,62 @@
package com.baeldung.crunch;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* A word count example for Apache Crunch, based on Crunch's example projects.
*/
public class WordCount extends Configured implements Tool {
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new WordCount(), args);
}
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
System.err.println();
GenericOptionsParser.printGenericCommandUsage(System.err);
return 1;
}
String inputPath = args[0];
String outputPath = args[1];
// Create an object to coordinate pipeline creation and execution.
Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
// Reference a given text file as a collection of Strings.
PCollection<String> lines = pipeline.readTextFile(inputPath);
// Define a function that splits each line in a PCollection of Strings into
// a PCollection made up of the individual words in the file.
// The second argument sets the serialization format.
PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());
// Take the collection of words and remove known stop words.
PCollection<String> noStopWords = words.filter(new StopWordFilter());
// The count method applies a series of Crunch primitives and returns
// a map of the unique words in the input PCollection to their counts.
PTable<String, Long> counts = noStopWords.count();
// Instruct the pipeline to write the resulting counts to a text file.
pipeline.writeTextFile(counts, outputPath);
// Execute the pipeline as a MapReduce.
PipelineResult result = pipeline.done();
return result.succeeded() ? 0 : 1;
}
}