BAEL-1829 - Overview of Apache Crunch
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
|
||||
|
||||
<id>job</id>
|
||||
<formats>
|
||||
<format>jar</format>
|
||||
</formats>
|
||||
<includeBaseDirectory>false</includeBaseDirectory>
|
||||
<dependencySets>
|
||||
<dependencySet>
|
||||
<unpack>false</unpack>
|
||||
<scope>runtime</scope>
|
||||
<outputDirectory>lib</outputDirectory>
|
||||
<excludes>
|
||||
<exclude>${groupId}:${artifactId}</exclude>
|
||||
</excludes>
|
||||
</dependencySet>
|
||||
<dependencySet>
|
||||
<unpack>true</unpack>
|
||||
<includes>
|
||||
<include>${groupId}:${artifactId}</include>
|
||||
</includes>
|
||||
</dependencySet>
|
||||
</dependencySets>
|
||||
</assembly>
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.baeldung.crunch;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.crunch.FilterFn;
|
||||
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
|
||||
/**
|
||||
* A filter that removes known stop words.
|
||||
*/
|
||||
public class StopWordFilter extends FilterFn<String> {
|
||||
|
||||
// English stop words, borrowed from Lucene.
|
||||
private static final Set<String> STOP_WORDS = ImmutableSet
|
||||
.copyOf(new String[] { "a", "and", "are", "as", "at", "be", "but", "by",
|
||||
"for", "if", "in", "into", "is", "it", "no", "not", "of", "on",
|
||||
"or", "s", "such", "t", "that", "the", "their", "then", "there",
|
||||
"these", "they", "this", "to", "was", "will", "with" });
|
||||
|
||||
@Override
|
||||
public boolean accept(String word) {
|
||||
return !STOP_WORDS.contains(word);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.baeldung.crunch;
|
||||
|
||||
import org.apache.crunch.MapFn;
|
||||
|
||||
public class ToUpperCaseFn extends MapFn<String, String> {
|
||||
|
||||
@Override
|
||||
public String map(String input) {
|
||||
return input != null ? input.toUpperCase() : input;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.baeldung.crunch;
|
||||
|
||||
import org.apache.crunch.MapFn;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
public class ToUpperCaseWithCounterFn extends MapFn<String, String> {
|
||||
|
||||
@Override
|
||||
public String map(String input) {
|
||||
if (input == null) {
|
||||
return input;
|
||||
} else {
|
||||
String output = input.toUpperCase();
|
||||
if (!input.equals(output)) {
|
||||
increment("UpperCase", "modified");
|
||||
}
|
||||
return output;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package com.baeldung.crunch;
|
||||
|
||||
import org.apache.crunch.DoFn;
|
||||
import org.apache.crunch.Emitter;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
|
||||
/**
|
||||
* Splits a line of text, filtering known stop words.
|
||||
*/
|
||||
public class Tokenizer extends DoFn<String, String> {
|
||||
private static final Splitter SPLITTER = Splitter
|
||||
.onPattern("\\s+")
|
||||
.omitEmptyStrings();
|
||||
|
||||
@Override
|
||||
public void process(String line,
|
||||
Emitter<String> emitter) {
|
||||
for (String word : SPLITTER.split(line)) {
|
||||
emitter.emit(word);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package com.baeldung.crunch;
|
||||
|
||||
import org.apache.crunch.PCollection;
|
||||
import org.apache.crunch.PTable;
|
||||
import org.apache.crunch.Pipeline;
|
||||
import org.apache.crunch.PipelineResult;
|
||||
import org.apache.crunch.impl.mr.MRPipeline;
|
||||
import org.apache.crunch.types.writable.Writables;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
/**
|
||||
* A word count example for Apache Crunch, based on Crunch's example projects.
|
||||
*/
|
||||
public class WordCount extends Configured implements Tool {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ToolRunner.run(new Configuration(), new WordCount(), args);
|
||||
}
|
||||
|
||||
public int run(String[] args) throws Exception {
|
||||
|
||||
if (args.length != 2) {
|
||||
System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
|
||||
System.err.println();
|
||||
GenericOptionsParser.printGenericCommandUsage(System.err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
String inputPath = args[0];
|
||||
String outputPath = args[1];
|
||||
|
||||
// Create an object to coordinate pipeline creation and execution.
|
||||
Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
|
||||
|
||||
// Reference a given text file as a collection of Strings.
|
||||
PCollection<String> lines = pipeline.readTextFile(inputPath);
|
||||
|
||||
// Define a function that splits each line in a PCollection of Strings into
|
||||
// a PCollection made up of the individual words in the file.
|
||||
// The second argument sets the serialization format.
|
||||
PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());
|
||||
|
||||
// Take the collection of words and remove known stop words.
|
||||
PCollection<String> noStopWords = words.filter(new StopWordFilter());
|
||||
|
||||
// The count method applies a series of Crunch primitives and returns
|
||||
// a map of the unique words in the input PCollection to their counts.
|
||||
PTable<String, Long> counts = noStopWords.count();
|
||||
|
||||
// Instruct the pipeline to write the resulting counts to a text file.
|
||||
pipeline.writeTextFile(counts, outputPath);
|
||||
|
||||
// Execute the pipeline as a MapReduce.
|
||||
PipelineResult result = pipeline.done();
|
||||
|
||||
return result.succeeded() ? 0 : 1;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user