BAEL-1829 - Overview of Apache Crunch

2018-08-26 22:07:55 -04:00
parent 858ce0ec47
commit 813bc3249f
14 changed files with 466 additions and 1 deletions
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+
+  <id>job</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <unpack>false</unpack>
+      <scope>runtime</scope>
+      <outputDirectory>lib</outputDirectory>
+      <excludes>
+        <exclude>${groupId}:${artifactId}</exclude>
+      </excludes>
+    </dependencySet>
+    <dependencySet>
+      <unpack>true</unpack>
+      <includes>
+        <include>${groupId}:${artifactId}</include>
+      </includes>
+    </dependencySet>
+  </dependencySets>
+</assembly>
@@ -0,0 +1,25 @@
+package com.baeldung.crunch;
+
+import java.util.Set;
+
+import org.apache.crunch.FilterFn;
+
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * A filter that removes known stop words.
+ */
+public class StopWordFilter extends FilterFn<String> {
+
+    // English stop words, borrowed from Lucene.
+    private static final Set<String> STOP_WORDS = ImmutableSet
+        .copyOf(new String[] { "a", "and", "are", "as", "at", "be", "but", "by",
+                "for", "if", "in", "into", "is", "it", "no", "not", "of", "on",
+                "or", "s", "such", "t", "that", "the", "their", "then", "there",
+                "these", "they", "this", "to", "was", "will", "with" });
+
+    @Override
+    public boolean accept(String word) {
+        return !STOP_WORDS.contains(word);
+    }
+}
@@ -0,0 +1,11 @@
+package com.baeldung.crunch;
+
+import org.apache.crunch.MapFn;
+
+public class ToUpperCaseFn extends MapFn<String, String> {
+
+    @Override
+    public String map(String input) {
+        return input != null ? input.toUpperCase() : input;
+    }
+}
@@ -0,0 +1,20 @@
+package com.baeldung.crunch;
+
+import org.apache.crunch.MapFn;
+
+@SuppressWarnings("serial")
+public class ToUpperCaseWithCounterFn extends MapFn<String, String> {
+
+    @Override
+    public String map(String input) {
+        if (input == null) {
+            return input;
+        } else {
+            String output = input.toUpperCase();
+            if (!input.equals(output)) {
+                increment("UpperCase", "modified");
+            }
+            return output;
+        }
+    }
+}
@@ -0,0 +1,23 @@
+package com.baeldung.crunch;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+
+import com.google.common.base.Splitter;
+
+/**
+ * Splits a line of text, filtering known stop words.
+ */
+public class Tokenizer extends DoFn<String, String> {
+    private static final Splitter SPLITTER = Splitter
+        .onPattern("\\s+")
+        .omitEmptyStrings();
+
+    @Override
+    public void process(String line,
+        Emitter<String> emitter) {
+        for (String word : SPLITTER.split(line)) {
+            emitter.emit(word);
+        }
+    }
+}
@@ -0,0 +1,62 @@
+package com.baeldung.crunch;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * A word count example for Apache Crunch, based on Crunch's example projects.
+ */
+public class WordCount extends Configured implements Tool {
+
+    public static void main(String[] args) throws Exception {
+        ToolRunner.run(new Configuration(), new WordCount(), args);
+    }
+
+    public int run(String[] args) throws Exception {
+
+        if (args.length != 2) {
+            System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
+            System.err.println();
+            GenericOptionsParser.printGenericCommandUsage(System.err);
+            return 1;
+        }
+
+        String inputPath = args[0];
+        String outputPath = args[1];
+
+        // Create an object to coordinate pipeline creation and execution.
+        Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
+
+        // Reference a given text file as a collection of Strings.
+        PCollection<String> lines = pipeline.readTextFile(inputPath);
+
+        // Define a function that splits each line in a PCollection of Strings into
+        // a PCollection made up of the individual words in the file.
+        // The second argument sets the serialization format.
+        PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());
+
+        // Take the collection of words and remove known stop words.
+        PCollection<String> noStopWords = words.filter(new StopWordFilter());
+
+        // The count method applies a series of Crunch primitives and returns
+        // a map of the unique words in the input PCollection to their counts.
+        PTable<String, Long> counts = noStopWords.count();
+
+        // Instruct the pipeline to write the resulting counts to a text file.
+        pipeline.writeTextFile(counts, outputPath);
+
+        // Execute the pipeline as a MapReduce.
+        PipelineResult result = pipeline.done();
+
+        return result.succeeded() ? 0 : 1;
+    }
+}