Spark graphx moved to a new project due to a incompatibility of scala compiler

This commit is contained in:
Norberto Ritzmann Jr
2019-10-06 21:57:54 +02:00
parent db85c8f275
commit 45be61376e
20299 changed files with 1635696 additions and 0 deletions
@@ -0,0 +1,40 @@
package com.baeldung;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class WordCount {
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Usage: JavaWordCount <file>");
System.exit(1);
}
SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
.setMaster("local");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
JavaRDD<String> lines = ctx.textFile(args[0], 1);
JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2)->i1 + i2);
List<Tuple2<String, Integer>> output = wordWithCount.collect();
for (Tuple2<?, ?> tuple : output) {
System.out.println(tuple._1() + ": " + tuple._2());
}
ctx.stop();
}
}
@@ -0,0 +1,25 @@
package com.baeldung.data.pipeline;
import java.io.Serializable;
public class Word implements Serializable {
private static final long serialVersionUID = 1L;
private String word;
private int count;
Word(String word, int count) {
this.word = word;
this.count = count;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
}
@@ -0,0 +1,80 @@
package com.baeldung.data.pipeline;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.javaFunctions;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;
public class WordCountingApp {
public static void main(String[] args) throws InterruptedException {
Logger.getLogger("org")
.setLevel(Level.OFF);
Logger.getLogger("akka")
.setLevel(Level.OFF);
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "localhost:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Collection<String> topics = Arrays.asList("messages");
SparkConf sparkConf = new SparkConf();
sparkConf.setMaster("local[2]");
sparkConf.setAppName("WordCountingApp");
sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");
JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));
JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());
JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
.iterator());
JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
.reduceByKey((i1, i2) -> i1 + i2);
wordCounts.foreachRDD(javaRdd -> {
Map<String, Integer> wordCountMap = javaRdd.collectAsMap();
for (String key : wordCountMap.keySet()) {
List<Word> wordList = Arrays.asList(new Word(key, wordCountMap.get(key)));
JavaRDD<Word> rdd = streamingContext.sparkContext()
.parallelize(wordList);
javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
.saveToCassandra();
}
});
streamingContext.start();
streamingContext.awaitTermination();
}
}
@@ -0,0 +1,97 @@
package com.baeldung.data.pipeline;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.javaFunctions;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;
public class WordCountingAppWithCheckpoint {
public static JavaSparkContext sparkContext;
public static void main(String[] args) throws InterruptedException {
Logger.getLogger("org")
.setLevel(Level.OFF);
Logger.getLogger("akka")
.setLevel(Level.OFF);
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "localhost:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Collection<String> topics = Arrays.asList("messages");
SparkConf sparkConf = new SparkConf();
sparkConf.setMaster("local[2]");
sparkConf.setAppName("WordCountingAppWithCheckpoint");
sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");
JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
sparkContext = streamingContext.sparkContext();
streamingContext.checkpoint("./.checkpoint");
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));
JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());
JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
.iterator());
JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
.reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);
JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
Tuple2<String, Integer> output = new Tuple2<>(word, sum);
state.update(sum);
return output;
}));
cumulativeWordCounts.foreachRDD(javaRdd -> {
List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
for (Tuple2<String, Integer> tuple : wordCountList) {
List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
.saveToCassandra();
}
});
streamingContext.start();
streamingContext.awaitTermination();
}
}
@@ -0,0 +1,111 @@
package com.baeldung.ml;
import java.util.HashMap;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import scala.Tuple2;
public class MachineLearningApp {
public static void main(String[] args) {
// 1. Setting the Spark Context
SparkConf conf = new SparkConf().setAppName("Main")
.setMaster("local[2]")
.set("spark.executor.memory", "3g")
.set("spark.driver.memory", "3g");
JavaSparkContext sc = new JavaSparkContext(conf);
Logger.getLogger("org")
.setLevel(Level.OFF);
Logger.getLogger("akka")
.setLevel(Level.OFF);
// 2. Loading the Data-set
String dataFile = "data\\iris.data";
JavaRDD<String> data = sc.textFile(dataFile);
// 3. Exploratory Data Analysis
// 3.1. Creating Vector of Input Data
JavaRDD<Vector> inputData = data.map(line -> {
String[] parts = line.split(",");
double[] v = new double[parts.length - 1];
for (int i = 0; i < parts.length - 1; i++) {
v[i] = Double.parseDouble(parts[i]);
}
return Vectors.dense(v);
});
// 3.2. Performing Statistical Analysis
MultivariateStatisticalSummary summary = Statistics.colStats(inputData.rdd());
System.out.println("Summary Mean:");
System.out.println(summary.mean());
System.out.println("Summary Variance:");
System.out.println(summary.variance());
System.out.println("Summary Non-zero:");
System.out.println(summary.numNonzeros());
// 3.3. Performing Correlation Analysis
Matrix correlMatrix = Statistics.corr(inputData.rdd(), "pearson");
System.out.println("Correlation Matrix:");
System.out.println(correlMatrix.toString());
// 4. Data Preparation
// 4.1. Creating Map for Textual Output Labels
Map<String, Integer> map = new HashMap<String, Integer>();
map.put("Iris-setosa", 0);
map.put("Iris-versicolor", 1);
map.put("Iris-virginica", 2);
// 4.2. Creating LabeledPoint of Input and Output Data
JavaRDD<LabeledPoint> parsedData = data.map(line -> {
String[] parts = line.split(",");
double[] v = new double[parts.length - 1];
for (int i = 0; i < parts.length - 1; i++) {
v[i] = Double.parseDouble(parts[i]);
}
return new LabeledPoint(map.get(parts[parts.length - 1]), Vectors.dense(v));
});
// 5. Data Splitting into 80% Training and 20% Test Sets
JavaRDD<LabeledPoint>[] splits = parsedData.randomSplit(new double[] { 0.8, 0.2 }, 11L);
JavaRDD<LabeledPoint> trainingData = splits[0].cache();
JavaRDD<LabeledPoint> testData = splits[1];
// 6. Modeling
// 6.1. Model Training
LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(3)
.run(trainingData.rdd());
// 6.2. Model Evaluation
JavaPairRDD<Object, Object> predictionAndLabels = testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label()));
MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
double accuracy = metrics.accuracy();
System.out.println("Model Accuracy on Test Data: " + accuracy);
// 7. Model Saving and Loading
// 7.1. Model Saving
model.save(sc.sc(), "model\\logistic-regression");
// 7.2. Model Loading
LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc.sc(), "model\\logistic-regression");
// 7.3. Prediction on New Data
Vector newData = Vectors.dense(new double[] { 1, 1, 1, 1 });
double prediction = sameModel.predict(newData);
System.out.println("Model Prediction on New Data = " + prediction);
// 8. Clean-up
sc.close();
}
}