diff --git a/libraries/OpenNLP/PartOfSpeechTag.txt b/libraries/OpenNLP/PartOfSpeechTag.txt new file mode 100644 index 0000000000..fdd8238ec4 --- /dev/null +++ b/libraries/OpenNLP/PartOfSpeechTag.txt @@ -0,0 +1 @@ +Out of the night that covers me \ No newline at end of file diff --git a/libraries/OpenNLP/doc-cat.train b/libraries/OpenNLP/doc-cat.train new file mode 100644 index 0000000000..c457221ec6 --- /dev/null +++ b/libraries/OpenNLP/doc-cat.train @@ -0,0 +1,10 @@ +GOOD good morning / +GOOD good evening / +GOOD have a good day / +GOOD nice party! / +GOOD fine pants / +BAD nightmare volcano in the sea / +BAD darkest sky / +BAD greed and waste / +BAD army attacks / +BAD bomb explodes / \ No newline at end of file diff --git a/libraries/OpenNLP/en-chunker.bin b/libraries/OpenNLP/en-chunker.bin new file mode 100644 index 0000000000..65d9356888 Binary files /dev/null and b/libraries/OpenNLP/en-chunker.bin differ diff --git a/libraries/OpenNLP/en-ner-location.bin b/libraries/OpenNLP/en-ner-location.bin new file mode 100644 index 0000000000..f3788bc1f6 Binary files /dev/null and b/libraries/OpenNLP/en-ner-location.bin differ diff --git a/libraries/OpenNLP/en-ner-person.bin b/libraries/OpenNLP/en-ner-person.bin new file mode 100644 index 0000000000..2f68318203 Binary files /dev/null and b/libraries/OpenNLP/en-ner-person.bin differ diff --git a/libraries/OpenNLP/en-pos-maxent.bin b/libraries/OpenNLP/en-pos-maxent.bin new file mode 100644 index 0000000000..c8cae23c5f Binary files /dev/null and b/libraries/OpenNLP/en-pos-maxent.bin differ diff --git a/libraries/OpenNLP/en-sent.bin b/libraries/OpenNLP/en-sent.bin new file mode 100644 index 0000000000..e89076be5a Binary files /dev/null and b/libraries/OpenNLP/en-sent.bin differ diff --git a/libraries/OpenNLP/en-token.bin b/libraries/OpenNLP/en-token.bin new file mode 100644 index 0000000000..c417277ca7 Binary files /dev/null and b/libraries/OpenNLP/en-token.bin differ diff --git a/libraries/pom.xml b/libraries/pom.xml index 653f21d1f6..bc40514b2f 100644 --- a/libraries/pom.xml +++ b/libraries/pom.xml @@ -323,6 +323,13 @@ netty-all ${netty.version} + + + org.apache.opennlp + opennlp-tools + 1.8.0 + + 0.7.0 @@ -350,4 +357,4 @@ 4.1.10.Final - + \ No newline at end of file diff --git a/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java b/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java new file mode 100644 index 0000000000..b2fa8e629b --- /dev/null +++ b/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java @@ -0,0 +1,166 @@ +package com.baeldung.opennlp; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.logging.Logger; + +import opennlp.tools.chunker.ChunkerME; +import opennlp.tools.chunker.ChunkerModel; +import opennlp.tools.cmdline.postag.POSModelLoader; +import opennlp.tools.doccat.DoccatFactory; +import opennlp.tools.doccat.DoccatModel; +import opennlp.tools.doccat.DocumentCategorizerME; +import opennlp.tools.doccat.DocumentSample; +import opennlp.tools.doccat.DocumentSampleStream; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSSample; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.sentdetect.SentenceDetectorME; +import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.tokenize.Tokenizer; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.Span; +import opennlp.tools.util.TrainingParameters; + +public class OpenNLP { + + private final static Logger LOGGER = Logger.getLogger(OpenNLP.class.getName()); + private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south."; + private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." }; + + private DoccatModel docCatModel; + + public static void main(String[] args) { + new OpenNLP(); + } + + public OpenNLP() { + try { + sentenceDetector(); + tokenizer(); + nameFinder(); + locationFinder(); + trainDocumentCategorizer(); + documentCategorizer(); + partOfSpeechTagger(); + chunker(); + } catch (InvalidFormatException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void sentenceDetector() throws InvalidFormatException, IOException { + + InputStream is = new FileInputStream("OpenNLP/en-sent.bin"); + SentenceModel model = new SentenceModel(is); + SentenceDetectorME sdetector = new SentenceDetectorME(model); + String sentences[] = sdetector.sentDetect(text); + Arrays.stream(sentences).forEach(LOGGER::info); + is.close(); + } + + public void tokenizer() throws InvalidFormatException, IOException { + InputStream is = new FileInputStream("OpenNLP/en-token.bin"); + TokenizerModel model = new TokenizerModel(is); + Tokenizer tokenizer = new TokenizerME(model); + String tokens[] = tokenizer.tokenize(text); + Arrays.stream(tokens).forEach(LOGGER::info); + is.close(); + } + + public static void nameFinder() throws IOException { + InputStream is = new FileInputStream("OpenNLP/en-ner-person.bin"); + TokenNameFinderModel model = new TokenNameFinderModel(is); + is.close(); + NameFinderME nameFinder = new NameFinderME(model); + Span nameSpans[] = nameFinder.find(sentence); + String[] names = Span.spansToStrings(nameSpans, sentence); + Arrays.stream(names).forEach(LOGGER::info); + } + + public static void locationFinder() throws IOException { + InputStream is = new FileInputStream("OpenNLP/en-ner-location.bin"); + TokenNameFinderModel model = new TokenNameFinderModel(is); + is.close(); + NameFinderME nameFinder = new NameFinderME(model); + Span locationSpans[] = nameFinder.find(sentence); + String[] locations = Span.spansToStrings(locationSpans, sentence); + Arrays.stream(locations).forEach(LOGGER::info); + } + + public void trainDocumentCategorizer() { + + try { + InputStreamFactory isf = new InputStreamFactory() { + public InputStream createInputStream() throws IOException { + return new FileInputStream("OpenNLP/doc-cat.train"); + } + }; + ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8"); + ObjectStream sampleStream = new DocumentSampleStream(lineStream); + DoccatFactory docCatFactory = new DoccatFactory(); + docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void documentCategorizer() { + DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel); + double[] outcomes = myCategorizer.categorize(sentence); + String category = myCategorizer.getBestCategory(outcomes); + + if (category.equalsIgnoreCase("GOOD")) { + LOGGER.info("Document is positive :) "); + } else { + LOGGER.info("Document is negative :( "); + } + } + + public static void partOfSpeechTagger() throws IOException { + try { + POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin")); + POSTaggerME posTaggerME = new POSTaggerME(posModel); + InputStreamFactory isf = new InputStreamFactory() { + public InputStream createInputStream() throws IOException { + return new FileInputStream("OpenNLP/PartOfSpeechTag.txt"); + } + }; + ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8"); + String line; + while ((line = lineStream.read()) != null) { + String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line); + String[] tags = posTaggerME.tag(whitespaceTokenizerLine); + POSSample posSample = new POSSample(whitespaceTokenizerLine, tags); + LOGGER.info(posSample.toString()); + } + lineStream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void chunker() throws IOException { + InputStream is = new FileInputStream("OpenNLP/en-chunker.bin"); + ChunkerModel cModel = new ChunkerModel(is); + ChunkerME chunkerME = new ChunkerME(cModel); + String[] taggedSentence = new String[] {"Out", "of", "the", "night", "that", "covers", "me"}; + String pos[] = new String[] { "IN", "IN", "DT", "NN", "WDT", "VBZ", "PRP"}; + String chunks[] = chunkerME.chunk(taggedSentence, pos); + Arrays.stream(chunks).forEach(LOGGER::info); + } + +} diff --git a/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java b/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java new file mode 100644 index 0000000000..a38791fd61 --- /dev/null +++ b/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java @@ -0,0 +1,158 @@ +package com.baeldung.opennlp; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +import org.junit.Test; + +import opennlp.tools.chunker.ChunkerME; +import opennlp.tools.chunker.ChunkerModel; +import opennlp.tools.cmdline.postag.POSModelLoader; +import opennlp.tools.doccat.DoccatFactory; +import opennlp.tools.doccat.DoccatModel; +import opennlp.tools.doccat.DocumentCategorizerME; +import opennlp.tools.doccat.DocumentSample; +import opennlp.tools.doccat.DocumentSampleStream; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSSample; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.sentdetect.SentenceDetectorME; +import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.Span; +import opennlp.tools.util.TrainingParameters; + +public class OpenNLPTests { + + private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south."; + private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." }; + + @Test + public void givenText_WhenDetectSentences_ThenCountSentences(){ + InputStream is; + SentenceModel model; + try { + is = new FileInputStream("OpenNLP/en-sent.bin"); + model = new SentenceModel(is); + SentenceDetectorME sdetector = new SentenceDetectorME(model); + String sentences[] = sdetector.sentDetect(text); + assertEquals(4, sentences.length); + is.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + public void givenText_WhenDetectTokens_ThenVerifyNames(){ + InputStream is; + TokenNameFinderModel model; + try { + is = new FileInputStream("OpenNLP/en-ner-person.bin"); + model = new TokenNameFinderModel(is); + is.close(); + NameFinderME nameFinder = new NameFinderME(model); + Span nameSpans[] = nameFinder.find(sentence); + String[] names = Span.spansToStrings(nameSpans, sentence); + assertEquals(1, names.length); + assertEquals("James Jordan", names[0]); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + public void givenText_WhenDetectTokens_ThenVerifyLocations(){ + InputStream is; + TokenNameFinderModel model; + try { + is = new FileInputStream("OpenNLP/en-ner-location.bin"); + model = new TokenNameFinderModel(is); + is.close(); + NameFinderME nameFinder = new NameFinderME(model); + Span locationSpans[] = nameFinder.find(sentence); + String[] locations = Span.spansToStrings(locationSpans, sentence); + assertEquals(1, locations.length); + assertEquals("Oklahoma", locations[0]); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + public void givenText_WhenCategorizeDocument_ThenVerifyDocumentContent(){ + DoccatModel docCatModel; + try { + InputStreamFactory isf = new InputStreamFactory() { + public InputStream createInputStream() throws IOException { + return new FileInputStream("OpenNLP/doc-cat.train"); + } + }; + ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8"); + ObjectStream sampleStream = new DocumentSampleStream(lineStream); + DoccatFactory docCatFactory = new DoccatFactory(); + docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory); + DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel); + double[] outcomes = myCategorizer.categorize(sentence); + String category = myCategorizer.getBestCategory(outcomes); + assertEquals("GOOD", category); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + public void givenText_WhenTagDocument_ThenVerifyTaggedString(){ + try { + POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin")); + POSTaggerME posTaggerME = new POSTaggerME(posModel); + InputStreamFactory isf = new InputStreamFactory() { + public InputStream createInputStream() throws IOException { + return new FileInputStream("OpenNLP/PartOfSpeechTag.txt"); + } + }; + ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8"); + String line; + while ((line = lineStream.read()) != null) { + String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line); + String[] tags = posTaggerME.tag(whitespaceTokenizerLine); + POSSample posSample = new POSSample(whitespaceTokenizerLine, tags); + assertEquals("Out_IN of_IN the_DT night_NN that_WDT covers_VBZ me_PRP", posSample.toString()); + } + lineStream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + public void givenText_WhenChunked_ThenCountChunks(){ + try { + InputStream is = new FileInputStream("OpenNLP/en-chunker.bin"); + ChunkerModel cModel = new ChunkerModel(is); + ChunkerME chunkerME = new ChunkerME(cModel); + String pos[] = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN", "VBD"}; + String chunks[] = chunkerME.chunk(sentence, pos); + assertEquals(7, chunks.length); + } catch (IOException e) { + e.printStackTrace(); + } + } + +}