Introduction to OpenNLP (#2024)

* Introduction to OpenNLP * Introduction to OpenNLP
2017-06-08 16:54:35 -04:00
parent 40dc547558
commit b00ec6abfd
11 changed files with 343 additions and 1 deletions
@@ -0,0 +1,166 @@
+package com.baeldung.opennlp;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.logging.Logger;
+
+import opennlp.tools.chunker.ChunkerME;
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.doccat.DoccatFactory;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+public class OpenNLP {
+
+    private final static Logger LOGGER = Logger.getLogger(OpenNLP.class.getName());
+    private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";
+    private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };
+
+    private DoccatModel docCatModel;
+
+    public static void main(String[] args) {
+        new OpenNLP();
+    }
+
+    public OpenNLP() {
+        try {
+            sentenceDetector();
+            tokenizer();
+            nameFinder();
+            locationFinder();
+            trainDocumentCategorizer();
+            documentCategorizer();
+            partOfSpeechTagger();
+            chunker();
+        } catch (InvalidFormatException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    public void sentenceDetector() throws InvalidFormatException, IOException {
+
+        InputStream is = new FileInputStream("OpenNLP/en-sent.bin");
+        SentenceModel model = new SentenceModel(is);
+        SentenceDetectorME sdetector = new SentenceDetectorME(model);
+        String sentences[] = sdetector.sentDetect(text);
+        Arrays.stream(sentences).forEach(LOGGER::info);
+        is.close();
+    }
+
+    public void tokenizer() throws InvalidFormatException, IOException {
+        InputStream is = new FileInputStream("OpenNLP/en-token.bin");
+        TokenizerModel model = new TokenizerModel(is);
+        Tokenizer tokenizer = new TokenizerME(model);
+        String tokens[] = tokenizer.tokenize(text);
+        Arrays.stream(tokens).forEach(LOGGER::info);
+        is.close();
+    }
+
+    public static void nameFinder() throws IOException {
+        InputStream is = new FileInputStream("OpenNLP/en-ner-person.bin");
+        TokenNameFinderModel model = new TokenNameFinderModel(is);
+        is.close();
+        NameFinderME nameFinder = new NameFinderME(model);
+        Span nameSpans[] = nameFinder.find(sentence);
+        String[] names = Span.spansToStrings(nameSpans, sentence);
+        Arrays.stream(names).forEach(LOGGER::info);
+    }
+
+    public static void locationFinder() throws IOException {
+        InputStream is = new FileInputStream("OpenNLP/en-ner-location.bin");
+        TokenNameFinderModel model = new TokenNameFinderModel(is);
+        is.close();
+        NameFinderME nameFinder = new NameFinderME(model);
+        Span locationSpans[] = nameFinder.find(sentence);
+        String[] locations = Span.spansToStrings(locationSpans, sentence);
+        Arrays.stream(locations).forEach(LOGGER::info);
+    }
+
+    public void trainDocumentCategorizer() {
+
+        try {
+            InputStreamFactory isf = new InputStreamFactory() {
+                public InputStream createInputStream() throws IOException {
+                    return new FileInputStream("OpenNLP/doc-cat.train");
+                }
+            };
+            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
+            ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+            DoccatFactory docCatFactory = new DoccatFactory();
+            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    public void documentCategorizer() {
+        DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
+        double[] outcomes = myCategorizer.categorize(sentence);
+        String category = myCategorizer.getBestCategory(outcomes);
+
+        if (category.equalsIgnoreCase("GOOD")) {
+            LOGGER.info("Document is positive :) ");
+        } else {
+            LOGGER.info("Document is negative :( ");
+        }
+    }
+
+    public static void partOfSpeechTagger() throws IOException {
+        try {
+            POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
+            POSTaggerME posTaggerME = new POSTaggerME(posModel);
+            InputStreamFactory isf = new InputStreamFactory() {
+                public InputStream createInputStream() throws IOException {
+                    return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
+                }
+            };
+            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
+            String line;
+            while ((line = lineStream.read()) != null) {
+                String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
+                String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
+                POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
+                LOGGER.info(posSample.toString());
+            }
+            lineStream.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    public static void chunker() throws IOException {
+        InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
+        ChunkerModel cModel = new ChunkerModel(is);
+        ChunkerME chunkerME = new ChunkerME(cModel);
+        String[] taggedSentence = new String[] {"Out", "of", "the", "night", "that", "covers", "me"};
+        String pos[] = new String[] { "IN", "IN", "DT", "NN", "WDT", "VBZ", "PRP"};
+        String chunks[] = chunkerME.chunk(taggedSentence, pos);
+        Arrays.stream(chunks).forEach(LOGGER::info);
+    }
+
+}