Merge branch 'master' into bael-16656

This commit is contained in:
Josh Cummings
2019-10-26 15:37:05 -06:00
committed by GitHub
parent db85c8f275
commit 0be2175c89
20539 changed files with 1643630 additions and 0 deletions
+7
View File
@@ -0,0 +1,7 @@
## Apache Tika
This module contains articles about Apache Tika.
### Relevant articles:
- [Content Analysis with Apache Tika](https://www.baeldung.com/apache-tika)
+26
View File
@@ -0,0 +1,26 @@
<!-- Maven build descriptor for the apache-tika module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- No groupId is declared for this module, so Maven inherits it from the parent declared below. -->
<artifactId>apache-tika</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>apache-tika</name>
<!-- Parent POM of the surrounding multi-module build. -->
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<dependencies>
<!-- The only dependency declared directly by this module; its version comes from the tika.version property. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
</dependency>
</dependencies>
<properties>
<!-- Single place to change the Tika version referenced by the dependency above. -->
<tika.version>1.17</tika.version>
</properties>
</project>
@@ -0,0 +1,67 @@
package com.baeldung.tika;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Static helpers showing two ways of using Apache Tika for each task: the
 * lower-level {@link Detector}/{@link Parser} APIs and the {@link Tika} facade.
 *
 * <p>None of the methods close the stream they are given; that stays the
 * caller's responsibility.
 */
public class TikaAnalysis {

    /**
     * Detects the media type of a document using a {@link DefaultDetector}.
     *
     * @param stream document content
     * @return the detected media type in its string form
     * @throws IOException if the stream cannot be read
     */
    public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
        Detector detector = new DefaultDetector();
        Metadata metadata = new Metadata();

        // Detector.detect expects a stream that supports mark/reset so it can
        // peek at the leading bytes and rewind. Wrap streams that do not
        // (e.g. resources read from inside a jar) instead of failing on reset().
        InputStream markable = (stream == null || stream.markSupported())
                ? stream
                : new BufferedInputStream(stream);

        MediaType mediaType = detector.detect(markable, metadata);
        return mediaType.toString();
    }

    /**
     * Detects the media type of a document using the {@link Tika} facade.
     *
     * @param stream document content
     * @return the detected media type as returned by the facade
     * @throws IOException if the stream cannot be read
     */
    public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
        Tika tika = new Tika();
        return tika.detect(stream);
    }

    /**
     * Extracts the text of a document using an {@link AutoDetectParser} and a
     * {@link BodyContentHandler}.
     *
     * @param stream document content
     * @return the text collected by the content handler
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if the document cannot be parsed
     * @throws SAXException  if the content handler fails while receiving events
     */
    public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        parser.parse(stream, handler, metadata, context);
        return handler.toString();
    }

    /**
     * Extracts the text of a document using the {@link Tika} facade.
     *
     * @param stream document content
     * @return the extracted text
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if the document cannot be parsed
     */
    public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
        Tika tika = new Tika();
        return tika.parseToString(stream);
    }

    /**
     * Extracts the metadata of a document using an {@link AutoDetectParser}.
     * The text content is parsed as well but discarded.
     *
     * @param stream document content
     * @return the metadata populated by the parser
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the content handler fails while receiving events
     * @throws TikaException if the document cannot be parsed
     */
    public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        parser.parse(stream, handler, metadata, context);
        return metadata;
    }

    /**
     * Extracts the metadata of a document using the {@link Tika} facade.
     *
     * @param stream document content
     * @return the metadata populated by the facade
     * @throws IOException   if the stream cannot be read
     * @throws TikaException declared for symmetry with the parser variant
     */
    public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
        Tika tika = new Tika();
        Metadata metadata = new Metadata();

        // parse() returns a Reader over the extracted text. Only the metadata
        // is needed here, so close the reader instead of leaking it.
        // NOTE(review): relies on the metadata being populated by the time
        // parse() returns, exactly as the previous code did - confirm for the
        // Tika version in use.
        try (Reader ignored = tika.parse(stream, metadata)) {
            return metadata;
        }
    }
}
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Logback configuration: sends every log event at INFO or above to the console. -->
<configuration>
<!-- Console appender referenced by the root logger below. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Layout: time of day, thread name, level padded to 5 characters, logger name shortened towards 36 characters, message, line break. -->
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
</pattern>
</encoder>
</appender>
<!-- Root logger: INFO threshold, output goes to STDOUT only. -->
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>
@@ -0,0 +1,79 @@
package com.baeldung.tika;
import static org.hamcrest.CoreMatchers.containsString;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;
import org.xml.sax.SAXException;
/**
 * Unit tests for {@link TikaAnalysis}, run against sample documents on the
 * test classpath.
 *
 * <p>Every stream is opened in a try-with-resources block so it is closed even
 * when an assertion fails or parsing throws.
 */
public class TikaUnitTest {

    /** Opens the named classpath resource; returns {@code null} when it does not exist. */
    private InputStream openResource(String name) {
        return this.getClass().getClassLoader().getResourceAsStream(name);
    }

    @Test
    public void whenUsingDetector_thenDocumentTypeIsReturned() throws IOException {
        // The expected type is PDF although the resource is named *.txt.
        try (InputStream stream = openResource("tika.txt")) {
            String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream);

            assertEquals("application/pdf", mediaType);
        }
    }

    @Test
    public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException {
        try (InputStream stream = openResource("tika.txt")) {
            String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream);

            assertEquals("application/pdf", mediaType);
        }
    }

    @Test
    public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException {
        try (InputStream stream = openResource("tika.docx")) {
            String content = TikaAnalysis.extractContentUsingParser(stream);

            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
            assertThat(content, containsString("detects and extracts metadata and text"));
        }
    }

    @Test
    public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
        try (InputStream stream = openResource("tika.docx")) {
            String content = TikaAnalysis.extractContentUsingFacade(stream);

            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
            assertThat(content, containsString("detects and extracts metadata and text"));
        }
    }

    @Test
    public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
        try (InputStream stream = openResource("tika.xlsx")) {
            Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);

            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
            assertEquals("Microsoft Office User", metadata.get("Author"));
        }
    }

    @Test
    public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
        try (InputStream stream = openResource("tika.xlsx")) {
            Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);

            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
            assertEquals("Microsoft Office User", metadata.get("Author"));
        }
    }
}
Binary file not shown.
Binary file not shown.
Binary file not shown.