Skip to content

Commit e9c2edb

Browse files
tballison and Nicholas DiPiazza authored
Tika grpc mv take3 (#1854)
* TIKA-4181 - grpc server and client * latest updates - wip * code formatting * fix the issues with deps and such * add bidi streaming * clean up wording * make it as a stale connection expiring store * clean * delete dead code * add closeable * add closeable * add closeable * fixed issues related to proto lint * example docker file * example docker file and install docker file * fix the rm * more info * --platform linux/arm64 * --platform linux/arm64 * add work around comment for future * add comment * fix issues with files in wrong dir * add quotes * add a few more cleanups * add logging add reflection api for grpc so you can use grpc curl * fix issues and add tests for get, delete * push latest fixes * add start of a test scenario * add a fully functional http test case * add mtls as an option * add mtls in the example * add some robustness and add an exec * Fix issues with the fetch metadata * TIKA-4252: fix metadata issue * TIKA-4252: fix issue with config param serialization. * TIKA-4252: add error path * TIKA-4252: add protection against null metadata * TIKA-4252: fix merge conflicts from main * TIKA-4252: add json schema methods * TIKA-4252: fix broken tests, useless method * TIKA-4252: remove stupid sleep from test * TIKA-4252: fix checkstyle issue * TIKA-4252: log violations to console * TIKA-4252: log violations to console * TIKA-4252: exclude generated sources * TIKA-4252: if tika config is read-only, use a tmp file for tika server so that it can be modified * TIKA-4252: if tika config is read-only, use a tmp file for tika server so that it can be modified * add a health check * stop sending "metadata json" instead send a copy of the fetch configuration json that can add or override the main fetcher config * fix merge conflicts * fix issue with additional headers * move tika-grpc where it belongs inside the tika-server, not in the pipes project. 
* make params align with tika server and make them optional * merge TIKA-4181 and mv tika-grpc to higher level module * expand bounds in rtf parser test to deal with locally installed tesseract and exiftool --------- Co-authored-by: Nicholas DiPiazza <[email protected]>
1 parent e5a0265 commit e9c2edb

File tree

29 files changed

+120
-122
lines changed

29 files changed

+120
-122
lines changed

pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
<module>tika-batch</module>
4646
<module>tika-langdetect</module>
4747
<module>tika-pipes</module>
48+
<module>tika-grpc</module>
4849
<module>tika-app</module>
4950
<module>tika-server</module>
5051
<module>tika-integration-tests</module>
File renamed without changes.

tika-pipes/tika-grpc/pom.xml renamed to tika-grpc/pom.xml

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929

3030
<parent>
3131
<groupId>org.apache.tika</groupId>
32-
<artifactId>tika-pipes</artifactId>
32+
<artifactId>tika-parent</artifactId>
3333
<version>3.0.0-SNAPSHOT</version>
34-
<relativePath>../pom.xml</relativePath>
34+
<relativePath>../tika-parent/pom.xml</relativePath>
3535
</parent>
3636

3737
<properties>
@@ -353,6 +353,99 @@
353353
<mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
354354
</configuration>
355355
</plugin>
356+
<plugin>
357+
<artifactId>maven-shade-plugin</artifactId>
358+
<version>${maven.shade.version}</version>
359+
<executions>
360+
<execution>
361+
<phase>package</phase>
362+
<goals>
363+
<goal>shade</goal>
364+
</goals>
365+
<configuration>
366+
<createDependencyReducedPom>
367+
false
368+
</createDependencyReducedPom>
369+
<artifactSet>
370+
<excludes>
371+
</excludes>
372+
</artifactSet>
373+
<filters>
374+
<filter>
375+
<artifact>*:*</artifact>
376+
<excludes>
377+
<exclude>module-info.class</exclude>
378+
<exclude>META-INF/versions/9/module-info.class</exclude>
379+
<exclude>META-INF/*.SF</exclude>
380+
<exclude>META-INF/*.DSA</exclude>
381+
<exclude>META-INF/*.RSA</exclude>
382+
<exclude>META-INF/*.txt</exclude>
383+
<exclude>META-INF/ASL2.0</exclude>
384+
<exclude>META-INF/DEPENDENCIES</exclude>
385+
<exclude>META-INF/LICENSE</exclude>
386+
<exclude>META-INF/NOTICE</exclude>
387+
<exclude>META-INF/README</exclude>
388+
<exclude>META-INF/MANIFEST.MF</exclude>
389+
<exclude>LICENSE.txt</exclude>
390+
<exclude>NOTICE.txt</exclude>
391+
<exclude>CHANGES</exclude>
392+
<exclude>README</exclude>
393+
<exclude>builddef.lst</exclude>
394+
</excludes>
395+
</filter>
396+
</filters>
397+
<transformers>
398+
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
399+
<mainClass>org.apache.tika.pipes.grpc.TikaGrpcServer</mainClass>
400+
<manifestEntries>
401+
<Multi-Release>true</Multi-Release>
402+
</manifestEntries>
403+
</transformer>
404+
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
405+
<resource>META-INF/LICENSE</resource>
406+
<file>target/classes/META-INF/LICENSE</file>
407+
</transformer>
408+
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
409+
<resource>META-INF/NOTICE</resource>
410+
<file>target/classes/META-INF/NOTICE</file>
411+
</transformer>
412+
</transformers>
413+
</configuration>
414+
</execution>
415+
</executions>
416+
</plugin>
417+
<plugin>
418+
<groupId>org.apache.maven.plugins</groupId>
419+
<artifactId>maven-checkstyle-plugin</artifactId>
420+
<version>${checkstyle.plugin.version}</version>
421+
<dependencies>
422+
<dependency>
423+
<groupId>com.puppycrawl.tools</groupId>
424+
<artifactId>checkstyle</artifactId>
425+
<version>${puppycrawl.version}</version>
426+
</dependency>
427+
</dependencies>
428+
<executions>
429+
<execution>
430+
<id>validate</id>
431+
<phase>validate</phase>
432+
<configuration>
433+
<configLocation>checkstyle.xml</configLocation>
434+
<inputEncoding>UTF-8</inputEncoding>
435+
<consoleOutput>true</consoleOutput>
436+
<logViolationsToConsole>true</logViolationsToConsole>
437+
<includeTestSourceDirectory>true</includeTestSourceDirectory>
438+
<testSourceDirectories>${project.basedir}/src/test/java</testSourceDirectories>
439+
<violationSeverity>error</violationSeverity>
440+
<failOnViolation>true</failOnViolation>
441+
<excludeGeneratedSources>true</excludeGeneratedSources>
442+
</configuration>
443+
<goals>
444+
<goal>check</goal>
445+
</goals>
446+
</execution>
447+
</executions>
448+
</plugin>
356449
</plugins>
357450
</build>
358451
</project>

tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java renamed to tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
import static io.grpc.health.v1.HealthCheckResponse.ServingStatus;
2020

2121
import java.io.File;
22+
import java.io.FileWriter;
23+
import java.nio.charset.StandardCharsets;
2224
import java.util.concurrent.TimeUnit;
2325

2426
import com.beust.jcommander.JCommander;
@@ -33,16 +35,20 @@
3335
import org.slf4j.Logger;
3436
import org.slf4j.LoggerFactory;
3537

38+
import org.apache.tika.config.TikaConfig;
39+
import org.apache.tika.config.TikaConfigSerializer;
40+
3641
/**
3742
* Server that manages startup/shutdown of the GRPC Tika server.
3843
*/
3944
public class TikaGrpcServer {
4045
private static final Logger LOGGER = LoggerFactory.getLogger(TikaGrpcServer.class);
46+
public static final int TIKA_SERVER_GRPC_DEFAULT_PORT = 50052;
4147
private Server server;
42-
@Parameter(names = {"-p", "--port"}, description = "The grpc server port", help = true, required = true)
43-
private Integer port;
48+
@Parameter(names = {"-p", "--port"}, description = "The grpc server port", help = true)
49+
private Integer port = TIKA_SERVER_GRPC_DEFAULT_PORT;
4450

45-
@Parameter(names = {"-t", "--tika-config"}, description = "The grpc server port", help = true, required = true)
51+
@Parameter(names = {"-c", "--config"}, description = "The grpc server port", help = true)
4652
private File tikaConfigXml;
4753

4854
@Parameter(names = {"-s", "--secure"}, description = "Enable credentials required to access this grpc server")
@@ -82,13 +88,20 @@ public void start() throws Exception {
8288
} else {
8389
creds = InsecureServerCredentials.create();
8490
}
91+
if (tikaConfigXml == null) {
92+
// Create a default tika config
93+
tikaConfigXml = File.createTempFile("tika-config", ".xml");
94+
try (FileWriter fw = new FileWriter(tikaConfigXml, StandardCharsets.UTF_8)) {
95+
TikaConfigSerializer.serialize(new TikaConfig(), TikaConfigSerializer.Mode.STATIC_FULL, fw, StandardCharsets.UTF_8);
96+
}
97+
}
8598
File tikaConfigFile = new File(tikaConfigXml.getAbsolutePath());
8699
healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING);
87100
server = Grpc
88101
.newServerBuilderForPort(port, creds)
89102
.addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath()))
90103
.addService(healthStatusManager.getHealthService())
91-
.addService(ProtoReflectionService.newInstance()) // Enable reflection
104+
.addService(ProtoReflectionService.newInstance())
92105
.build()
93106
.start();
94107
LOGGER.info("Server started, listening on " + port);

tika-pipes/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java renamed to tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@
6666
import org.apache.tika.TikaGrpc;
6767
import org.apache.tika.config.Initializable;
6868
import org.apache.tika.config.Param;
69-
import org.apache.tika.config.TikaConfigSerializer;
7069
import org.apache.tika.exception.TikaConfigException;
7170
import org.apache.tika.metadata.Metadata;
7271
import org.apache.tika.parser.ParseContext;
@@ -81,7 +80,7 @@
8180
import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer;
8281

8382
class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
84-
private static final Logger LOG = LoggerFactory.getLogger(TikaConfigSerializer.class);
83+
private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class);
8584
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
8685
static {
8786
OBJECT_MAPPER.setSerializationInclusion(JsonInclude.Include.NON_NULL);
@@ -125,6 +124,10 @@ private void updateTikaConfig()
125124
DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath);
126125

127126
Element fetchersElement = (Element) tikaConfigDoc.getElementsByTagName("fetchers").item(0);
127+
if (fetchersElement == null) {
128+
fetchersElement = tikaConfigDoc.createElement("fetchers");
129+
tikaConfigDoc.getDocumentElement().appendChild(fetchersElement);
130+
}
128131
for (int i = 0; i < fetchersElement.getChildNodes().getLength(); ++i) {
129132
fetchersElement.removeChild(fetchersElement.getChildNodes().item(i));
130133
}

0 commit comments

Comments
 (0)