Skip to content

Commit

Permalink
improve csv emitter and jdbc emitter (breaking changes in tika-snapshot)
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Apr 4, 2024
1 parent e00a465 commit 3e15bf9
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 43 deletions.
58 changes: 29 additions & 29 deletions tika-gui-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -194,52 +194,52 @@
<artifactId>download-maven-plugin</artifactId>
<executions>
<execution>
<id>tika-async-cli-3.0.0-20240404.081031-473</id>
<id>tika-async-cli-3.0.0-20240404.161948-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.081031-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.161948-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
<md5>e46bf085b01462c429353e21b44aba1d</md5>
</configuration>
</execution>
<execution>
<id>tika-fetcher-s3-3.0.0-20240404.081053-477</id>
<id>tika-fetcher-s3-3.0.0-20240404.162001-479</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.081053-477.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.162001-479.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-fetcher-s3</outputDirectory>
<md5>bbe5836d41044a0369d3d29384131836</md5>
</configuration>
</execution>
<execution>
<id>tika-parser-sqlite3-package-3.0.0-20240404.081120-490</id>
<id>tika-parser-sqlite3-package-3.0.0-20240404.162011-492</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.081120-490.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.162011-492.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-app</outputDirectory>
<md5>501126158285629af0dc3fbbe6843185</md5>
</configuration>
</execution>
<execution>
<id>tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473</id>
<id>tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
<md5>6c7ec2128406dab85a859ca0aa5d4781</md5>
Expand All @@ -259,52 +259,52 @@
</configuration>
</execution>
<execution>
<id>tika-emitter-fs-3.0.0-20240404.081039-477</id>
<id>tika-emitter-fs-3.0.0-20240404.161951-479</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.081039-477.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.161951-479.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-emitter-fs</outputDirectory>
<md5>e8be348519559c1a925f86d2120852cb</md5>
</configuration>
</execution>
<execution>
<id>tika-emitter-jdbc-3.0.0-20240404.081041-473</id>
<id>tika-emitter-jdbc-3.0.0-20240404.161952-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.081041-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.161952-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-emitter-jdbc</outputDirectory>
<md5>90078d960cacf9bf16bdbf0cdd57e179</md5>
</configuration>
</execution>
<execution>
<id>tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473</id>
<id>tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
<md5>643daa6dfe8a1695735d271815e186ed</md5>
<md5>48efffb7ca433f1c866d9838e117f4db</md5>
</configuration>
</execution>
<execution>
<id>tika-eval-core-3.0.0-20240404.081048-472</id>
<id>tika-eval-core-3.0.0-20240404.161957-474</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.081048-472.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.161957-474.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-extras</outputDirectory>
<md5>b458334817f28da268e07974d99b03b7</md5>
Expand All @@ -324,13 +324,13 @@
</configuration>
</execution>
<execution>
<id>tika-pipes-iterator-s3-3.0.0-20240404.081135-473</id>
<id>tika-pipes-iterator-s3-3.0.0-20240404.162017-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.081135-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.162017-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-pipes-iterator-s3</outputDirectory>
<md5>01b46c92d0d8f9f352172d49e88b893f</md5>
Expand All @@ -350,65 +350,65 @@
</configuration>
</execution>
<execution>
<id>tika-serialization-3.0.0-20240404.081141-493</id>
<id>tika-serialization-3.0.0-20240404.162019-495</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.081141-493.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.162019-495.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
<md5>8cfb9502171c9462341f0e3e93a0e19e</md5>
</configuration>
</execution>
<execution>
<id>tika-detector-siegfried-3.0.0-20240404.081036-471</id>
<id>tika-detector-siegfried-3.0.0-20240404.161950-473</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.081036-471.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.161950-473.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-extras</outputDirectory>
<md5>f1d481f50f8e33997c559cc2eed82120</md5>
</configuration>
</execution>
<execution>
<id>tika-emitter-opensearch-3.0.0-20240404.081042-473</id>
<id>tika-emitter-opensearch-3.0.0-20240404.161953-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.081042-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.161953-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-emitter-opensearch</outputDirectory>
<md5>c9ee40170c87a2234786d721255f5f96</md5>
</configuration>
</execution>
<execution>
<id>tika-emitter-s3-3.0.0-20240404.081043-477</id>
<id>tika-emitter-s3-3.0.0-20240404.161953-479</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.081043-477.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.161953-479.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-emitter-s3</outputDirectory>
<md5>577e03105df2137ad64b726606095d7e</md5>
</configuration>
</execution>
<execution>
<id>tika-app-3.0.0-20240404.081029-473</id>
<id>tika-app-3.0.0-20240404.161946-475</id>
<phase>prepare-package</phase>
<goals>
<goal>wget</goal>
</goals>
<configuration>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.081029-473.jar</url>
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.161946-475.jar</url>
<unpack>false</unpack>
<outputDirectory>${project.build.directory}/lib/tika-app</outputDirectory>
<md5>f5f93da4a09b5f058ef697a2f9939f16</md5>
Expand Down
1 change: 1 addition & 0 deletions tika-gui-app/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
requires com.fasterxml.jackson.datatype.jsr310;
requires com.fasterxml.jackson.datatype.jdk8;
requires org.kordamp.ikonli.javafx;
requires com.h2database;

exports org.tallison.tika.app.fx;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,19 @@ public class AppContext {
}

static {
System.out.println(System.getProperties());
if (!StringUtils.isBlank(System.getProperty("TIKA_GUI_JAVA_HOME"))) {
LOGGER.debug("setting TIKA_GUI_JAVA_HOME {}", System.getProperty("TIKA_GUI_JAVA_HOME"));
TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("TIKA_GUI_JAVA_HOME"));
} else if (!StringUtils.isBlank(System.getProperty("java.home"))) {
TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("java.home"));
//TODO -- java_home should not include the bin directory.
//the "if" branch above is normally triggered through the .sh scripts,
//which incorrectly set java_home to java_home/bin
//Clean this up.
if (Files.isDirectory(TIKA_GUI_JAVA_HOME.resolve("bin"))) {
TIKA_GUI_JAVA_HOME = TIKA_GUI_JAVA_HOME.resolve("bin");
}
LOGGER.debug("setting TIKA_GUI_JAVA_HOME {} from java.home",
System.getProperty("java.home"));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ private void createTable() throws SQLException {
String dropTable = "drop table if exists " + tableName;
StringBuilder createTable = new StringBuilder();
createTable.append("create table ").append(tableName);
createTable.append("( ").append(PATH_COL_NAME).append(" varchar(1024), ");
createTable.append("( ").append(ID_COLUMN_NAME).append(" varchar(1024), ");
createTable.append(ATTACHMENT_NUM_COL_NAME).append(" int");
for (MetadataTuple t : getMetadataTuples()) {
createTable.append(", ").append(t.getOutput()).append(" ").append(t.getProperty());
Expand Down Expand Up @@ -191,7 +191,7 @@ private Optional<Path> getCSVPath() {

private void writeHeaders(CSVPrinter printer) throws IOException {
List<String> headers = new ArrayList<>();
headers.add("path");
headers.add("id");
headers.add("status");
headers.add("attachment_num");
if (getMetadataTuples().size() == 0) {
Expand All @@ -208,22 +208,22 @@ private String getSelect() {
String tikaTable = CSV_DB_TABLE_NAME;
StringBuilder sb = new StringBuilder();
sb.append("select ");
sb.append("s.").append(PATH_COL_NAME).append(" as Path, s.status as Status, ");
sb.append("s.").append(ID_COLUMN_NAME).append(" as id, s.status as Status, ");
sb.append("case when ").append(ATTACHMENT_NUM_COL_NAME).append(" is null then 0");
sb.append(" else ").append(ATTACHMENT_NUM_COL_NAME).append(" end");
for (MetadataTuple t : getMetadataTuples()) {
sb.append(", ");
String out = t.getOutput();
//if there's a column in tika_extracts
if (out.equals(PATH_COL_NAME) || out.equals("status")) {
if (out.equals(ID_COLUMN_NAME) || out.equals("status")) {
sb.append("t.");
}
sb.append(t.getOutput());
}

sb.append(" from tika_status s left join ").append(tikaTable)
.append(" t on s.path = t.path")
.append(" order by s.status, t.path asc, t.attachment_num asc");
.append(" t on s.id = t.id")
.append(" order by s.status, t.id asc, t.attachment_num asc");

return sb.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package org.tallison.tika.app.fx.emitters;

import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ATTACHMENT_NUM_COL_NAME;
import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.PATH_COL_NAME;
import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ID_COLUMN_NAME;

import java.net.URL;
import java.sql.Connection;
Expand Down Expand Up @@ -333,9 +333,9 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException
//TODO -- check column types!
for (int i = 1; i <= metaData.getColumnCount(); i++) {
if (i == 1) {
if (!PATH_COL_NAME.equalsIgnoreCase(metaData.getColumnName(i))) {
if (!ID_COLUMN_NAME.equalsIgnoreCase(metaData.getColumnName(i))) {
alert(ALERT_TITLE, "Unexpected column name",
"First column should be: " + PATH_COL_NAME);
"First column should be: " + ID_COLUMN_NAME);
return false;
}
}
Expand Down Expand Up @@ -376,7 +376,7 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException
private boolean tryToCreateTable() {
StringBuilder sb = new StringBuilder();
sb.append("create table ").append(tableName.getText()).append(" (");
sb.append(PATH_COL_NAME).append(" VARCHAR(1024),\n");
sb.append(ID_COLUMN_NAME).append(" VARCHAR(1024),\n");
sb.append(ATTACHMENT_NUM_COL_NAME).append(" INTEGER");
for (MetadataRow r : getMetadataRows()) {
sb.append(",\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public class JDBCEmitterSpec extends BaseEmitterSpec {

private static final Logger LOGGER = LogManager.getLogger(JDBCEmitterSpec.class);

static String PATH_COL_NAME = "path";
static String ID_COLUMN_NAME = "id";

static String ATTACHMENT_NUM_COL_NAME = "attachment_num";

Expand Down Expand Up @@ -108,7 +108,7 @@ public void setConnectionString(String connectionString) {
void createAndSetInsertString(String tableName) {
StringBuilder sb = new StringBuilder();
sb.append("insert into ").append(tableName).append(" (");
sb.append(PATH_COL_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME);
sb.append(ID_COLUMN_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME);
int colCount = 2;
for (MetadataTuple t : getMetadataTuples()) {
sb.append(", ");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ public void write(DomWriter writer, Element properties) {
/**
 * Builds the set of classpath entries reported by this method's owner.
 *
 * The set holds a single entry: the absolute path of the
 * "tika-emitter-opensearch" directory resolved against
 * AppContext.TIKA_LIB_PATH, with a trailing wildcard segment appended,
 * passed through ProcessUtils.escapeCommandLine before being stored.
 *
 * @return a new mutable set containing that one escaped entry
 */
public Set<String> getClassPathDependencies() {
Set<String> items = new HashSet<>();
items.add(ProcessUtils.escapeCommandLine(
AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch").toAbsolutePath() +
"/*"));
AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch")
.toAbsolutePath() + "/*"));

return items;
}
Expand Down

0 comments on commit 3e15bf9

Please sign in to comment.