From 3e15bf9005d0c13584d0496b469a674e0415e1ac Mon Sep 17 00:00:00 2001 From: tallison Date: Thu, 4 Apr 2024 16:17:53 -0400 Subject: [PATCH] improve csv emitter and jdbc emitter (breaking changes in tika-snapshot) --- tika-gui-app/pom.xml | 58 +++++++++---------- tika-gui-app/src/main/java/module-info.java | 1 + .../tallison/tika/app/fx/ctx/AppContext.java | 8 +++ .../tika/app/fx/emitters/CSVEmitterSpec.java | 12 ++-- .../fx/emitters/JDBCEmitterController.java | 8 +-- .../tika/app/fx/emitters/JDBCEmitterSpec.java | 4 +- .../fx/emitters/OpenSearchEmitterSpec.java | 4 +- 7 files changed, 52 insertions(+), 43 deletions(-) diff --git a/tika-gui-app/pom.xml b/tika-gui-app/pom.xml index 9594a44..203880e 100644 --- a/tika-gui-app/pom.xml +++ b/tika-gui-app/pom.xml @@ -194,52 +194,52 @@ download-maven-plugin - tika-async-cli-3.0.0-20240404.081031-473 + tika-async-cli-3.0.0-20240404.161948-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.081031-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.161948-475.jar false ${project.build.directory}/lib/tika-core e46bf085b01462c429353e21b44aba1d - tika-fetcher-s3-3.0.0-20240404.081053-477 + tika-fetcher-s3-3.0.0-20240404.162001-479 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.081053-477.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.162001-479.jar false ${project.build.directory}/lib/tika-fetcher-s3 bbe5836d41044a0369d3d29384131836 - tika-parser-sqlite3-package-3.0.0-20240404.081120-490 + tika-parser-sqlite3-package-3.0.0-20240404.162011-492 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.081120-490.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.162011-492.jar false ${project.build.directory}/lib/tika-app 501126158285629af0dc3fbbe6843185 - tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473 + tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475.jar false ${project.build.directory}/lib/tika-core 6c7ec2128406dab85a859ca0aa5d4781 @@ -259,52 +259,52 @@ - tika-emitter-fs-3.0.0-20240404.081039-477 + tika-emitter-fs-3.0.0-20240404.161951-479 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.081039-477.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.161951-479.jar false ${project.build.directory}/lib/tika-emitter-fs e8be348519559c1a925f86d2120852cb - tika-emitter-jdbc-3.0.0-20240404.081041-473 + tika-emitter-jdbc-3.0.0-20240404.161952-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.081041-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.161952-475.jar false ${project.build.directory}/lib/tika-emitter-jdbc 90078d960cacf9bf16bdbf0cdd57e179 - tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473 + tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475.jar false ${project.build.directory}/lib/tika-core - 643daa6dfe8a1695735d271815e186ed + 48efffb7ca433f1c866d9838e117f4db - tika-eval-core-3.0.0-20240404.081048-472 + tika-eval-core-3.0.0-20240404.161957-474 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.081048-472.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.161957-474.jar false ${project.build.directory}/lib/tika-extras b458334817f28da268e07974d99b03b7 @@ -324,13 +324,13 @@ - tika-pipes-iterator-s3-3.0.0-20240404.081135-473 + tika-pipes-iterator-s3-3.0.0-20240404.162017-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.081135-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.162017-475.jar false ${project.build.directory}/lib/tika-pipes-iterator-s3 01b46c92d0d8f9f352172d49e88b893f @@ -350,65 +350,65 @@ - tika-serialization-3.0.0-20240404.081141-493 + tika-serialization-3.0.0-20240404.162019-495 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.081141-493.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.162019-495.jar false ${project.build.directory}/lib/tika-core 8cfb9502171c9462341f0e3e93a0e19e - tika-detector-siegfried-3.0.0-20240404.081036-471 + tika-detector-siegfried-3.0.0-20240404.161950-473 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.081036-471.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.161950-473.jar false ${project.build.directory}/lib/tika-extras f1d481f50f8e33997c559cc2eed82120 - tika-emitter-opensearch-3.0.0-20240404.081042-473 + tika-emitter-opensearch-3.0.0-20240404.161953-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.081042-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.161953-475.jar false ${project.build.directory}/lib/tika-emitter-opensearch c9ee40170c87a2234786d721255f5f96 - tika-emitter-s3-3.0.0-20240404.081043-477 + tika-emitter-s3-3.0.0-20240404.161953-479 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.081043-477.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.161953-479.jar false ${project.build.directory}/lib/tika-emitter-s3 577e03105df2137ad64b726606095d7e - tika-app-3.0.0-20240404.081029-473 + tika-app-3.0.0-20240404.161946-475 prepare-package wget - https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.081029-473.jar + https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.161946-475.jar false ${project.build.directory}/lib/tika-app f5f93da4a09b5f058ef697a2f9939f16 diff --git a/tika-gui-app/src/main/java/module-info.java b/tika-gui-app/src/main/java/module-info.java index bcd28d2..4a304a2 100644 --- a/tika-gui-app/src/main/java/module-info.java +++ b/tika-gui-app/src/main/java/module-info.java @@ -28,6 +28,7 @@ requires com.fasterxml.jackson.datatype.jsr310; requires com.fasterxml.jackson.datatype.jdk8; requires org.kordamp.ikonli.javafx; + requires com.h2database; exports org.tallison.tika.app.fx; diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/ctx/AppContext.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/ctx/AppContext.java index e5a6acd..39426d0 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/ctx/AppContext.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/ctx/AppContext.java @@ -63,11 +63,19 @@ public class AppContext { } static { + System.out.println(System.getProperties()); if (!StringUtils.isBlank(System.getProperty("TIKA_GUI_JAVA_HOME"))) { LOGGER.debug("setting TIKA_GUI_JAVA_HOME {}", System.getProperty("TIKA_GUI_JAVA_HOME")); TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("TIKA_GUI_JAVA_HOME")); } else if (!StringUtils.isBlank(System.getProperty("java.home"))) { TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("java.home")); + //TODO -- java_home should not include the bin directory. + //the "if" branch above is normally triggered through the .sh scripts, + //which incorrectly set java_home to java_home/bin + //Clean this up. + if (Files.isDirectory(TIKA_GUI_JAVA_HOME.resolve("bin"))) { + TIKA_GUI_JAVA_HOME = TIKA_GUI_JAVA_HOME.resolve("bin"); + } LOGGER.debug("setting TIKA_GUI_JAVA_HOME {} from java.home", System.getProperty("java.home")); } diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/CSVEmitterSpec.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/CSVEmitterSpec.java index 5978d3c..c522fe1 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/CSVEmitterSpec.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/CSVEmitterSpec.java @@ -94,7 +94,7 @@ private void createTable() throws SQLException { String dropTable = "drop table if exists " + tableName; StringBuilder createTable = new StringBuilder(); createTable.append("create table ").append(tableName); - createTable.append("( ").append(PATH_COL_NAME).append(" varchar(1024), "); + createTable.append("( ").append(ID_COLUMN_NAME).append(" varchar(1024), "); createTable.append(ATTACHMENT_NUM_COL_NAME).append(" int"); for (MetadataTuple t : getMetadataTuples()) { createTable.append(", ").append(t.getOutput()).append(" ").append(t.getProperty()); @@ -191,7 +191,7 @@ private Optional getCSVPath() { private void writeHeaders(CSVPrinter printer) throws IOException { List headers = new ArrayList<>(); - headers.add("path"); + headers.add("id"); headers.add("status"); headers.add("attachment_num"); if (getMetadataTuples().size() == 0) { @@ -208,22 +208,22 @@ private String getSelect() { String tikaTable = CSV_DB_TABLE_NAME; StringBuilder sb = new StringBuilder(); sb.append("select "); - sb.append("s.").append(PATH_COL_NAME).append(" as Path, s.status as Status, "); + sb.append("s.").append(ID_COLUMN_NAME).append(" as id, s.status as Status, "); sb.append("case when ").append(ATTACHMENT_NUM_COL_NAME).append(" is null then 0"); sb.append(" else ").append(ATTACHMENT_NUM_COL_NAME).append(" end"); for (MetadataTuple t : getMetadataTuples()) { sb.append(", "); String out = t.getOutput(); //if there's a column in tika_extracts - if (out.equals(PATH_COL_NAME) || out.equals("status")) { + if (out.equals(ID_COLUMN_NAME) || out.equals("status")) { sb.append("t."); } sb.append(t.getOutput()); } sb.append(" from tika_status s left join ").append(tikaTable) - .append(" t on s.path = t.path") - .append(" order by s.status, t.path asc, t.attachment_num asc"); + .append(" t on s.id = t.id") + .append(" order by s.status, t.id asc, t.attachment_num asc"); return sb.toString(); } diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterController.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterController.java index 011b524..e61e3ac 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterController.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterController.java @@ -17,7 +17,7 @@ package org.tallison.tika.app.fx.emitters; import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ATTACHMENT_NUM_COL_NAME; -import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.PATH_COL_NAME; +import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ID_COLUMN_NAME; import java.net.URL; import java.sql.Connection; @@ -333,9 +333,9 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException //TODO -- check column types! for (int i = 1; i <= metaData.getColumnCount(); i++) { if (i == 1) { - if (!PATH_COL_NAME.equalsIgnoreCase(metaData.getColumnName(i))) { + if (!ID_COLUMN_NAME.equalsIgnoreCase(metaData.getColumnName(i))) { alert(ALERT_TITLE, "Unexpected column name", - "First column should be: " + PATH_COL_NAME); + "First column should be: " + ID_COLUMN_NAME); return false; } } @@ -376,7 +376,7 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException private boolean tryToCreateTable() { StringBuilder sb = new StringBuilder(); sb.append("create table ").append(tableName.getText()).append(" ("); - sb.append(PATH_COL_NAME).append(" VARCHAR(1024),\n"); + sb.append(ID_COLUMN_NAME).append(" VARCHAR(1024),\n"); sb.append(ATTACHMENT_NUM_COL_NAME).append(" INTEGER"); for (MetadataRow r : getMetadataRows()) { sb.append(",\n"); diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterSpec.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterSpec.java index b8b157e..82eacbf 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterSpec.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterSpec.java @@ -39,7 +39,7 @@ public class JDBCEmitterSpec extends BaseEmitterSpec { private static final Logger LOGGER = LogManager.getLogger(JDBCEmitterSpec.class); - static String PATH_COL_NAME = "path"; + static String ID_COLUMN_NAME = "id"; static String ATTACHMENT_NUM_COL_NAME = "attachment_num"; @@ -108,7 +108,7 @@ public void setConnectionString(String connectionString) { void createAndSetInsertString(String tableName) { StringBuilder sb = new StringBuilder(); sb.append("insert into ").append(tableName).append(" ("); - sb.append(PATH_COL_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME); + sb.append(ID_COLUMN_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME); int colCount = 2; for (MetadataTuple t : getMetadataTuples()) { sb.append(", "); diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/OpenSearchEmitterSpec.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/OpenSearchEmitterSpec.java index 8cc5a23..5e7104f 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/OpenSearchEmitterSpec.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/OpenSearchEmitterSpec.java @@ -81,8 +81,8 @@ public void write(DomWriter writer, Element properties) { public Set getClassPathDependencies() { Set items = new HashSet<>(); items.add(ProcessUtils.escapeCommandLine( - AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch").toAbsolutePath() + - "/*")); + AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch") + .toAbsolutePath() + "/*")); return items; }