diff --git a/.github/workflows/gradle-extraction-check.yml b/.github/workflows/gradle-extraction-check.yml new file mode 100644 index 00000000000..2b1a24c4cce --- /dev/null +++ b/.github/workflows/gradle-extraction-check.yml @@ -0,0 +1,25 @@ +name: Gradle Extraction Check + +on: + pull_request: + branches: + - '*' + +jobs: + test: + name: gradle extraction test + + runs-on: ubuntu-latest + timeout-minutes: 15 + + env: + DEVELOCITY_ACCESS_KEY: ${{ secrets.SOLR_DEVELOCITY_ACCESS_KEY }} + + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - uses: ./.github/actions/prepare-for-build + + - name: Run extraction module tests + run: ./gradlew --no-daemon solr:modules:extraction:check diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index a3783feaac1..d9525057058 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -194,6 +194,7 @@ squareup-okhttp3-okhttp = "4.12.0" stephenc-jcip = "1.0-1" swagger3 = "2.2.22" tdunning-tdigest = "3.3" +testcontainers = "1.20.4" thetaphi-forbiddenapis = "3.9" thisptr-jacksonjq = "0.0.13" threeten-bp = "1.6.8" @@ -512,6 +513,7 @@ stephenc-jcip-annotations = { module = "com.github.stephenc.jcip:jcip-annotation swagger3-annotations-jakarta = { module = "io.swagger.core.v3:swagger-annotations-jakarta", version.ref = "swagger3" } swagger3-jaxrs2-jakarta = { module = "io.swagger.core.v3:swagger-jaxrs2-jakarta", version.ref = "swagger3" } tdunning-tdigest = { module = "com.tdunning:t-digest", version.ref = "tdunning-tdigest" } +testcontainers = { module = "org.testcontainers:testcontainers", version.ref = "testcontainers" } thisptr-jacksonjq = { module = "net.thisptr:jackson-jq", version.ref = "thisptr-jacksonjq" } threeten-bp = { module = "org.threeten:threetenbp", version.ref = "threeten-bp" } xerces-impl = { module = "xerces:xercesImpl", version.ref = "xerces" } diff --git a/gradle/testing/randomization/policies/solr-tests.policy b/gradle/testing/randomization/policies/solr-tests.policy index 
2d3246c6d9b..7eb635db831 100644 --- a/gradle/testing/randomization/policies/solr-tests.policy +++ b/gradle/testing/randomization/policies/solr-tests.policy @@ -31,6 +31,9 @@ grant { permission java.io.FilePermission "${java.io.tmpdir}", "read,write"; permission java.io.FilePermission "${java.io.tmpdir}${/}-", "read,write,delete"; + // Allow Testcontainers to read user-level configuration + permission java.io.FilePermission "${user.home}${/}.testcontainers.properties", "read"; + permission java.io.FilePermission "${tests.linedocsfile}", "read"; // DirectoryFactoryTest messes with these (wtf?) permission java.io.FilePermission "/tmp/inst1/conf/solrcore.properties", "read"; @@ -130,11 +133,11 @@ grant { permission javax.management.MBeanServerPermission "findMBeanServer"; permission javax.management.MBeanServerPermission "releaseMBeanServer"; permission javax.management.MBeanTrustPermission "register"; - + // needed by crossdc permission javax.security.auth.AuthPermission "getLoginConfiguration"; permission javax.security.auth.AuthPermission "setLoginConfiguration"; - + // needed by benchmark permission java.security.SecurityPermission "insertProvider"; @@ -206,7 +209,7 @@ grant { // additional permissions based on system properties set by /bin/solr // NOTE: if the property is not set, the permission entry is ignored. -grant { +grant { permission java.io.FilePermission "${solr.jetty.keystore}", "read,write,delete,readlink"; permission java.io.FilePermission "${solr.jetty.keystore}${/}-", "read,write,delete,readlink"; diff --git a/solr/licenses/docker-java-LICENSE-ASL.txt b/solr/licenses/docker-java-LICENSE-ASL.txt new file mode 100644 index 00000000000..492933f08c2 --- /dev/null +++ b/solr/licenses/docker-java-LICENSE-ASL.txt @@ -0,0 +1,176 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ +"License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + +"Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/solr/licenses/docker-java-NOTICE.txt b/solr/licenses/docker-java-NOTICE.txt new file mode 100644 index 00000000000..49a9e022cce --- /dev/null +++ b/solr/licenses/docker-java-NOTICE.txt @@ -0,0 +1,7 @@ +This product includes software developed by the docker-java project. + +Copyright (c) 2013-2025, docker-java project contributors + +Project: https://github.com/docker-java/docker-java + +Licensed under the Apache License, Version 2.0. diff --git a/solr/licenses/docker-java-api-3.4.0.jar.sha1 b/solr/licenses/docker-java-api-3.4.0.jar.sha1 new file mode 100644 index 00000000000..bf5ca0d6db4 --- /dev/null +++ b/solr/licenses/docker-java-api-3.4.0.jar.sha1 @@ -0,0 +1 @@ +9ef23dcc93693f15e69b64632be096c38e31bc44 diff --git a/solr/licenses/docker-java-transport-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 new file mode 100644 index 00000000000..c1232d24a6b --- /dev/null +++ b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c058705684d782effc4b2edfdef1a87544ba4af8 diff --git a/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 new file mode 100644 index 00000000000..b658f8f0810 --- /dev/null +++ b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c4ce6d8695cfdb0027872f99cc20f8f679f8a969 diff --git a/solr/licenses/duct-tape-1.0.8.jar.sha1 b/solr/licenses/duct-tape-1.0.8.jar.sha1 new file mode 100644 index 00000000000..8ccb86d64ea --- /dev/null +++ b/solr/licenses/duct-tape-1.0.8.jar.sha1 @@ -0,0 +1 @@ +92edc22a9ab2f3e17c9bf700aaee377d50e8b530 diff --git a/solr/licenses/duct-tape-LICENSE-MIT.txt b/solr/licenses/duct-tape-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/duct-tape-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation 
files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/licenses/jna-5.13.0.jar.sha1 b/solr/licenses/jna-5.13.0.jar.sha1 new file mode 100644 index 00000000000..93b456b9293 --- /dev/null +++ b/solr/licenses/jna-5.13.0.jar.sha1 @@ -0,0 +1 @@ +1200e7ebeedbe0d10062093f32925a912020e747 diff --git a/solr/licenses/testcontainers-1.20.4.jar.sha1 b/solr/licenses/testcontainers-1.20.4.jar.sha1 new file mode 100644 index 00000000000..29746a98e88 --- /dev/null +++ b/solr/licenses/testcontainers-1.20.4.jar.sha1 @@ -0,0 +1 @@ +ee2fe3afc9fa6cb2e6a43233998f3633f761692f diff --git a/solr/licenses/testcontainers-LICENSE-MIT.txt b/solr/licenses/testcontainers-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/testcontainers-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index da6ebaccd68..66f3c2d0c00 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -19,6 +19,11 @@ apply plugin: 'java-library' description = 'Solr Integration with Tika for extracting content from binary file formats such as Microsoft Word and Adobe PDF' +ext { + // Disable security manager for extraction module tests + useSecurityManager = false +} + dependencies { implementation platform(project(':platform')) implementation project(':solr:core') @@ -35,11 +40,10 @@ dependencies { exclude group: 'org.quartz-scheduler', module: 'quartz' exclude group: 'xml-apis', module: 'xml-apis' }) - implementation (libs.xerces.impl, { - exclude group: 'xml-apis', module: 'xml-apis' - }) testImplementation project(':solr:test-framework') testImplementation libs.apache.lucene.testframework testImplementation libs.junit.junit + testImplementation libs.testcontainers + testImplementation libs.carrotsearch.randomizedtesting.runner } diff --git a/solr/modules/extraction/gradle.lockfile b/solr/modules/extraction/gradle.lockfile index abff70b0d7c..ef5da19b320 100644 --- a/solr/modules/extraction/gradle.lockfile +++ 
b/solr/modules/extraction/gradle.lockfile @@ -15,6 +15,9 @@ com.fasterxml.jackson.module:jackson-module-jakarta-xmlbind-annotations:2.20.0=j com.fasterxml.jackson:jackson-bom:2.20.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.fasterxml.woodstox:woodstox-core:7.0.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.github.ben-manes.caffeine:caffeine:3.2.2=annotationProcessor,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testRuntimeClasspath +com.github.docker-java:docker-java-api:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport-zerodep:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath com.github.jai-imageio:jai-imageio-core:1.4.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.junrar:junrar:7.5.3=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.kevinstern:software-and-algorithms:1.0=annotationProcessor,errorprone,testAnnotationProcessor @@ -99,7 +102,8 @@ javax.inject:javax.inject:1=annotationProcessor,errorprone,testAnnotationProcess javax.measure:unit-api:1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath joda-time:joda-time:2.14.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath junit:junit:4.13.2=jarValidation,testCompileClasspath,testRuntimeClasspath -net.java.dev.jna:jna:5.12.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +net.java.dev.jna:jna:5.12.1=compileClasspath,runtimeClasspath,runtimeLibs 
+net.java.dev.jna:jna:5.13.0=jarValidation,testCompileClasspath,testRuntimeClasspath net.sf.ehcache:ehcache-core:2.6.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.antlr:antlr4-runtime:4.13.2=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.apache.commons:commons-collections4:4.5.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath @@ -215,6 +219,7 @@ org.hamcrest:hamcrest:3.0=jarValidation,testCompileClasspath,testRuntimeClasspat org.itadaki:bzip2:0.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.javassist:javassist:3.30.2-GA=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.jdom:jdom2:2.0.6.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.jetbrains:annotations:26.0.2=jarValidation,testCompileClasspath,testRuntimeClasspath org.jspecify:jspecify:1.0.0=annotationProcessor,compileClasspath,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testCompileClasspath,testRuntimeClasspath org.junit.jupiter:junit-jupiter-api:5.6.2=jarValidation,testRuntimeClasspath org.junit.platform:junit-platform-commons:1.6.2=jarValidation,testRuntimeClasspath @@ -226,6 +231,7 @@ org.ow2.asm:asm-commons:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatf org.ow2.asm:asm-tree:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.ow2.asm:asm:9.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.pcollections:pcollections:4.0.1=annotationProcessor,errorprone,testAnnotationProcessor +org.rnorth.duct-tape:duct-tape:1.0.8=jarValidation,testCompileClasspath,testRuntimeClasspath 
org.semver4j:semver4j:6.0.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.slf4j:jcl-over-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.slf4j:jul-to-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath @@ -234,6 +240,7 @@ org.tallison.xmp:xmpcore-shaded:6.1.10=compileClasspath,jarValidation,runtimeCla org.tallison:isoparser:1.9.41.7=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:jmatio:1.5=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:metadata-extractor:2.17.1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.testcontainers:testcontainers:1.20.4=jarValidation,testCompileClasspath,testRuntimeClasspath org.tukaani:xz:1.9=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.xerial.snappy:snappy-java:1.1.10.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath xerces:xercesImpl:2.12.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java new file mode 100644 index 00000000000..cf42e72453b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import org.xml.sax.helpers.DefaultHandler; + +/** Dummy backend that emits predictable test data without actually parsing input content. */ +public class DummyExtractionBackend implements ExtractionBackend { + public static final String NAME = "dummy"; + private final String text = "This is dummy extracted content"; + + @Override + public String name() { + return NAME; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { + ExtractionMetadata metadata = buildMetadataFromRequest(request); + metadata.add("Dummy-Backend", "true"); + metadata.add( + "Content-Type", + request.contentType != null ? 
request.contentType : "application/octet-stream"); + if (request.resourceName != null) { + metadata.add("resourcename", request.resourceName); + } + return new ExtractionResult(text, metadata); + } + + @Override + public void extractWithSaxHandler( + InputStream inputStream, + ExtractionRequest request, + ExtractionMetadata md, + DefaultHandler saxContentHandler) + throws Exception { + + ExtractionResult res = extract(inputStream, request); + md.putAll(res.getMetadata().asMap()); + // Append the content to the SAX handler + saxContentHandler.characters(res.getContent().toCharArray(), 0, res.getContent().length()); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 014d56caae4..d07f25f3a0c 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -18,9 +18,11 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.lang.invoke.MethodHandles; -import java.util.Locale; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.UpdateParams; @@ -33,38 +35,11 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.HttpHeaders; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; -import 
org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.PasswordProvider; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.Matcher; -import org.apache.tika.sax.xpath.MatchingContentHandler; -import org.apache.tika.sax.xpath.XPathParser; -import org.apache.xml.serialize.BaseMarkupSerializer; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.TextSerializer; -import org.apache.xml.serialize.XMLSerializer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; -/** - * The class responsible for loading extracted content into Solr. - * - * @deprecated Will be replaced with something similar that calls out to a separate Tika Server - * process running in its own JVM. - */ -@Deprecated(since = "9.10.0") +/** The class responsible for loading extracted content into Solr. */ public class ExtractingDocumentLoader extends ContentStreamLoader { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -75,40 +50,34 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { /** Extract Only supported format. Default */ public static final String XML_FORMAT = "xml"; - /** XHTML XPath parser. 
*/ - private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - final SolrCore core; final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; - protected AutoDetectParser autoDetectParser; + final boolean backCompat; private final AddUpdateCommand templateAdd; - protected TikaConfig config; - protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, - TikaConfig config, - ParseContextConfig parseContextConfig, - SolrContentHandlerFactory factory) { + SolrContentHandlerFactory factory, + ExtractionBackend backend) { this.params = req.getParams(); this.core = req.getCore(); - this.config = config; - this.parseContextConfig = parseContextConfig; this.processor = processor; + this.backCompat = params.getBool(ExtractingParams.BACK_COMPATIBILITY, true); templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); + templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); - // this is lightweight - autoDetectParser = new AutoDetectParser(config); this.factory = factory; + this.backend = backend; ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false); } @@ -131,169 +100,199 @@ public void load( ContentStream stream, UpdateRequestProcessor processor) throws Exception { - Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - if (streamType != null) { - // Cache? 
Parsers are lightweight to construct and thread-safe, so I'm told - MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser != null) { - Metadata metadata = new Metadata(); - - // If you specify the resource name (the filename, roughly) with this parameter, - // then Tika can make use of it in guessing the appropriate MIME type: - String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); - if (resourceName != null) { - metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - // Provide stream's content type as hint for auto detection - if (stream.getContentType() != null) { - metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); - } + String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); + + try (InputStream inputStream = stream.getStream()) { + String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + + String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); + boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + boolean recursive = params.getBool(ExtractingParams.RECURSIVE, false); + String extractFormat = + params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? 
XML_FORMAT : TEXT_FORMAT); - try (InputStream inputStream = stream.getStream()) { - metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); - metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata - String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); - if (charset != null) { - metadata.add(HttpHeaders.CONTENT_ENCODING, charset); + // Parse optional passwords file into a map (keeps Tika usages out of this class) + LinkedHashMap pwMap = null; + String passwordsFile = params.get("passwordsFile"); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwMap = RegexRulesPasswordProvider.parseRulesFile(is); } + } - String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); - boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); - SolrContentHandler handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - ContentHandler parsingHandler = handler; - - StringWriter writer = null; - BaseMarkupSerializer serializer = null; - if (extractOnly == true) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - writer = new StringWriter(); - if (extractFormat.equals(TEXT_FORMAT)) { - serializer = new TextSerializer(); - serializer.setOutputCharStream(writer); - serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); - } else { - serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); - } - if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - serializer - .startDocument(); // The MatchingContentHandler does not invoke startDocument. 
See - // https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E - parsingHandler = new MatchingContentHandler(serializer, matcher); - } else { - parsingHandler = serializer; - } - } else if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - parsingHandler = new MatchingContentHandler(handler, matcher); - } // else leave it as is + ExtractionRequest extractionRequest = + new ExtractionRequest( + streamType, + resourceName, + stream.getContentType(), + charset, + stream.getName(), + stream.getSourceInfo(), + stream.getSize(), + params.get(ExtractingParams.RESOURCE_PASSWORD, null), + pwMap, + extractFormat, + recursive, + Collections.emptyMap()); + + boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); + String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); + boolean needLegacySax = + extractOnly + || xpathExpr != null + || captureAttr + || (captureElems != null && captureElems.length > 0) + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null) + || (passwordsFile != null); + if (extractOnly) { try { - // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler - // for getting the document. 
- ParseContext context = parseContextConfig.create(); - - context.set(Parser.class, parser); - context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - - // Password handling - RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); - String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); - if (pwMapFile != null && pwMapFile.length() > 0) { - InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); - if (is != null) { - log.debug("Password file supplied: {}", pwMapFile); - epp.parse(is); + ExtractionMetadata md = backend.buildMetadataFromRequest(extractionRequest); + String content; + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractionRequest.extractFormat) + || xpathExpr != null) { + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + DefaultHandler ch = textHandler; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = textHandler.toString(); + } else { // XML format + org.apache.tika.sax.ToXMLContentHandler toXml = + new org.apache.tika.sax.ToXMLContentHandler(); + DefaultHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = toXml.toString(); + if (!content.startsWith("\n" + content; } } - 
context.set(PasswordProvider.class, epp); - String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (resourcePassword != null) { - epp.setExplicitPassword(resourcePassword); - log.debug("Literal password supplied for file {}", resourceName); + + appendBackCompatTikaMetadata(md); + + // Write content + rsp.add(stream.getName(), content); + // Write metadata + NamedList metadataNL = new NamedList<>(); + for (String name : md.names()) { + metadataNL.add(name, md.getValues(name)); } - parser.parse(inputStream, parsingHandler, metadata, context); - } catch (TikaException e) { + rsp.add(stream.getName() + "_metadata", metadataNL); + } catch (UnsupportedOperationException uoe) { + // For backends that don't support xpath + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "XPath filtering is not supported by backend '" + backend.name() + "'."); + } catch (Exception e) { if (ignoreTikaException) { - if (log.isWarnEnabled()) { - log.warn( - "skip extracting text due to {}. 
metadata={}", - e.getLocalizedMessage(), - metadata, - e); - } - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - if (extractOnly == false) { - addDoc(handler); - } else { - // serializer is not null, so we need to call endDoc on it if using xpath - if (xpathExpr != null) { - serializer.endDocument(); + return; + } + + if (needLegacySax) { + // Indexing with capture/xpath/etc: delegate SAX parse to backend + ExtractionMetadata metadata = backend.buildMetadataFromRequest(extractionRequest); + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + try { + backend.extractWithSaxHandler(inputStream, extractionRequest, metadata, handler); + } catch (UnsupportedOperationException uoe) { + // For backends that don't support parseToSolrContentHandler + if (log.isWarnEnabled()) { + log.warn("skip extracting text since tika backend does not yet support this option"); } - rsp.add(stream.getName(), writer.toString()); - writer.close(); - String[] names = metadata.names(); - NamedList metadataNL = new NamedList<>(); - for (int i = 0; i < names.length; i++) { - String[] vals = metadata.getValues(names[i]); - metadataNL.add(names[i], vals); + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "The requested operation is not supported by backend '" + backend.name() + "'."); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) { + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } } - rsp.add(stream.getName() + "_metadata", metadataNL); + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + appendBackCompatTikaMetadata(handler.metadata); + + addDoc(handler); + return; + } + + // Default simple backend-neutral path + 
ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - } catch (SAXException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "Stream type of " - + streamType - + " didn't match any known parsers. Please supply the " - + ExtractingParams.STREAM_TYPE - + " parameter."); + + ExtractionMetadata metadata = result.getMetadata(); + + appendBackCompatTikaMetadata(metadata); + + String content = result.getContent(); + + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(content); + addDoc(handler); } } - public static class MostlyPassthroughHtmlMapper implements HtmlMapper { - public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); - - /** - * Keep all elements and their content. - * - *

Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere - */ - @Override - public boolean isDiscardElement(String name) { - return false; - } + private final Map fieldMappings = new LinkedHashMap<>(); - /** Lowercases the attribute name */ - @Override - public String mapSafeAttribute(String elementName, String attributeName) { - return attributeName.toLowerCase(Locale.ENGLISH); + { + fieldMappings.put("dc:title", "title"); + fieldMappings.put("dc:creator", "author"); + fieldMappings.put("dc:description", "description"); + fieldMappings.put("dc:subject", "subject"); + fieldMappings.put("dc:language", "language"); + fieldMappings.put("dc:publisher", "publisher"); + fieldMappings.put("dcterms:created", "created"); + fieldMappings.put("dcterms:modified", "modified"); + fieldMappings.put("meta:author", "Author"); + fieldMappings.put("meta:creation-date", "Creation-Date"); + fieldMappings.put("meta:save-date", "Last-Save-Date"); + fieldMappings.put("meta:keyword", "Keywords"); + fieldMappings.put("pdf:docinfo:keywords", "Keywords"); + } + + private void appendBackCompatTikaMetadata(ExtractionMetadata md) { + if (!backCompat) { + return; } - /** - * Lowercases the element name, but returns null for <BR>, which suppresses the - * start-element event for lt;BR> tags. This also suppresses the <BODY> tags because - * those are handled internally by Tika's XHTMLContentHandler. - */ - @Override - public String mapSafeElement(String name) { - String lowerName = name.toLowerCase(Locale.ROOT); - return (lowerName.equals("br") || lowerName.equals("body")) ? 
null : lowerName; + for (Map.Entry mapping : fieldMappings.entrySet()) { + String sourceField = mapping.getKey(); + String targetField = mapping.getValue(); + if (md.get(sourceField) != null && md.get(targetField) == null) { + md.addValues(targetField, md.getValues(sourceField)); + } } } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index a7d159678f1..eb70d5b6f6d 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -136,4 +136,13 @@ public interface ExtractingParams { * .*=<defaultmypassword> at the end */ public static final String PASSWORD_MAP_FILE = "passwordsFile"; + + /** Backend selection parameter and */ + public static final String EXTRACTION_BACKEND = "extraction.backend"; + + /** Fix metadata to match Tika 1.x */ + public static final String BACK_COMPATIBILITY = "backCompatibility"; + + /** Enable recursive parsing of embedded documents */ + String RECURSIVE = "recursive"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index c9a319bc0bb..09e2dddb0e0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -16,8 +16,6 @@ */ package org.apache.solr.handler.extraction; -import java.io.InputStream; -import java.nio.file.Path; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.core.SolrCore; @@ -28,26 +26,24 @@ import org.apache.solr.security.PermissionNameProvider; import 
org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; -import org.apache.tika.config.TikaConfig; /** * Handler for rich documents like PDF or Word or any other file format that Tika handles that need * the text to be extracted first from the document. - * - * @deprecated Will be replaced with something similar that calls out to a separate Tika Server - * process running in its own JVM. */ -@Deprecated(since = "9.10.0") public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware, PermissionNameProvider { public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; + public static final String TIKASERVER_URL = "tikaserver.url"; - protected TikaConfig config; + protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackendFactory backendFactory; + protected String defaultBackendName; @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -57,22 +53,8 @@ public PermissionNameProvider.Name getPermissionName(AuthorizationContext reques @Override public void inform(SolrCore core) { try { - String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); - if (tikaConfigLoc == null) { // default - ClassLoader classLoader = core.getResourceLoader().getClassLoader(); - try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { - config = new TikaConfig(is); - } - } else { - Path configFile = Path.of(tikaConfigLoc); - if (configFile.isAbsolute()) { - config = new TikaConfig(configFile); - } else { // in conf/ - try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { - config = new TikaConfig(is); - } - } - } + // Store tika config location (backend-specific) + this.tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); String 
parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG); if (parseContextConfigLoc == null) { // default: @@ -81,20 +63,37 @@ public void inform(SolrCore core) { parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } + + // Initialize backend factory once; backends are created lazily on demand + String tikaServerUrl = (String) initArgs.get(TIKASERVER_URL); + backendFactory = + new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); + + // Choose default backend name (do not instantiate yet) + String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); + defaultBackendName = + (backendName == null || backendName.trim().isEmpty()) + ? LocalTikaExtractionBackend.NAME + : backendName; + } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika Config", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } - factory = createFactory(); - } - - protected SolrContentHandlerFactory createFactory() { - return new SolrContentHandlerFactory(); + factory = new SolrContentHandlerFactory(); } @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); + // Allow per-request override of backend via request param + String backendParam = req.getParams().get(ExtractingParams.EXTRACTION_BACKEND); + String nameToUse = + (backendParam != null && !backendParam.trim().isEmpty()) + ? 
backendParam + : defaultBackendName; + ExtractionBackend extractionBackend = backendFactory.getBackend(nameToUse); + return new ExtractingDocumentLoader(req, processor, factory, extractionBackend); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java new file mode 100644 index 00000000000..9d15b5a1159 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.xml.sax.helpers.DefaultHandler; + +/** Strategy interface for content extraction backends. */ +public interface ExtractionBackend { + /** + * Extract plain text and metadata from the inputStream. Implementations should not close the + * inputStream. This API is backend-neutral and does not expose SAX or XML-specific types. 
+ */ + ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + + /** + * Perform extraction of text from input stream with SAX handler. Sax handler can be + * SolrContentHandler, ToTextContentHandler, ToXMLContentHandler, MatchingContentHandler etc + */ + void extractWithSaxHandler( + InputStream inputStream, + ExtractionRequest request, + ExtractionMetadata md, + DefaultHandler saxContentHandler) + throws Exception; + + /** Build ExtractionMetadata from the request context */ + default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { + ExtractionMetadata md = new ExtractionMetadata(); + md.addIfNotNull(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + md.addIfNotNull(HttpHeaders.CONTENT_TYPE, request.contentType); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + md.addIfNotNull(HttpHeaders.CONTENT_ENCODING, request.charset); + return md; + } + + /** A short name for debugging/config, e.g., "local" or "dummy". */ + String name(); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java new file mode 100644 index 00000000000..7ee0c163152 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.solr.common.SolrException; +import org.apache.solr.core.SolrCore; + +/** + * Factory for ExtractionBackend instances. Lazily constructs backends by short name (e.g., "local", + * "dummy") and caches them for reuse. + */ +public class ExtractionBackendFactory { + private final SolrCore core; + private final String tikaConfigLoc; + private final ParseContextConfig parseContextConfig; + private final String tikaServerUrl; + private final Map cache = new ConcurrentHashMap<>(); + + public ExtractionBackendFactory( + SolrCore core, + String tikaConfigLoc, + ParseContextConfig parseContextConfig, + String tikaServerUrl) { + this.core = core; + this.tikaConfigLoc = tikaConfigLoc; + this.parseContextConfig = parseContextConfig; + this.tikaServerUrl = tikaServerUrl; + } + + /** Returns a backend instance for the given name, creating it if necessary. 
*/ + public ExtractionBackend getBackend(String name) { + String key = normalize(name); + return cache.computeIfAbsent( + key, + k -> { + try { + return create(k); + } catch (Exception e) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Failed to create extraction backend '" + k + "'", + e); + } + }); + } + + private String normalize(String name) { + if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.NAME; + return name.trim().toLowerCase(Locale.ROOT); + } + + /** Creates a new backend instance for the given normalized name. */ + protected ExtractionBackend create(String normalizedName) throws Exception { + return switch (normalizedName) { + case DummyExtractionBackend.NAME -> new DummyExtractionBackend(); + case TikaServerExtractionBackend.NAME -> new TikaServerExtractionBackend( + tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); + case LocalTikaExtractionBackend.NAME -> new LocalTikaExtractionBackend( + core, tikaConfigLoc, parseContextConfig); + default -> throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Unknown extraction backend: " + normalizedName); + }; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java new file mode 100644 index 00000000000..67592432fa0 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** Simple metadata bean */ +public class ExtractionMetadata { + private final Map> map = new LinkedHashMap<>(); + + public void add(String name, String value) { + if (name == null || value == null) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + public void addValues(String name, String[] values) { + if (name == null || values == null || values.length == 0) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).addAll(List.of(values)); + } + + public void addIfNotNull(String resourceNameKey, String resourceName) { + if (resourceName != null) { + add(resourceNameKey, resourceName); + } + } + + public void putAll(Map> map) { + this.map.putAll(map); + } + + public String[] getValues(String name) { + List vals = map.get(name); + if (vals == null) return new String[0]; + return vals.toArray(new String[0]); + } + + public String get(String name) { + List vals = map.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.get(0); + } + + public String[] names() { + return map.keySet().toArray(new String[0]); + } + + public void remove(String name) { + map.remove(name); + } + + public Map> asMap() { + return map; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("ExtractionMetadata{"); + boolean first = true; + for (Map.Entry> e : map.entrySet()) { + if (!first) sb.append(", "); + 
first = false; + sb.append(e.getKey()).append('=').append(e.getValue()); + } + sb.append('}'); + return sb.toString(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof ExtractionMetadata)) return false; + ExtractionMetadata that = (ExtractionMetadata) obj; + return Objects.equals(this.map, that.map); + } + + @Override + public int hashCode() { + return Objects.hash(map); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java new file mode 100644 index 00000000000..99ab4d8d742 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.HashMap; +import java.util.Map; + +/** Immutable request info needed by extraction backends. 
*/ +public class ExtractionRequest { + public final String streamType; // explicit MIME type (optional) + public final String resourceName; // filename hint + public final String contentType; // HTTP content-type header + public final String charset; // derived charset if available + public final String streamName; + public final String streamSourceInfo; + public final Long streamSize; + public final String resourcePassword; // optional password for encrypted docs + public final java.util.LinkedHashMap + passwordsMap; // optional passwords map + public final String extractFormat; + public final boolean recursive; + public final Map tikaRequestHeaders = new HashMap<>(); + + public ExtractionRequest( + String streamType, + String resourceName, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword, + java.util.LinkedHashMap passwordsMap, + String extractFormat, + boolean recursive, + Map tikaRequestHeaders) { + this.streamType = streamType; + this.resourceName = resourceName; + this.contentType = contentType; + this.charset = charset; + this.streamName = streamName; + this.streamSourceInfo = streamSourceInfo; + this.streamSize = streamSize; + this.resourcePassword = resourcePassword; + this.passwordsMap = passwordsMap; + this.extractFormat = extractFormat; + this.recursive = recursive; + if (tikaRequestHeaders != null) { + this.tikaRequestHeaders.putAll(tikaRequestHeaders); + } + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java new file mode 100644 index 00000000000..97767d15367 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable extraction result with plain text content and neutral metadata. */ +public final class ExtractionResult { + private final String content; + private final ExtractionMetadata metadata; + + public ExtractionResult(String content, ExtractionMetadata metadata) { + this.content = content == null ? "" : content; + this.metadata = metadata; + } + + /** Extracted textual content (plain text). */ + public String getContent() { + return content; + } + + /** Extracted metadata in neutral, backend-agnostic form. */ + public ExtractionMetadata getMetadata() { + return metadata; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java new file mode 100644 index 00000000000..8ad2adc47c0 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.io.InputStream;
import java.nio.file.Path;
import java.util.Locale;
import org.apache.solr.core.SolrCore;
import org.apache.solr.logging.DeprecationLog;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct
 * usage of Tika from the loader.
 *
 * @deprecated Will be removed soon, please use the 'tikaserver' extraction backend instead.
 */
@Deprecated(since = "9.10.0")
public class LocalTikaExtractionBackend implements ExtractionBackend {
  public static final String NAME = "local";

  private final TikaConfig tikaConfig;
  private final ParseContextConfig parseContextConfig;
  private final AutoDetectParser autoDetectParser;

  /**
   * HtmlMapper that keeps (almost) every element and attribute, only lower-casing names. Moved here
   * from ExtractingDocumentLoader.
   */
  private static class MostlyPassthroughHtmlMapper implements HtmlMapper {
    static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

    @Override
    public boolean isDiscardElement(String name) {
      return false; // keep everything
    }

    @Override
    public String mapSafeAttribute(String elementName, String attributeName) {
      // Locale.ROOT for locale-independent lower-casing (was Locale.ENGLISH; identical mapping,
      // now consistent with mapSafeElement below)
      return attributeName.toLowerCase(Locale.ROOT);
    }

    @Override
    public String mapSafeElement(String name) {
      String lowerName = name.toLowerCase(Locale.ROOT);
      // <br> and <body> are suppressed; everything else passes through lower-cased
      return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName;
    }
  }

  /** Construct backend from an already-loaded Tika configuration. */
  public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) {
    this.tikaConfig = config;
    this.parseContextConfig = parseContextConfig;
    this.autoDetectParser = new AutoDetectParser(config);
  }

  /**
   * Construct backend by loading TikaConfig based on handler/core configuration without exposing
   * Tika types to the handler.
   *
   * @param core the owning SolrCore, used for resource loading
   * @param tikaConfigLoc location of the Tika config file; {@code null} means the bundled default,
   *     an absolute path is read directly, anything else is resolved against conf/
   * @param parseContextConfig the configured ParseContext factory
   * @throws Exception if the Tika configuration cannot be loaded
   */
  public LocalTikaExtractionBackend(
      SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) throws Exception {
    TikaConfig cfg;
    if (tikaConfigLoc == null) { // default
      ClassLoader classLoader = core.getResourceLoader().getClassLoader();
      try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) {
        cfg = new TikaConfig(is);
      }
    } else {
      Path configFile = Path.of(tikaConfigLoc);
      if (configFile.isAbsolute()) {
        cfg = new TikaConfig(configFile);
      } else { // in conf/
        try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) {
          cfg = new TikaConfig(is);
        }
      }
    }
    this.tikaConfig = cfg;
    this.parseContextConfig = parseContextConfig;
    this.autoDetectParser = new AutoDetectParser(cfg);
    DeprecationLog.log("Local Tika", "The 'local' extraction backend is deprecated");
  }

  @Override
  public String name() {
    return NAME;
  }

  /**
   * Picks the parser for the request: an explicit stream type selects the registered parser for
   * that media type (may be {@code null} if none is registered), otherwise auto-detection is used.
   */
  private Parser selectParser(ExtractionRequest request) {
    if (request.streamType != null) {
      MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT));
      return new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt);
    }
    return autoDetectParser;
  }

  /** Converts the request's neutral metadata into Tika's {@link Metadata} form. */
  private Metadata buildMetadata(ExtractionRequest request) {
    ExtractionMetadata extractionMetadata = buildMetadataFromRequest(request);
    Metadata md = new Metadata();
    for (String name : extractionMetadata.names()) {
      String[] vals = extractionMetadata.getValues(name);
      if (vals != null) {
        for (String v : vals) {
          md.add(name, v);
        }
      }
    }
    return md;
  }

  /** Builds the ParseContext: selected parser, passthrough HTML mapper, and password provider. */
  private ParseContext buildContext(Parser parser, ExtractionRequest request) {
    ParseContext context = parseContextConfig.create();
    context.set(Parser.class, parser);
    context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
    // The provider is always a RegexRulesPasswordProvider, so configure it directly; the previous
    // instanceof guards were always true (dead code).
    RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider();
    if (request.resourcePassword != null) {
      pwd.setExplicitPassword(request.resourcePassword);
    }
    if (request.passwordsMap != null) {
      pwd.setPasswordMap(request.passwordsMap);
    }
    context.set(PasswordProvider.class, pwd);
    return context;
  }

  /** Copies Tika {@link Metadata} into the neutral {@link ExtractionMetadata} form. */
  private static ExtractionMetadata tikaMetadataToExtractionMetadata(Metadata md) {
    ExtractionMetadata out = new ExtractionMetadata();
    for (String name : md.names()) {
      String[] vals = md.getValues(name);
      if (vals != null) {
        for (String v : vals) {
          out.add(name, v);
        }
      }
    }
    return out;
  }

  @Override
  public ExtractionResult extract(InputStream inputStream, ExtractionRequest request)
      throws Exception {
    Parser parser = selectParser(request);
    if (parser == null) {
      throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType);
    }
    ParseContext context = buildContext(parser, request);
    Metadata md = buildMetadata(request);
    BodyContentHandler textHandler = new BodyContentHandler(-1); // -1: no write limit
    parser.parse(inputStream, textHandler, md, context);
    return new ExtractionResult(textHandler.toString(), tikaMetadataToExtractionMetadata(md));
  }

  @Override
  public void extractWithSaxHandler(
      InputStream inputStream,
      ExtractionRequest request,
      ExtractionMetadata md,
      DefaultHandler saxContentHandler)
      throws Exception {
    Parser parser = selectParser(request);
    if (parser == null) {
      throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType);
    }
    ParseContext context = buildContext(parser, request);
    Metadata tikaMetadata = buildMetadata(request);
    parser.parse(inputStream, saxContentHandler, tikaMetadata, context);
    // Copy back whatever Tika added during the parse into the caller's neutral metadata.
    for (String name : tikaMetadata.names()) {
      String[] vals = tikaMetadata.getValues(name);
      if (vals != null) {
        for (String v : vals) {
          md.add(name, v);
        }
      }
    }
  }
}
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java index 84b4e94171c..8e7f876da83 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java @@ -55,6 +55,17 @@ public String getPassword(Metadata meta) { return null; } + public String getPassword(ExtractionMetadata extractionMetadata) { + if (getExplicitPassword() != null) { + return getExplicitPassword(); + } + + if (passwordMap.size() > 0) + return lookupPasswordFromMap(extractionMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); + + return null; + } + private String lookupPasswordFromMap(String fileName) { if (fileName != null && fileName.length() > 0) { for (Entry e : passwordMap.entrySet()) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 9edba0e925e..22be163c816 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -30,7 +30,7 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.tika.metadata.Metadata; +// note: decoupled from Tika Metadata import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,7 +57,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara protected final SolrInputDocument document; - protected final Metadata metadata; + protected final ExtractionMetadata metadata; protected final SolrParams 
params; protected final StringBuilder catchAllBuilder = new StringBuilder(2048); protected final IndexSchema schema; @@ -74,7 +74,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara private Set literalFieldNames = null; - public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { + public SolrContentHandler(ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { this.document = new SolrInputDocument(); this.metadata = metadata; this.params = params; @@ -152,6 +152,13 @@ protected void addContent() { addField(contentFieldName, catchAllBuilder.toString(), null); } + /** Append pre-extracted plain text content to the catch-all builder. */ + public void appendToContent(String text) { + if (text != null && !text.isEmpty()) { + catchAllBuilder.append(text); + } + } + /** * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. */ diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index 1070e744d84..b4fe031a068 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -18,7 +18,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; -import org.apache.tika.metadata.Metadata; /** */ public class SolrContentHandlerFactory { @@ -26,7 +25,7 @@ public class SolrContentHandlerFactory { public SolrContentHandlerFactory() {} public SolrContentHandler createSolrContentHandler( - Metadata metadata, SolrParams params, IndexSchema schema) { + ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { return new SolrContentHandler(metadata, params, schema); } } diff --git 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import org.apache.solr.common.SolrException;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Extraction backend that delegates parsing to a remote Apache Tika Server.
 *
 * <p>This backend uses the JDK {@link HttpClient} to call Tika Server endpoints ({@code /tika} for
 * plain extraction, {@code /rmeta} for recursive extraction). It supports backend-neutral extract()
 * and extractOnly() operations.
 */
public class TikaServerExtractionBackend implements ExtractionBackend {
  public static final String NAME = "tikaserver";

  private final HttpClient httpClient;
  private final String baseUrl; // e.g., http://localhost:9998 (normalized: no trailing slash)
  private final Duration timeout = Duration.ofSeconds(30);
  private final TikaServerParser tikaServerResponseParser = new TikaServerParser();

  /** Creates a backend with a default HttpClient (10 second connect timeout). */
  public TikaServerExtractionBackend(String baseUrl) {
    this(HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build(), baseUrl);
  }

  // Visible for tests
  TikaServerExtractionBackend(HttpClient httpClient, String baseUrl) {
    // Strip a single trailing slash so endpoint paths can be appended directly.
    if (baseUrl.endsWith("/")) {
      this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
    } else {
      this.baseUrl = baseUrl;
    }
    this.httpClient = httpClient;
  }

  @Override
  public String name() {
    return NAME;
  }

  @Override
  public ExtractionResult extract(InputStream inputStream, ExtractionRequest request)
      throws Exception {
    try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
      ExtractionMetadata md = buildMetadataFromRequest(request);
      BodyContentHandler textHandler = new BodyContentHandler(-1); // -1: no write limit
      if (request.recursive) {
        tikaServerResponseParser.parseRmetaJson(tikaResponse, textHandler, md);
      } else {
        tikaServerResponseParser.parseXml(tikaResponse, textHandler, md);
      }
      return new ExtractionResult(textHandler.toString(), md);
    }
  }

  @Override
  public void extractWithSaxHandler(
      InputStream inputStream,
      ExtractionRequest request,
      ExtractionMetadata md,
      DefaultHandler saxContentHandler)
      throws Exception {
    try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
      if (request.recursive) {
        tikaServerResponseParser.parseRmetaJson(tikaResponse, saxContentHandler, md);
      } else {
        tikaServerResponseParser.parseXml(tikaResponse, saxContentHandler, md);
      }
    }
  }

  /** Returns {@code a} if non-null, else {@code b}. */
  private static String firstNonNull(String a, String b) {
    return a != null ? a : b;
  }

  /**
   * Call the Tika Server to extract text and metadata. Depending on request.recursive, will either
   * return XML (false) or a JSON array (true).
   *
   * @return InputStream of the response body, either XML or JSON depending on request.recursive
   * @throws SolrException if the server responds with a non-2xx status
   */
  private InputStream callTikaServer(InputStream inputStream, ExtractionRequest request)
      throws IOException, InterruptedException {
    String url = baseUrl + (request.recursive ? "/rmeta" : "/tika");
    HttpRequest.Builder b =
        HttpRequest.newBuilder(URI.create(url))
            .timeout(timeout)
            .header("Accept", (request.recursive ? "application/json" : "text/xml"));
    // An explicit stream type overrides the content type reported by the client.
    String contentType = firstNonNull(request.streamType, request.contentType);
    if (contentType != null) {
      b.header("Content-Type", contentType);
    }
    if (!request.tikaRequestHeaders.isEmpty()) {
      request.tikaRequestHeaders.forEach(b::header);
    }
    ExtractionMetadata md = buildMetadataFromRequest(request);
    if (request.resourcePassword != null || request.passwordsMap != null) {
      // Resolve the password locally (explicit or regex-map based) and forward it as a header.
      RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider();
      if (request.resourcePassword != null) {
        passwordProvider.setExplicitPassword(request.resourcePassword);
      }
      if (request.passwordsMap != null) {
        passwordProvider.setPasswordMap(request.passwordsMap);
      }

      String pwd = passwordProvider.getPassword(md);
      if (pwd != null) {
        b.header("Password", pwd);
      }
    }
    if (request.resourceName != null) {
      b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\"");
    }
    b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream));

    // Typed response (was a raw HttpResponse, which forced an unchecked body() result).
    HttpResponse<InputStream> resp =
        httpClient.send(b.build(), HttpResponse.BodyHandlers.ofInputStream());
    int code = resp.statusCode();
    if (code < 200 || code >= 300) {
      throw new SolrException(
          SolrException.ErrorCode.getErrorCode(code),
          "TikaServer " + url + " returned status " + code);
    }
    return resp.body();
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.Utils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Parses Tika Server responses: XHTML from the {@code /tika} endpoint and the JSON array returned
 * by {@code /rmeta}.
 *
 * <p>Thread-safety: a fresh {@link SAXParser} is created per parse because SAXParser instances are
 * not thread-safe and require {@code reset()} between reuses, while a single instance of this class
 * is shared by the backend across concurrent requests.
 */
public class TikaServerParser {
  /** XXE hardening: features disabled on the SAX parser factory. */
  private static final String[] DISABLED_FEATURES = {
    "http://xml.org/sax/features/external-general-entities",
    "http://xml.org/sax/features/external-parameter-entities",
    "http://apache.org/xml/features/nonvalidating/load-external-dtd"
  };

  private final SAXParserFactory factory;

  public TikaServerParser() {
    factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);
    // Apply each feature independently so one unsupported feature does not prevent the rest
    // from being applied (previously a single broad catch skipped the remaining features).
    for (String feature : DISABLED_FEATURES) {
      try {
        factory.setFeature(feature, false);
      } catch (ParserConfigurationException
          | SAXNotRecognizedException
          | SAXNotSupportedException e) {
        // This parser implementation does not support the feature; continue with the others.
      }
    }
    // Fail fast at construction time if no parser can be created at all.
    newSaxParser();
  }

  /**
   * Creates a fresh parser from the configured factory. SAXParserFactory is not guaranteed
   * thread-safe either, so creation is synchronized on it.
   */
  private SAXParser newSaxParser() {
    try {
      synchronized (factory) {
        return factory.newSAXParser();
      }
    } catch (Exception e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  /**
   * Parses response in XML format from Tika Server /tika endpoint. The result is that the metadata
   * object is populated from {@code <meta>} tags and the content handler receives all SAX events.
   */
  public void parseXml(InputStream inputStream, ContentHandler handler, ExtractionMetadata metadata)
      throws IOException, SAXException {
    DefaultHandler metaExtractingHandler = new TikaXmlResponseSaxContentHandler(handler, metadata);
    InputStream sanitizedStream = XmlSanitizingReader.sanitize(inputStream);
    newSaxParser().parse(sanitizedStream, metaExtractingHandler);
  }

  /**
   * Parses the JSON array returned by the /rmeta endpoint: every entry's fields are copied into
   * {@code md}, and each entry's "X-TIKA:content" XHTML payload is re-parsed through {@code
   * handler} (if non-null).
   */
  void parseRmetaJson(InputStream jsonStream, DefaultHandler handler, ExtractionMetadata md)
      throws Exception {
    Object parsed = Utils.fromJSON(jsonStream);
    if (!(parsed instanceof List)) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR, "Unexpected /rmeta response, expected JSON array");
    }
    for (Object element : (List<?>) parsed) {
      if (!(element instanceof Map)) {
        continue;
      }
      Map<?, ?> map = (Map<?, ?>) element;
      // Copy metadata (entrySet avoids a lookup per key, unlike keySet()+get()).
      for (Map.Entry<?, ?> entry : map.entrySet()) {
        String key = String.valueOf(entry.getKey());
        if ("X-TIKA:content".equalsIgnoreCase(key)) {
          continue; // handled below
        }
        Object val = entry.getValue();
        if (val instanceof List) {
          for (Object v : (List<?>) val) {
            if (v != null) {
              md.add(key, String.valueOf(v));
            }
          }
        } else if (val != null) {
          md.add(key, String.valueOf(val));
        }
      }
      Object content = map.get("X-TIKA:content");
      if (content != null && handler != null) {
        String xhtml = String.valueOf(content);
        if (!xhtml.isEmpty()) {
          InputStream inputStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8));
          InputStream sanitizedStream = XmlSanitizingReader.sanitize(inputStream);
          newSaxParser().parse(sanitizedStream, handler);
        }
      }
    }
  }

  /**
   * Custom SAX handler that extracts {@code <meta name=... content=...>} tags found inside {@code
   * <head>} into the metadata object, and forwards every event to the delegate (if any).
   */
  static class TikaXmlResponseSaxContentHandler extends DefaultHandler {
    private final ContentHandler delegate;
    private final ExtractionMetadata metadata;
    private boolean inHead = false;

    public TikaXmlResponseSaxContentHandler(ContentHandler delegate, ExtractionMetadata metadata) {
      this.delegate = delegate;
      this.metadata = metadata;
    }

    @Override
    public void startDocument() throws SAXException {
      if (delegate != null) delegate.startDocument();
    }

    @Override
    public void endDocument() throws SAXException {
      if (delegate != null) delegate.endDocument();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
      // With namespace-aware parsing localName is set; fall back to qName otherwise.
      String ln = localName != null && !localName.isEmpty() ? localName : qName;
      if ("head".equalsIgnoreCase(ln)) {
        inHead = true;
      } else if (inHead && "meta".equalsIgnoreCase(ln) && attributes != null) {
        String name = attributes.getValue("name");
        String content = attributes.getValue("content");
        if (name != null && content != null) {
          metadata.add(name, content);
        }
      }
      if (delegate != null) delegate.startElement(uri, localName, qName, attributes);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
      String ln = localName != null && !localName.isEmpty() ? localName : qName;
      if ("head".equalsIgnoreCase(ln)) {
        inHead = false;
      }
      if (delegate != null) delegate.endElement(uri, localName, qName);
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
      if (delegate != null) delegate.characters(ch, start, length);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
      if (delegate != null) delegate.ignorableWhitespace(ch, start, length);
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
      if (delegate != null) delegate.startPrefixMapping(prefix, uri);
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
      if (delegate != null) delegate.endPrefixMapping(prefix);
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * Make sure the XHTML input is valid XML. Pipe text through this reader before passing it to an XML
 * parser.
 *
 * <p>Filtering rules:
 *
 * <ul>
 *   <li>numeric character entities denoting U+0000 (e.g. {@code &#0;}, {@code &#x0;}) are dropped
 *   <li>code points outside the XML 1.0 "Char" production are dropped, including lone surrogates
 *   <li>all other input, including non-null entities, passes through unchanged
 * </ul>
 *
 * <p>Not thread-safe; wrap a fresh instance around each input.
 */
final class XmlSanitizingReader extends Reader {
  private final Reader in;
  private final StringBuilder entityBuf = new StringBuilder();
  private boolean inEntity = false; // true after reading '&', while buffering a potential entity

  // Characters that did not fit into the caller's buffer; drained first on the next read().
  // (Replaces a pushback field that was assigned but never consumed, silently dropping data.)
  private final StringBuilder pendingOut = new StringBuilder();

  // For surrogate tracking to evaluate XML validity by code point; -1 means none pending.
  private int pendingHighSurrogate = -1;

  public XmlSanitizingReader(Reader in) {
    this.in = in;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (len == 0) {
      return 0;
    }
    // First deliver anything left over from a previous call.
    int written = drainPending(cbuf, off, len);
    while (written < len) {
      int ci = in.read();
      if (ci == -1) {
        if (inEntity) {
          // EOF in the middle of a potential entity: flush it verbatim, consistent with the
          // over-length flush below (previously these characters were silently dropped).
          inEntity = false;
          written = emit(cbuf, off, written, len, entityBuf);
          entityBuf.setLength(0);
        }
        break;
      }
      char ch = (char) ci;

      if (inEntity) {
        entityBuf.append(ch);
        // Stop conditions: entity terminator, or a defensive cap (real entities are short).
        if (ch == ';' || entityBuf.length() > 12) {
          inEntity = false;
          String ent = entityBuf.toString();
          entityBuf.setLength(0);
          if (!isNullNumericEntity(ent)) {
            // Not a null entity: pass it through unchanged (overflow goes to pendingOut).
            written = emit(cbuf, off, written, len, ent);
          }
        }
        continue;
      }
      if (ch == '&') {
        inEntity = true;
        entityBuf.setLength(0);
        entityBuf.append(ch);
        continue; // don't write yet
      }

      // Filter invalid XML 1.0 characters by code point, pairing surrogates first.
      if (Character.isHighSurrogate(ch)) {
        pendingHighSurrogate = ch; // an earlier unpaired high surrogate, if any, is dropped
        continue;
      }
      if (Character.isLowSurrogate(ch)) {
        if (pendingHighSurrogate != -1) {
          int cp = Character.toCodePoint((char) pendingHighSurrogate, ch);
          pendingHighSurrogate = -1;
          if (isAllowedXmlChar(cp)) {
            // Emit the full pair; if only one char fits, the rest lands in pendingOut
            // (previously the low surrogate could be lost when the buffer filled up).
            written = emit(cbuf, off, written, len, new String(Character.toChars(cp)));
          }
        }
        continue; // a lone low surrogate is invalid and dropped
      }
      pendingHighSurrogate = -1; // high surrogate not followed by a low one: drop it

      if (isAllowedXmlChar(ch)) {
        cbuf[off + written++] = ch;
      }
    }
    return written == 0 ? -1 : written;
  }

  /** Copies buffered leftover characters into the caller's buffer; returns chars written. */
  private int drainPending(char[] cbuf, int off, int len) {
    int n = 0;
    while (n < len && n < pendingOut.length()) {
      cbuf[off + n] = pendingOut.charAt(n);
      n++;
    }
    pendingOut.delete(0, n);
    return n;
  }

  /**
   * Writes {@code chars} into the output buffer while space remains; any overflow is queued in
   * {@link #pendingOut} for the next read. Returns the updated written count.
   */
  private int emit(char[] cbuf, int off, int written, int len, CharSequence chars) {
    for (int i = 0; i < chars.length(); i++) {
      char c = chars.charAt(i);
      if (written < len) {
        cbuf[off + written++] = c;
      } else {
        pendingOut.append(c);
      }
    }
    return written;
  }

  @Override
  public boolean ready() throws IOException {
    // Buffered leftovers can be read without blocking.
    return pendingOut.length() > 0 || in.ready();
  }

  @Override
  public void close() throws IOException {
    in.close();
  }

  /** True for numeric entities denoting U+0000: '&#0;', '&#00;', '&#x0;', '&#X00;', etc. */
  private static boolean isNullNumericEntity(String ent) {
    if (ent == null) return false;
    if (!ent.startsWith("&#") || !ent.endsWith(";")) return false;
    String mid = ent.substring(2, ent.length() - 1);
    if (mid.isEmpty()) return false;
    if (mid.charAt(0) == 'x' || mid.charAt(0) == 'X') {
      // hex form: at least one zero after the 'x', and nothing but zeros
      for (int i = 1; i < mid.length(); i++) {
        if (mid.charAt(i) != '0') return false;
      }
      return mid.length() > 1;
    } else {
      // decimal form: one or more zeros
      for (int i = 0; i < mid.length(); i++) {
        if (mid.charAt(i) != '0') return false;
      }
      return true;
    }
  }

  /** XML 1.0 "Char" production: Tab, LF, CR, and the three legal code point ranges. */
  private static boolean isAllowedXmlChar(int cp) {
    return cp == 0x9
        || cp == 0xA
        || cp == 0xD
        || (cp >= 0x20 && cp <= 0xD7FF)
        || (cp >= 0xE000 && cp <= 0xFFFD)
        || (cp >= 0x10000 && cp <= 0x10FFFF);
  }

  /**
   * Wraps a (UTF-8) InputStream so that the bytes read from the returned stream have passed through
   * an {@link XmlSanitizingReader}. A daemon thread pumps data through a pipe; on pump failure the
   * piped stream is closed so the consumer observes the error as EOF/IOException.
   */
  public static InputStream sanitize(InputStream in) throws IOException {
    PipedOutputStream out = new PipedOutputStream();
    PipedInputStream pipedIn = new PipedInputStream(out);

    Reader reader = new XmlSanitizingReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8);

    Thread worker =
        new Thread(
            () -> {
              try (reader;
                  writer) {
                reader.transferTo(writer);
              } catch (IOException e) {
                try {
                  pipedIn.close();
                } catch (IOException ignored) {
                }
              }
            },
            "XmlSanitizingReaderWorker");
    worker.setDaemon(true);
    worker.start();

    return pipedIn;
  }
}
a/solr/modules/extraction/src/test-files/extraction/example.html +++ b/solr/modules/extraction/src/test-files/extraction/example.html @@ -6,8 +6,8 @@

Here is some text

-
Here is some text in a div
-
This has a link.
+

a h1 tag

+

This has a link in a paragraph.

News
  • diff --git a/solr/modules/extraction/src/test-files/extraction/simple.html b/solr/modules/extraction/src/test-files/extraction/simple.html index 3c807fb1d98..3ec4d4e0d01 100644 --- a/solr/modules/extraction/src/test-files/extraction/simple.html +++ b/solr/modules/extraction/src/test-files/extraction/simple.html @@ -10,7 +10,7 @@ Here is some text

    distinct
    words

    -
    Here is some text in a div
    +

    Here is some text in a h1

    This has a link.