Skip to content

Commit a3794ce

Browse files
committed
Use json response from Tika
Cleanup TestContainer; Refactor ExtractionMetadata; Add returnType to ExtractionRequest; Remove static initializers
1 parent 11ea400 commit a3794ce

File tree

12 files changed

+304
-219
lines changed

12 files changed

+304
-219
lines changed

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ public String name() {
2929

3030
@Override
3131
public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) {
32-
ExtractionMetadata metadata = new SimpleExtractionMetadata();
32+
ExtractionMetadata metadata = new ExtractionMetadata();
3333
metadata.add("Dummy-Backend", "true");
3434
metadata.add(
3535
"Content-Type",
@@ -43,7 +43,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque
4343

4444
@Override
4545
public ExtractionResult extractOnly(
46-
InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) {
46+
InputStream inputStream, ExtractionRequest request, String xpathExpr) {
4747
if (xpathExpr != null) {
4848
throw new UnsupportedOperationException("XPath not supported by dummy backend");
4949
}

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,8 @@ public void load(
102102

103103
String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
104104
boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
105+
String extractFormat =
106+
params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? XML_FORMAT : TEXT_FORMAT);
105107

106108
// Parse optional passwords file into a map (keeps Tika usages out of this class)
107109
LinkedHashMap<Pattern, String> pwMap = null;
@@ -122,7 +124,8 @@ public void load(
122124
stream.getSourceInfo(),
123125
stream.getSize(),
124126
params.get(ExtractingParams.RESOURCE_PASSWORD, null),
125-
pwMap);
127+
pwMap,
128+
extractFormat);
126129

127130
boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false);
128131
String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS);
@@ -135,10 +138,8 @@ public void load(
135138
|| (passwordsFile != null);
136139

137140
if (extractOnly) {
138-
String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT);
139141
try {
140-
ExtractionResult result =
141-
backend.extractOnly(inputStream, extractionRequest, extractFormat, xpathExpr);
142+
ExtractionResult result = backend.extractOnly(inputStream, extractionRequest, xpathExpr);
142143
// Write content
143144
rsp.add(stream.getName(), result.getContent());
144145
// Write metadata
@@ -165,7 +166,7 @@ public void load(
165166

166167
if (needLegacySax) {
167168
// Indexing with capture/xpath/etc: delegate SAX parse to backend
168-
SimpleExtractionMetadata neutral = new SimpleExtractionMetadata();
169+
ExtractionMetadata neutral = new ExtractionMetadata();
169170
SolrContentHandler handler =
170171
factory.createSolrContentHandler(neutral, params, req.getSchema());
171172
try {
@@ -194,8 +195,7 @@ public void load(
194195
log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e);
195196
// Index a document with literals only (no extracted content/metadata)
196197
SolrContentHandler handler =
197-
factory.createSolrContentHandler(
198-
new SimpleExtractionMetadata(), params, req.getSchema());
198+
factory.createSolrContentHandler(new ExtractionMetadata(), params, req.getSchema());
199199
addDoc(handler);
200200
return;
201201
}

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,7 @@ public interface ExtractionBackend {
3232
* xpathExpr; if unsupported and xpathExpr is not null, they should throw
3333
* UnsupportedOperationException.
3434
*/
35-
ExtractionResult extractOnly(
36-
InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr)
35+
ExtractionResult extractOnly(InputStream inputStream, ExtractionRequest request, String xpathExpr)
3736
throws Exception;
3837

3938
/**

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,64 @@
1616
*/
1717
package org.apache.solr.handler.extraction;
1818

19-
/**
20-
* Neutral metadata container used by extraction backends. Provides minimal operations needed by
21-
* SolrContentHandler and response building without depending on Apache Tika's Metadata class.
22-
*/
23-
public interface ExtractionMetadata {
24-
void add(String name, String value);
19+
import java.util.ArrayList;
20+
import java.util.LinkedHashMap;
21+
import java.util.List;
22+
import java.util.Map;
23+
import java.util.Objects;
24+
25+
/** Simple metadata bean */
26+
public class ExtractionMetadata {
27+
private final Map<String, List<String>> map = new LinkedHashMap<>();
28+
29+
public void add(String name, String value) {
30+
if (name == null || value == null) return;
31+
map.computeIfAbsent(name, k -> new ArrayList<>()).add(value);
32+
}
33+
34+
public String[] getValues(String name) {
35+
List<String> vals = map.get(name);
36+
if (vals == null) return new String[0];
37+
return vals.toArray(new String[0]);
38+
}
39+
40+
public String get(String name) {
41+
List<String> vals = map.get(name);
42+
if (vals == null || vals.isEmpty()) return null;
43+
return vals.get(0);
44+
}
45+
46+
public String[] names() {
47+
return map.keySet().toArray(new String[0]);
48+
}
49+
50+
public void remove(String name) {
51+
map.remove(name);
52+
}
2553

26-
String[] getValues(String name);
54+
@Override
55+
public String toString() {
56+
StringBuilder sb = new StringBuilder("ExtractionMetadata{");
57+
boolean first = true;
58+
for (Map.Entry<String, List<String>> e : map.entrySet()) {
59+
if (!first) sb.append(", ");
60+
first = false;
61+
sb.append(e.getKey()).append('=').append(e.getValue());
62+
}
63+
sb.append('}');
64+
return sb.toString();
65+
}
2766

28-
String get(String name);
67+
@Override
68+
public boolean equals(Object obj) {
69+
if (this == obj) return true;
70+
if (!(obj instanceof ExtractionMetadata)) return false;
71+
ExtractionMetadata that = (ExtractionMetadata) obj;
72+
return Objects.equals(this.map, that.map);
73+
}
2974

30-
String[] names();
75+
@Override
76+
public int hashCode() {
77+
return Objects.hash(map);
78+
}
3179
}

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ public class ExtractionRequest {
2828
public final String resourcePassword; // optional password for encrypted docs
2929
public final java.util.LinkedHashMap<java.util.regex.Pattern, String>
3030
passwordsMap; // optional passwords map
31+
public final String extractFormat;
3132

3233
public ExtractionRequest(
3334
String streamType,
@@ -38,7 +39,8 @@ public ExtractionRequest(
3839
String streamSourceInfo,
3940
Long streamSize,
4041
String resourcePassword,
41-
java.util.LinkedHashMap<java.util.regex.Pattern, String> passwordsMap) {
42+
java.util.LinkedHashMap<java.util.regex.Pattern, String> passwordsMap,
43+
String extractFormat) {
4244
this.streamType = streamType;
4345
this.resourceName = resourceName;
4446
this.contentType = contentType;
@@ -48,5 +50,6 @@ public ExtractionRequest(
4850
this.streamSize = streamSize;
4951
this.resourcePassword = resourcePassword;
5052
this.passwordsMap = passwordsMap;
53+
this.extractFormat = extractFormat;
5154
}
5255
}

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ private ParseContext buildContext(Parser parser, ExtractionRequest request) {
144144
}
145145

146146
private static ExtractionMetadata copyToNeutral(Metadata md) {
147-
ExtractionMetadata out = new SimpleExtractionMetadata();
147+
ExtractionMetadata out = new ExtractionMetadata();
148148
for (String name : md.names()) {
149149
String[] vals = md.getValues(name);
150150
if (vals != null) for (String v : vals) out.add(name, v);
@@ -168,8 +168,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque
168168

169169
@Override
170170
public ExtractionResult extractOnly(
171-
InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr)
172-
throws Exception {
171+
InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception {
173172
Parser parser = selectParser(request);
174173
if (parser == null) {
175174
throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType);
@@ -178,7 +177,7 @@ public ExtractionResult extractOnly(
178177
ParseContext context = buildContext(parser, request);
179178

180179
String content;
181-
if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractFormat) || xpathExpr != null) {
180+
if (ExtractingDocumentLoader.TEXT_FORMAT.equals(request.extractFormat) || xpathExpr != null) {
182181
org.apache.tika.sax.ToTextContentHandler textHandler =
183182
new org.apache.tika.sax.ToTextContentHandler();
184183
org.xml.sax.ContentHandler ch = textHandler;

solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java

Lines changed: 0 additions & 52 deletions
This file was deleted.

0 commit comments

Comments
 (0)