Skip to content

Commit 90d28d7

Browse files
authored
Merge pull request #103 from sebastian-nagel/duplicated-payload-metadata-actual-content-length
WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length"
2 parents b8a91bb + 6a3cf1b commit 90d28d7

File tree

6 files changed

+123
-8
lines changed

6 files changed

+123
-8
lines changed

src/main/java/org/archive/resource/arc/ARCResource.java

+2
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
6464
}
6565
}
6666

67+
/**
 * Returns a new {@code EOFNotifyingInputStream} built from {@code digIS}
 * and this resource.
 * NOTE(review): presumably this resource's {@code notifyEOF()} is invoked
 * once the returned stream is exhausted; confirm against
 * {@code EOFNotifyingInputStream}.
 */
@Override
6768
public InputStream getInputStream() {
6869
return new EOFNotifyingInputStream(digIS, this);
6970
}
7071

72+
@Override
7173
public void notifyEOF() throws IOException {
7274
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
7375
String digString = Base32.encode(digIS.getMessageDigest().digest());

src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java

+8-3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
3131
parser = new HttpHeaderParser();
3232
}
3333

34+
@Override
3435
public Resource getResource(InputStream is, MetaData parentMetaData,
3536
ResourceContainer container) throws ResourceParseException,
3637
IOException {
@@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
4041
if(headers.isCorrupt()) {
4142
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
4243
}
43-
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
44-
45-
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
44+
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
45+
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
46+
}
47+
long trailingSlopBytes = StreamCopy.readToEOF(is);
48+
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
49+
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
50+
}
4651
if(type != null) {
4752
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
4853
}

src/main/java/org/archive/resource/warc/WARCResource.java

+11-3
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
5353
countingIS = new CountingInputStream(
5454
ByteStreams.limit(response, length));
5555
} else {
56-
throw new ResourceParseException(null);
56+
throw new ResourceParseException(new Exception("Zero or negative length: " + length));
5757
}
5858
try {
5959
digIS = new DigestInputStream(countingIS,
@@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
6363
}
6464
}
6565

66+
/**
 * Returns a new {@code EOFNotifyingInputStream} built from {@code digIS}
 * and this resource.
 * NOTE(review): presumably this resource's {@code notifyEOF()} is invoked
 * once the returned stream is exhausted; confirm against
 * {@code EOFNotifyingInputStream}.
 */
@Override
6667
public InputStream getInputStream() {
6768
return new EOFNotifyingInputStream(digIS, this);
6869
}
6970

71+
@Override
7072
public void notifyEOF() throws IOException {
7173
String digString = Base32.encode(digIS.getMessageDigest().digest());
7274
if(container.isCompressed()) {
73-
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
75+
if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
76+
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
77+
}
7478
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
7579
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
7680
} else {
@@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
8185
(PushBackOneByteInputStream) raw;
8286
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
8387
if(numNewlines > 0) {
84-
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
88+
long payloadLength = countingIS.getCount();
89+
if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
90+
metaData.putLong(PAYLOAD_LENGTH, payloadLength);
91+
}
8592
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
8693
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
8794
}
8895
}
8996
}
9097
}
98+
9199
/**
 * Returns the {@code MetaData} object held in the {@code envelope} field.
 */
public MetaData getEnvelopeMetaData() {
92100
return envelope;
93101
}

src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java

+8-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
2121
parser = new HttpHeaderParser();
2222
}
2323

24+
@Override
2425
public Resource getResource(InputStream is, MetaData parentMetaData,
2526
ResourceContainer container) throws ResourceParseException,
2627
IOException {
@@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
3334
if(headers.isCorrupt()) {
3435
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
3536
}
36-
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
37-
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
37+
long trailingSlopBytes = StreamCopy.readToEOF(is);
38+
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
39+
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
40+
}
41+
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
42+
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
43+
}
3844
return new WARCMetaDataResource(md,container, headers);
3945

4046
} catch (HttpParseException e) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package org.archive.resource.arc;
2+
3+
4+
import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
5+
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
6+
7+
import java.io.IOException;
8+
9+
import org.archive.extract.ExtractingResourceFactoryMapper;
10+
import org.archive.extract.ExtractingResourceProducer;
11+
import org.archive.extract.ProducerUtils;
12+
import org.archive.extract.ResourceFactoryMapper;
13+
import org.archive.resource.Resource;
14+
import org.archive.resource.ResourceParseException;
15+
import org.archive.resource.ResourceProducer;
16+
import org.archive.util.StreamCopy;
17+
18+
import org.json.JSONObject;
19+
20+
import junit.framework.TestCase;
21+
22+
public class ARCResourceTest extends TestCase {
23+
24+
public void testARCResource() throws ResourceParseException, IOException {
25+
String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
26+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
27+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
28+
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
29+
30+
Resource resource = extractor.getNext();
31+
32+
while (resource != null) {
33+
JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
34+
.getJSONObject("Payload-Metadata");
35+
System.err.println(payloadMD);
36+
37+
if (payloadMD.has(PAYLOAD_LENGTH)) {
38+
assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
39+
}
40+
if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
41+
// does not occur with the tested ARC file
42+
}
43+
44+
StreamCopy.readToEOF(resource.getInputStream());
45+
resource = extractor.getNext();
46+
}
47+
}
48+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package org.archive.resource.warc;
2+
3+
import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
4+
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
5+
6+
import java.io.IOException;
7+
8+
import org.archive.extract.ExtractingResourceFactoryMapper;
9+
import org.archive.extract.ExtractingResourceProducer;
10+
import org.archive.extract.ProducerUtils;
11+
import org.archive.extract.ResourceFactoryMapper;
12+
import org.archive.resource.Resource;
13+
import org.archive.resource.ResourceParseException;
14+
import org.archive.resource.ResourceProducer;
15+
import org.archive.util.StreamCopy;
16+
17+
import org.json.JSONObject;
18+
19+
import junit.framework.TestCase;
20+
21+
public class WARCResourceTest extends TestCase {
22+
23+
public void testWARCResource() throws ResourceParseException, IOException {
24+
String testFileName = "../../format/warc/IAH-urls-wget.warc";
25+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
26+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
27+
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
28+
29+
Resource resource = extractor.getNext();
30+
31+
while (resource != null) {
32+
JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
33+
.getJSONObject("Payload-Metadata");
34+
35+
if (payloadMD.has(PAYLOAD_LENGTH)) {
36+
assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
37+
}
38+
if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
39+
assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES));
40+
}
41+
42+
StreamCopy.readToEOF(resource.getInputStream());
43+
resource = extractor.getNext();
44+
}
45+
}
46+
}

0 commit comments

Comments
 (0)