Skip to content

Commit

Permalink
Verify Content-Encoding when querying Content-Length (#1320)
Browse files: browse the repository at this point in the history
  • Loading branch information
nkemnitz authored Aug 10, 2023
1 parent 2aa7589 commit 45a6aec
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 3 deletions.
6 changes: 4 additions & 2 deletions fsspec/implementations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ async def _ls_real(self, url, detail=True, **kwargs):
return list(sorted(out))

async def _ls(self, url, detail=True, **kwargs):

if self.use_listings_cache and url in self.dircache:
out = self.dircache[url]
else:
Expand Down Expand Up @@ -841,7 +840,10 @@ async def _file_info(url, session, size_policy="head", **kwargs):
# or 'Accept-Ranges': 'none' (not 'bytes')
# to mean streaming only, no random access => return None
if "Content-Length" in r.headers:
info["size"] = int(r.headers["Content-Length"])
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
if r.headers.get("Content-Encoding", "identity") == "identity":
info["size"] = int(r.headers["Content-Length"])
elif "Content-Range" in r.headers:
info["size"] = int(r.headers["Content-Range"].split("/")[1])

Expand Down
13 changes: 13 additions & 0 deletions fsspec/implementations/tests/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import fsspec.asyn
import fsspec.utils
from fsspec.implementations.http import HTTPStreamFile
from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401


Expand Down Expand Up @@ -280,6 +281,18 @@ def test_content_length_zero(server):
assert f.read() == data


def test_content_encoding_gzip(server):
    """Check that Content-Length is ignored when the response is gzip-encoded.

    The filesystem sends the ``give_length`` and ``gzip_encoding`` request
    headers, which ask the test server for a ``Content-Length`` header and a
    gzip-compressed body respectively.
    """
    h = fsspec.filesystem(
        "http", headers={"give_length": "true", "gzip_encoding": "true"}
    )
    url = server + "/index/realfile"

    with h.open(url, "rb") as f:
        # Without a trustworthy size, the file is expected to be opened as a
        # streaming file.
        assert isinstance(f, HTTPStreamFile)
        # The advertised Content-Length describes the compressed payload, so
        # no size should be recorded for the file.
        assert f.size is None
        # The bytes read back must equal the original, uncompressed data
        # (presumably decoded transparently by the HTTP client -- confirm).
        assert f.read() == data


def test_download(server, tmpdir):
h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
url = server + "/index/realfile"
Expand Down
14 changes: 13 additions & 1 deletion fsspec/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import gzip
import json
import os
import threading
Expand Down Expand Up @@ -76,7 +77,14 @@ def do_GET(self):
if "use_206" in self.headers:
status = 206
if "give_length" in self.headers:
response_headers = {"Content-Length": len(file_data)}
if "gzip_encoding" in self.headers:
file_data = gzip.compress(file_data)
response_headers = {
"Content-Length": len(file_data),
"Content-Encoding": "gzip",
}
else:
response_headers = {"Content-Length": len(file_data)}
self._respond(status, response_headers, file_data)
elif "give_range" in self.headers:
self._respond(status, {"Content-Range": content_range}, file_data)
Expand Down Expand Up @@ -123,6 +131,10 @@ def do_HEAD(self):
response_headers = {"Content-Length": len(file_data)}
if "zero_length" in self.headers:
response_headers["Content-Length"] = 0
elif "gzip_encoding" in self.headers:
file_data = gzip.compress(file_data)
response_headers["Content-Encoding"] = "gzip"
response_headers["Content-Length"] = len(file_data)

self._respond(200, response_headers)
elif "give_range" in self.headers:
Expand Down

0 comments on commit 45a6aec

Please sign in to comment.