From 45a6aec7da1407243f9767c6ab0cff40efee72eb Mon Sep 17 00:00:00 2001 From: Nico Kemnitz Date: Thu, 10 Aug 2023 17:45:00 +0200 Subject: [PATCH] Verify Content-Encoding when querying Content-Length (#1320) --- fsspec/implementations/http.py | 6 ++++-- fsspec/implementations/tests/test_http.py | 13 +++++++++++++ fsspec/tests/conftest.py | 14 +++++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index afd0c2664..5d118dcbd 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -195,7 +195,6 @@ async def _ls_real(self, url, detail=True, **kwargs): return list(sorted(out)) async def _ls(self, url, detail=True, **kwargs): - if self.use_listings_cache and url in self.dircache: out = self.dircache[url] else: @@ -841,7 +840,10 @@ async def _file_info(url, session, size_policy="head", **kwargs): # or 'Accept-Ranges': 'none' (not 'bytes') # to mean streaming only, no random access => return None if "Content-Length" in r.headers: - info["size"] = int(r.headers["Content-Length"]) + # Some servers may choose to ignore Accept-Encoding and return + # compressed content, in which case the returned size is unreliable. + if r.headers.get("Content-Encoding", "identity") == "identity": + info["size"] = int(r.headers["Content-Length"]) elif "Content-Range" in r.headers: info["size"] = int(r.headers["Content-Range"].split("/")[1]) diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py index d8bd64524..fb3a55cad 100644 --- a/fsspec/implementations/tests/test_http.py +++ b/fsspec/implementations/tests/test_http.py @@ -10,6 +10,7 @@ import fsspec.asyn import fsspec.utils +from fsspec.implementations.http import HTTPStreamFile from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401 @@ -280,6 +281,18 @@ def test_content_length_zero(server): assert f.read() == data +def test_content_encoding_gzip(server): + h = fsspec.filesystem( + "http", headers={"give_length": "true", "gzip_encoding": "true"} + ) + url = server + "/index/realfile" + + with h.open(url, "rb") as f: + assert isinstance(f, HTTPStreamFile) + assert f.size is None + assert f.read() == data + + def test_download(server, tmpdir): h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "}) url = server + "/index/realfile" diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py index fdaf03335..9fdf25b7a 100644 --- a/fsspec/tests/conftest.py +++ b/fsspec/tests/conftest.py @@ -1,4 +1,5 @@ import contextlib +import gzip import json import os import threading @@ -76,7 +77,14 @@ def do_GET(self): if "use_206" in self.headers: status = 206 if "give_length" in self.headers: - response_headers = {"Content-Length": len(file_data)} + if "gzip_encoding" in self.headers: + file_data = gzip.compress(file_data) + response_headers = { + "Content-Length": len(file_data), + "Content-Encoding": "gzip", + } + else: + response_headers = {"Content-Length": len(file_data)} self._respond(status, response_headers, file_data) elif "give_range" in self.headers: self._respond(status, {"Content-Range": content_range}, file_data) @@ -123,6 +131,10 @@ def do_HEAD(self): response_headers = {"Content-Length": len(file_data)} if "zero_length" in self.headers: response_headers["Content-Length"] = 0 + elif "gzip_encoding" in self.headers: + file_data = gzip.compress(file_data) + response_headers["Content-Encoding"] = "gzip" + response_headers["Content-Length"] = len(file_data) self._respond(200, response_headers) elif "give_range" in self.headers: