
Commit e9974db

Added extra_headers parameter; Added compressed data support
1 parent 80cf6fe commit e9974db

File tree

README.md
pydruid/client.py
tests/test_client.py

3 files changed: +183 -70 lines changed


README.md (+14)
@@ -268,6 +268,20 @@ TABLES
 GoodBye!
 ```
+
+# Compressed Data
+
+Consider using the new "extra_headers" optional parameter to send
+"Accept-Encoding: gzip" and have Druid return the results compressed,
+increasing the performance of the query especially for large data sets.
+
+```python
+from pydruid.client import PyDruid
+
+query = PyDruid(druid_url_goes_here, 'druid/v2', extra_headers={"Accept-Encoding": "gzip"})
+```
+
+
 # Contributing
 
 Contributions are welcomed of course. We like to use `black` and `flake8`.
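
For orientation, a hedged sketch of how the new parameter fits into an end-to-end query. The broker URL, endpoint, and datasource below are placeholders, not anything defined in this commit:

```python
from pydruid.client import PyDruid
from pydruid.utils.aggregators import doublesum

# Placeholder broker URL and datasource -- adjust for your own cluster.
client = PyDruid(
    "http://localhost:8082", "druid/v2", extra_headers={"Accept-Encoding": "gzip"}
)

# The query API itself is unchanged; if Druid answers with
# Content-Encoding: gzip, the client decompresses transparently.
top = client.topn(
    datasource="my_datasource",
    granularity="all",
    intervals="2015-12-29/pt1h",
    aggregations={"count": doublesum("count")},
    dimension="user_name",
    metric="count",
    threshold=10,
)
```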

pydruid/client.py (+31 -4)
@@ -16,6 +16,7 @@
 from __future__ import division
 from __future__ import absolute_import
 
+import sys
 import json
 import re
 
@@ -24,19 +25,35 @@
 from pydruid.query import QueryBuilder
 from base64 import b64encode
 
+if sys.version_info.major == 2 and sys.version_info.minor == 7:
+    import StringIO
+    from gzip import GzipFile
+
+    def decompress(data):
+        infile = StringIO.StringIO()
+        infile.write(data)
+        with GzipFile(fileobj=infile, mode="r") as f:
+            f.rewind()
+            ud = f.read()
+        return ud
+
+
+else:
+    from gzip import decompress
 
 # extract error from the <PRE> tag inside the HTML response
 HTML_ERROR = re.compile("<pre>\\s*(.*?)\\s*</pre>", re.IGNORECASE)
 
 
 class BaseDruidClient(object):
-    def __init__(self, url, endpoint):
+    def __init__(self, url, endpoint, extra_headers=None):
         self.url = url
         self.endpoint = endpoint
         self.query_builder = QueryBuilder()
         self.username = None
         self.password = None
         self.proxies = None
+        self.extra_headers = extra_headers
 
     def set_basic_auth_credentials(self, username, password):
         self.username = username
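
The version check above exists because `gzip.decompress` is only available on Python 3; on 2.7 the shim rebuilds it from `GzipFile`. As a small sanity check of the Python 3 path (not part of this commit):

```python
import gzip

# gzip.compress / gzip.decompress round-trip raw bytes on Python 3,
# which is all the client needs for gzip-encoded Druid responses.
payload = b'[{"result": []}]'
assert gzip.decompress(gzip.compress(payload)) == payload
```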
@@ -55,6 +72,8 @@ def _prepare_url_headers_and_body(self, query):
         else:
             url = self.url + "/" + self.endpoint
         headers = {"Content-Type": "application/json"}
+        if self.extra_headers and isinstance(self.extra_headers, dict):
+            headers.update(self.extra_headers)
         if (self.username is not None) and (self.password is not None):
             authstring = "{}:{}".format(self.username, self.password)
             b64string = b64encode(authstring.encode()).decode()
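
The two added lines merge `extra_headers` into the defaults with plain `dict.update`, so a caller-supplied key can also override the default `Content-Type`. A minimal illustration with made-up values:

```python
# Mirrors the merge in _prepare_url_headers_and_body (illustrative values only).
headers = {"Content-Type": "application/json"}
extra_headers = {"Accept-Encoding": "gzip"}

if extra_headers and isinstance(extra_headers, dict):
    headers.update(extra_headers)

assert headers == {
    "Content-Type": "application/json",
    "Accept-Encoding": "gzip",
}
```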
@@ -542,15 +561,23 @@ class PyDruid(BaseDruidClient):
        1 6 2013-10-04T00:00:00.000Z user_2
     """
 
-    def __init__(self, url, endpoint):
-        super(PyDruid, self).__init__(url, endpoint)
+    def __init__(self, url, endpoint, extra_headers=None):
+        super(PyDruid, self).__init__(url, endpoint, extra_headers)
 
     def _post(self, query):
         try:
             headers, querystr, url = self._prepare_url_headers_and_body(query)
             req = urllib.request.Request(url, querystr, headers)
             res = urllib.request.urlopen(req)
-            data = res.read().decode("utf-8")
+            content_encoding = res.info().get("Content-Encoding")
+            if content_encoding == "gzip":
+                data = decompress(res.read()).decode("utf-8")
+            elif content_encoding:
+                raise ValueError(
+                    "Invalid content encoding: {}".format(content_encoding)
+                )
+            else:
+                data = res.read().decode("utf-8")
             res.close()
         except urllib.error.HTTPError as e:
             err = e.read()
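
The new branch in `_post` accepts exactly two cases: a gzip-encoded body, which is decompressed, and an unencoded body, which is read as before; any other `Content-Encoding` raises `ValueError`. A self-contained sketch of that decision, using a stand-in response object rather than a live `urllib` response:

```python
import gzip


class FakeResponse:
    """Stand-in for the urllib response used by _post (illustrative only)."""

    def __init__(self, body, encoding=None):
        self._body = body
        self._encoding = encoding

    def read(self):
        return self._body

    def info(self):
        # The real info() returns a message object; a dict with .get() is enough here.
        return {"Content-Encoding": self._encoding} if self._encoding else {}


def read_body(res):
    # Mirrors the decision made in PyDruid._post above.
    content_encoding = res.info().get("Content-Encoding")
    if content_encoding == "gzip":
        return gzip.decompress(res.read()).decode("utf-8")
    elif content_encoding:
        raise ValueError("Invalid content encoding: {}".format(content_encoding))
    return res.read().decode("utf-8")


assert read_body(FakeResponse(b'{"ok": true}')) == '{"ok": true}'
assert read_body(FakeResponse(gzip.compress(b'{"ok": true}'), "gzip")) == '{"ok": true}'
```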

tests/test_client.py (+138 -66)
@@ -1,6 +1,6 @@
 # -*- coding: UTF-8 -*-
 import textwrap
-
+import sys
 import pytest
 from mock import patch, Mock
 from six.moves import urllib
@@ -11,29 +11,38 @@
 from pydruid.utils.aggregators import doublesum
 from pydruid.utils.filters import Dimension
 
+if sys.version_info.major == 2 and sys.version_info.minor == 7:
+    from gzip import GzipFile
+
+    def compress(data):
+        out = StringIO()
+        with GzipFile(fileobj=out, mode="w") as f:
+            f.write(data)
+        return out.getvalue()
+
+
+else:
+    from gzip import compress
 
-def create_client():
-    return PyDruid("http://localhost:8083", "druid/v2/")
+
+def create_client(headers=None):
+    return PyDruid("http://localhost:8083", "druid/v2/", headers)
 
 
 def create_blank_query():
-    return Query({}, 'none')
+    return Query({}, "none")
 
 
-def _http_error(code, msg, data = ''):
+def _http_error(code, msg, data=""):
     # Need a file-like object for the response data
     fp = StringIO(data)
     return urllib.error.HTTPError(
-        url='http://fakeurl:8080/druid/v2/',
-        hdrs={},
-        code=code,
-        msg=msg,
-        fp=fp,
+        url="http://fakeurl:8080/druid/v2/", hdrs={}, code=code, msg=msg, fp=fp
     )
 
 
 class TestPyDruid:
-    @patch('pydruid.client.urllib.request.urlopen')
+    @patch("pydruid.client.urllib.request.urlopen")
     def test_druid_returns_error(self, mock_urlopen):
         # given
         mock_urlopen.side_effect = _http_error(500, "Druid error")
@@ -42,20 +51,22 @@ def test_druid_returns_error(self, mock_urlopen):
         # when / then
         with pytest.raises(IOError):
             client.topn(
-                datasource="testdatasource",
-                granularity="all",
-                intervals="2015-12-29/pt1h",
-                aggregations={"count": doublesum("count")},
-                dimension="user_name",
-                metric="count",
-                filter=Dimension("user_lang") == "en",
-                threshold=1,
-                context={"timeout": 1000})
-
-    @patch('pydruid.client.urllib.request.urlopen')
+                datasource="testdatasource",
+                granularity="all",
+                intervals="2015-12-29/pt1h",
+                aggregations={"count": doublesum("count")},
+                dimension="user_name",
+                metric="count",
+                filter=Dimension("user_lang") == "en",
+                threshold=1,
+                context={"timeout": 1000},
+            )
+
+    @patch("pydruid.client.urllib.request.urlopen")
     def test_druid_returns_html_error(self, mock_urlopen):
         # given
-        message = textwrap.dedent("""
+        message = textwrap.dedent(
+            """
             <html>
             <head>
             <meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1"/>
@@ -68,26 +79,31 @@ def test_druid_returns_html_error(self, mock_urlopen):
             <hr /><a href="http://eclipse.org/jetty">Powered by Jetty:// 9.3.19.v20170502</a><hr/>
             </body>
             </html>
-            """).strip()
-        mock_urlopen.side_effect = _http_error(500, 'Internal Server Error', message)
+            """
+        ).strip()
+        mock_urlopen.side_effect = _http_error(500, "Internal Server Error", message)
         client = create_client()
 
         # when / then
         with pytest.raises(IOError) as e:
             client.topn(
-                datasource="testdatasource",
-                granularity="all",
-                intervals="2015-12-29/pt1h",
-                aggregations={"count": doublesum("count")},
-                dimension="user_name",
-                metric="count",
-                filter=Dimension("user_lang") == "en",
-                threshold=1,
-                context={"timeout": 1000})
-
-        assert str(e.value) == textwrap.dedent("""
-            HTTP Error 500: Internal Server Error
-            Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded
+                datasource="testdatasource",
+                granularity="all",
+                intervals="2015-12-29/pt1h",
+                aggregations={"count": doublesum("count")},
+                dimension="user_name",
+                metric="count",
+                filter=Dimension("user_lang") == "en",
+                threshold=1,
+                context={"timeout": 1000},
+            )
+
+        assert (
+            str(e.value)
+            == textwrap.dedent(
+                """
+            HTTP Error 500: Internal Server Error
+            Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded
 Query is: {
     "aggregations": [
         {
@@ -112,9 +128,11 @@ def test_druid_returns_html_error(self, mock_urlopen):
     "queryType": "topN",
     "threshold": 1
 }
-            """).strip()
+            """
+            ).strip()
+        )
 
-    @patch('pydruid.client.urllib.request.urlopen')
+    @patch("pydruid.client.urllib.request.urlopen")
     def test_druid_returns_results(self, mock_urlopen):
         # given
         response = Mock()
@@ -126,28 +144,32 @@ def test_druid_returns_results(self, mock_urlopen):
              "metric" : 100
            } ]
        } ]
-        """.encode("utf-8")
+        """.encode(
+            "utf-8"
+        )
+        response.info.return_value = {}
         mock_urlopen.return_value = response
         client = create_client()
 
         # when
         top = client.topn(
-            datasource="testdatasource",
-            granularity="all",
-            intervals="2015-12-29/pt1h",
-            aggregations={"count": doublesum("count")},
-            dimension="user_name",
-            metric="count",
-            filter=Dimension("user_lang") == "en",
-            threshold=1,
-            context={"timeout": 1000})
+            datasource="testdatasource",
+            granularity="all",
+            intervals="2015-12-29/pt1h",
+            aggregations={"count": doublesum("count")},
+            dimension="user_name",
+            metric="count",
+            filter=Dimension("user_lang") == "en",
+            threshold=1,
+            context={"timeout": 1000},
+        )
 
         # then
         assert top is not None
         assert len(top.result) == 1
-        assert len(top.result[0]['result']) == 1
+        assert len(top.result[0]["result"]) == 1
 
-    @patch('pydruid.client.urllib.request.urlopen')
+    @patch("pydruid.client.urllib.request.urlopen")
     def test_client_allows_to_export_last_query(self, mock_urlopen):
         # given
         response = Mock()
@@ -159,29 +181,79 @@ def test_client_allows_to_export_last_query(self, mock_urlopen):
              "metric" : 100
            } ]
        } ]
-        """.encode("utf-8")
+        """.encode(
+            "utf-8"
+        )
+        response.info.return_value = {}
         mock_urlopen.return_value = response
         client = create_client()
         client.topn(
-            datasource="testdatasource",
-            granularity="all",
-            intervals="2015-12-29/pt1h",
-            aggregations={"count": doublesum("count")},
-            dimension="user_name",
-            metric="count",
-            filter=Dimension("user_lang") == "en",
-            threshold=1,
-            context={"timeout": 1000})
+            datasource="testdatasource",
+            granularity="all",
+            intervals="2015-12-29/pt1h",
+            aggregations={"count": doublesum("count")},
+            dimension="user_name",
+            metric="count",
+            filter=Dimension("user_lang") == "en",
+            threshold=1,
+            context={"timeout": 1000},
+        )
 
         # when / then
-        # assert that last_query.export_tsv method was called (it should throw an exception, given empty path)
+        # assert that last_query.export_tsv method was called (it should throw an
+        # exception, given empty path)
         with pytest.raises(TypeError):
             client.export_tsv(None)
 
-    @patch('pydruid.client.urllib.request.urlopen')
+    @patch("pydruid.client.urllib.request.urlopen")
     def test_client_auth_creds(self, mock_urlopen):
         client = create_client()
         query = create_blank_query()
-        client.set_basic_auth_credentials('myUsername', 'myPassword')
+        client.set_basic_auth_credentials("myUsername", "myPassword")
+        headers, _, _ = client._prepare_url_headers_and_body(query)
+        assert headers["Authorization"] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk"
+
+    def test_client_allows_extra_headers(self):
+        client = create_client(headers={"Accept-Encoding": "gzip"})
+        query = create_blank_query()
         headers, _, _ = client._prepare_url_headers_and_body(query)
-        assert headers['Authorization'] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk"
+        assert headers["Accept-Encoding"] == "gzip"
+
+    @patch("pydruid.client.urllib.request.urlopen")
+    def test_return_compressed_data(self, mock_urlopen):
+        # given
+        response = Mock()
+        response.read.return_value = compress(
+            """
+        [ {
+            "timestamp" : "2015-12-30T14:14:49.000Z",
+            "result" : [ {
+              "dimension" : "aaaa",
+              "metric" : 100
+            } ]
+        } ]
+        """.encode(
+                "utf-8"
+            )
+        )
+        response.info.return_value = {"Content-Encoding": "gzip"}
+        mock_urlopen.return_value = response
+        client = create_client(headers={"Accept-Encoding": "gzip"})
+
+        # when
+        top = client.topn(
+            datasource="testdatasource",
+            granularity="all",
+            intervals="2015-12-29/pt1h",
+            aggregations={"count": doublesum("count")},
+            dimension="user_name",
+            metric="count",
+            filter=Dimension("user_lang") == "en",
+            threshold=1,
+            context={"timeout": 1000},
+        )
+
+        # then
+        assert top is not None
+        assert len(top.result) == 1
+        assert len(top.result[0]["result"]) == 1
