Skip to content

Commit 1d9a16e

Browse files
committed
Backport downloader features from red-datasets
1 parent 511f66d commit 1d9a16e

File tree

2 files changed

+140
-21
lines changed

2 files changed

+140
-21
lines changed

lib/remote_input/downloader.rb

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,20 @@ module RemoteInput
1212
class Downloader
1313
class TooManyRedirects < Error; end
1414

15-
def initialize(url)
16-
if url.is_a?(URI::Generic)
17-
url = url.dup
18-
else
19-
url = URI.parse(url)
20-
end
21-
@url = url
22-
unless @url.is_a?(URI::HTTP)
23-
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
24-
end
15+
# Creates a downloader for +url+.
#
# url::             primary location (String or URI); run through
#                   normalize_url, which rejects non-HTTP(S) URLs
#                   with ArgumentError.
# fallback_urls::   zero or more alternative locations, each run through
#                   normalize_url in the order given.
# http_method::     stored as given (nil by default); start_http sends a
#                   POST only when this is :post.
# http_parameters:: stored as given (nil by default); form-encoded as the
#                   POST body by start_http.
def initialize(url, *fallback_urls, http_method: nil, http_parameters: nil)
  @url = normalize_url(url)
  @fallback_urls = fallback_urls.map do |candidate|
    normalize_url(candidate)
  end
  @http_parameters = http_parameters
  @http_method = http_method
end
2621

2722
def download(output_path, &block)
28-
if output_path.exist?
29-
yield_chunks(output_path, &block) if block_given?
30-
return
31-
end
23+
return if use_cache(output_path, &block)
3224

3325
partial_output_path = Pathname.new("#{output_path}.partial")
3426
synchronize(output_path, partial_output_path) do
27+
return if use_cache(output_path, &block)
28+
3529
output_path.parent.mkpath
3630

3731
n_retries = 0
@@ -47,7 +41,7 @@ def download(output_path, &block)
4741
headers["Range"] = "bytes=#{start}-"
4842
end
4943

50-
start_http(@url, headers) do |response|
44+
start_http(@url, @fallback_urls, headers) do |response|
5145
if response.is_a?(Net::HTTPPartialContent)
5246
mode = "ab"
5347
else
@@ -87,6 +81,27 @@ def download(output_path, &block)
8781
end
8882
end
8983

84+
# Turns +url+ into a URI object and checks that it is an HTTP(S) URL.
#
# A URI::Generic argument is duplicated; any other value is handed to
# URI.parse. Returns the resulting URI.
#
# Raises ArgumentError when the result is not a URI::HTTP (URI::HTTPS is
# a subclass of URI::HTTP, so https URLs pass).
private def normalize_url(url)
  uri = url.is_a?(URI::Generic) ? url.dup : URI.parse(url)
  return uri if uri.is_a?(URI::HTTP)

  raise ArgumentError, "download URL must be HTTP or HTTPS: <#{uri}>"
end
95+
96+
# Reports whether +output_path+ already exists on disk.
#
# When the file exists and a block was given, the file is streamed to
# the block through yield_chunks first. Returns true when the file
# exists and false otherwise.
private def use_cache(output_path, &block)
  return false unless output_path.exist?

  yield_chunks(output_path, &block) if block_given?
  true
end
104+
90105
private def synchronize(output_path, partial_output_path)
91106
begin
92107
Process.getpgid(Process.pid)
@@ -106,7 +121,8 @@ def download(output_path, &block)
106121
rescue ArgumentError
107122
# The process that acquired the lock will be exited before
108123
# it stores its process ID.
109-
valid_lock_path = (lock_path.mtime > 10)
124+
elapsed_time = Time.now - lock_path.mtime
125+
valid_lock_path = (elapsed_time > 10)
110126
else
111127
begin
112128
Process.getpgid(pid)
@@ -135,7 +151,7 @@ def download(output_path, &block)
135151
end
136152
end
137153

138-
private def start_http(url, headers, limit = 10, &block)
154+
private def start_http(url, fallback_urls, headers, limit = 10, &block)
139155
if limit == 0
140156
raise TooManyRedirects, "too many redirections: #{url}"
141157
end
@@ -145,16 +161,47 @@ def download(output_path, &block)
145161
http.start do
146162
path = url.path
147163
path += "?#{url.query}" if url.query
148-
request = Net::HTTP::Get.new(path, headers)
164+
# Build the request for this hop.
#
# Per-request headers are finalized *before* the request object is
# constructed: the header hash given to Net::HTTP::Get/Post.new is read
# when the request is created, so merging into `headers` afterwards does
# not change the request that is about to be sent.
is_post = (@http_method == :post)
body = nil
if is_post
  # TODO: We may want to add @http_content_type, @http_body
  # and so on.
  if @http_parameters
    body = URI.encode_www_form(@http_parameters)
    content_type = "application/x-www-form-urlencoded"
    # Caller-supplied headers take precedence over the default
    # Content-Type.
    headers = {"Content-Type" => content_type}.merge(headers)
  else
    body = ""
  end
end
request_headers = headers
if url.scheme == "https" and url.host == "api.github.com"
  gh_token = ENV["GH_TOKEN"]
  if gh_token
    # Kept out of `headers` on purpose: `headers` is handed on to the
    # redirect and fallback requests below, and the token must only
    # ever be sent to api.github.com.
    request_headers = headers.merge("Authorization" => "Bearer #{gh_token}")
  end
end
if is_post
  request = Net::HTTP::Post.new(path, request_headers)
  request.body = body
else
  request = Net::HTTP::Get.new(path, request_headers)
end
149185
http.request(request) do |response|
150186
case response
151187
when Net::HTTPSuccess, Net::HTTPPartialContent
152188
return block.call(response)
153189
when Net::HTTPRedirection
154190
url = URI.parse(response[:location])
155191
$stderr.puts "Redirect to #{url}"
156-
return start_http(url, headers, limit - 1, &block)
192+
return start_http(url, fallback_urls, headers, limit - 1, &block)
157193
else
194+
case response
195+
when Net::HTTPForbidden, Net::HTTPNotFound
196+
next_url, *rest_fallback_urls = fallback_urls
197+
if next_url
198+
message = "#{response.code}: #{response.message}: " +
199+
"fallback: <#{url}> -> <#{next_url}>"
200+
$stderr.puts(message)
201+
return start_http(next_url, rest_fallback_urls, headers, &block)
202+
end
203+
end
204+
158205
message = response.code
159206
if response.message and not response.message.empty?
160207
message += ": #{response.message}"
@@ -169,7 +216,7 @@ def download(output_path, &block)
169216
private def yield_chunks(path)
170217
path.open("rb") do |output|
171218
chunk_size = 1024 * 1024
172-
chunk = ""
219+
chunk = +""
173220
while output.read(chunk_size, chunk)
174221
yield(chunk)
175222
end

test/test-downloader.rb

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,43 @@
11
class DownloaderTest < Test::Unit::TestCase
22
include Helper::Sandbox
33

4+
# Constructor behavior: URL normalization, fallback list, HTTP options,
# and rejection of non-HTTP(S) URLs.
sub_test_case("#initialize") do
  # A lone URL is parsed and the fallback list stays empty.
  test("single URL") do
    primary = "https://example.com/file"
    subject = RemoteInput::Downloader.new(primary)

    assert_equal(URI.parse(primary), subject.instance_variable_get(:@url))
    assert_equal([], subject.instance_variable_get(:@fallback_urls))
  end

  # Extra positional URLs are parsed and kept in the order given.
  test("with fallback URLs") do
    primary = "https://example.com/file"
    first_mirror = "https://mirror1.example.com/file"
    second_mirror = "https://mirror2.example.com/file"
    subject = RemoteInput::Downloader.new(primary, first_mirror, second_mirror)

    expected_fallbacks = [URI.parse(first_mirror), URI.parse(second_mirror)]
    assert_equal(URI.parse(primary), subject.instance_variable_get(:@url))
    assert_equal(expected_fallbacks,
                 subject.instance_variable_get(:@fallback_urls))
  end

  # The HTTP keyword options are stored unchanged.
  test("with HTTP method and parameters") do
    form = { key: "value", data: "test" }
    subject = RemoteInput::Downloader.new("https://example.com/api",
                                          http_method: :post,
                                          http_parameters: form)

    assert_equal(:post, subject.instance_variable_get(:@http_method))
    assert_equal(form, subject.instance_variable_get(:@http_parameters))
  end

  # Only HTTP and HTTPS URLs are accepted.
  test("invalid URL") do
    assert_raise(ArgumentError) do
      RemoteInput::Downloader.new("ftp://example.com/file")
    end
  end
end
40+
441
sub_test_case("#download") do
542
def setup
643
setup_sandbox
@@ -17,13 +54,48 @@ def teardown
1754
output_path = @tmp_dir + "file"
1855
downloader = RemoteInput::Downloader.new(first_url)
1956

20-
downloader.define_singleton_method(:start_http) do |url, headers|
57+
downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
2158
raise RemoteInput::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
2259
end
2360

2461
assert_raise(RemoteInput::Downloader::TooManyRedirects.new(expected_message)) do
2562
downloader.download(output_path)
2663
end
2764
end
65+
66+
# An existing output file short-circuits the download: no HTTP request
# is started and the file content is left as it was.
test("use cache when file exists") do
  cached_path = @tmp_dir + "cached_file"
  cached_path.write("cached content")

  subject = RemoteInput::Downloader.new("https://example.com/file")
  # Any attempt to reach the network fails the test.
  subject.define_singleton_method(:start_http) do |_url, _fallback_urls, _headers|
    flunk("start_http should not be called when file exists")
  end

  subject.download(cached_path)

  assert_equal("cached content", cached_path.read)
end
80+
end
81+
82+
# Fallback URL bookkeeping, run inside the sandbox helpers.
sub_test_case("fallback URLs") do
  def setup
    setup_sandbox
  end

  def teardown
    teardown_sandbox
  end

  # A single fallback URL is parsed and stored as a one-element list.
  test("fallback URLs are stored correctly") do
    primary = "https://example.com/file"
    mirror = "https://mirror.example.com/file"

    subject = RemoteInput::Downloader.new(primary, mirror)

    assert_equal([URI.parse(mirror)],
                 subject.instance_variable_get(:@fallback_urls))
  end
end
29101
end

0 commit comments

Comments
 (0)