iceberg: add batching parquet writer factory #23683
datalake/batching_parquet_writer.cc
```diff
@@ -10,9 +10,11 @@
 #include "datalake/batching_parquet_writer.h"
 
+#include "base/vlog.h"
 #include "bytes/iostream.h"
 #include "datalake/arrow_translator.h"
 #include "datalake/data_writer_interface.h"
+#include "datalake/logger.h"
 
 #include <seastar/core/file-types.hh>
 #include <seastar/core/fstream.hh>
@@ -22,6 +24,9 @@
 #include <seastar/coroutine/as_future.hh>
 
+#include <cstdint>
+#include <exception>
+#include <memory>
 #include <utility>
 
 namespace datalake {
```
```diff
@@ -43,20 +48,30 @@ batching_parquet_writer::initialize(std::filesystem::path output_file_path) {
           ss::open_flags::create | ss::open_flags::truncate
           | ss::open_flags::wo);
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error opening output file {}: {}",
+          _output_file_path,
+          std::current_exception());
         co_return data_writer_error::file_io_error;
     }
     bool error = false;
     try {
         _output_stream = co_await ss::make_file_output_stream(_output_file);
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error making output stream for file {}: {}",
+          _output_file_path,
+          std::current_exception());
         error = true;
     }
     if (error) {
         co_await _output_file.close();
         co_return data_writer_error::file_io_error;
     }
 
-    _result.file_path = _output_file_path.string();
+    _result.remote_path = _output_file_path.string();
     co_return data_writer_error::ok;
 }
```
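A note on the error-handling shape here: C++ forbids `co_await` inside a `catch` handler, which is why the writer records failures in a `bool error` flag and performs the awaiting cleanup (closing the file) only after the handler has exited. A minimal sketch of the idiom, with hypothetical `do_io`/`cleanup` helpers:

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>

namespace ss = seastar;

// Hypothetical helpers, for illustration only.
ss::future<> do_io() { return ss::make_ready_future<>(); }
ss::future<> cleanup() { return ss::make_ready_future<>(); }

// The pattern used throughout this file: record the failure inside the
// handler, then do the awaiting cleanup after the handler exits.
ss::future<int> guarded_io() {
    bool error = false;
    try {
        co_await do_io();
    } catch (...) {
        // co_await cleanup();  // ill-formed: co_await cannot appear in a
        //                      // catch handler of a coroutine
        error = true;
    }
    if (error) {
        co_await cleanup();
        co_return -1;
    }
    co_return 0;
}
```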
```diff
@@ -66,6 +81,10 @@ ss::future<data_writer_error> batching_parquet_writer::add_data_struct(
     try {
         _iceberg_to_arrow.add_data(std::move(data));
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error adding data value to Arrow table: {}",
+          std::current_exception());
         error = true;
     }
     if (error) {
@@ -83,7 +102,7 @@ ss::future<data_writer_error> batching_parquet_writer::add_data_struct(
     co_return data_writer_error::ok;
 }
 
-ss::future<result<data_writer_result, data_writer_error>>
+ss::future<result<coordinator::data_file, data_writer_error>>
 batching_parquet_writer::finish() {
     auto write_result = co_await write_row_group();
     if (write_result != data_writer_error::ok) {
@@ -96,6 +115,10 @@ batching_parquet_writer::finish() {
         out = _arrow_to_iobuf.close_and_take_iobuf();
         _result.file_size_bytes += out.size_bytes();
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error closing arrow_to_iobuf stream: {}",
+          std::current_exception());
         error = true;
     }
     if (error) {
@@ -107,6 +130,10 @@ batching_parquet_writer::finish() {
         co_await write_iobuf_to_output_stream(std::move(out), _output_stream);
         co_await _output_stream.close();
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error closing output stream: {}",
+          std::current_exception());
         error = true;
     }
     if (error) {
@@ -126,13 +153,17 @@ ss::future<data_writer_error> batching_parquet_writer::write_row_group() {
     iobuf out;
     try {
         auto chunk = _iceberg_to_arrow.take_chunk();
-        _result.record_count += _row_count;
+        _result.row_count += _row_count;
         _row_count = 0;
         _byte_count = 0;
         _arrow_to_iobuf.add_arrow_array(chunk);
         out = _arrow_to_iobuf.take_iobuf();
         _result.file_size_bytes += out.size_bytes();
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error converting Arrow to Parquet iobuf: {}",
+          std::current_exception());
         error = true;
     }
     if (error) {
@@ -142,6 +173,10 @@ ss::future<data_writer_error> batching_parquet_writer::write_row_group() {
     try {
         co_await write_iobuf_to_output_stream(std::move(out), _output_stream);
     } catch (...) {
+        vlog(
+          datalake_log.error,
+          "Error writing to output stream: {}",
+          std::current_exception());
         error = true;
     }
     if (error) {
```
```diff
@@ -159,4 +194,27 @@ ss::future<> batching_parquet_writer::abort() {
     }
 }
 
+batching_parquet_writer_factory::batching_parquet_writer_factory(
+  std::filesystem::path local_directory,
+  ss::sstring file_name_prefix,
+  size_t row_count_threshold,
+  size_t byte_count_threshold)
+  : _local_directory{std::move(local_directory)}
+  , _file_name_prefix{std::move(file_name_prefix)}
+  , _row_count_threshold{row_count_threshold}
+  , _byte_count_treshold{byte_count_threshold} {}
+
+ss::future<result<ss::shared_ptr<data_writer>, data_writer_error>>
+batching_parquet_writer_factory::create_writer(iceberg::struct_type schema) {
+    auto ret = ss::make_shared<batching_parquet_writer>(
+      std::move(schema), _row_count_threshold, _byte_count_treshold);
+    std::string filename = fmt::format(
+      "{}-{}.parquet", _file_name_prefix, uuid_t::create());
+    std::filesystem::path file_path = _local_directory / filename;
+    auto err = co_await ret->initialize(file_path);
+    if (err != data_writer_error::ok) {
+        co_return err;
+    }
+    co_return ret;
+}
 } // namespace datalake
```

Review comment (on the `local_directory` parameter): todo for future: As we chatted offline, this is internal to the writer, so it should self manage this path and its lifecycle.
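A hedged sketch of how a caller might use the new factory end to end; the staging directory, file prefix, and threshold values below are made up, and the schema is assumed to come from elsewhere:

```cpp
#include <seastar/core/coroutine.hh>

#include "datalake/batching_parquet_writer.h"

// Illustrative caller only: every literal here is an assumption, not a value
// from the PR. Files are staged as "records-<uuid>.parquet" on local disk.
ss::future<datalake::data_writer_error>
write_one_file(iceberg::struct_type schema) {
    datalake::batching_parquet_writer_factory factory(
      "/tmp/datalake",                 // local_directory: staging area on disk
      "records",                       // file_name_prefix
      /*row_count_threshold=*/10'000,
      /*byte_count_threshold=*/64 << 20);

    auto writer_res = co_await factory.create_writer(std::move(schema));
    if (!writer_res.has_value()) {
        co_return writer_res.error();
    }
    auto writer = std::move(writer_res.value());
    // ... co_await writer->add_data_struct(value, approx_size) per record ...
    auto file_res = co_await writer->finish();
    co_return file_res.has_value() ? datalake::data_writer_error::ok
                                   : file_res.error();
}
```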
datalake/batching_parquet_writer.h
```diff
@@ -21,6 +21,7 @@
 #include <base/seastarx.h>
 
+#include <filesystem>
 #include <memory>
 
 namespace datalake {
 // batching_parquet_writer ties together the low-level components for iceberg to
```
```diff
@@ -49,7 +50,8 @@ class batching_parquet_writer : public data_writer {
     ss::future<data_writer_error>
     add_data_struct(iceberg::struct_value data, int64_t approx_size) override;
 
-    ss::future<result<data_writer_result, data_writer_error>> finish() override;
+    ss::future<result<coordinator::data_file, data_writer_error>>
+    finish() override;
 
     // Close the file handle, delete any temporary data and clean up any other
     // state.
```
```diff
@@ -72,7 +74,25 @@ class batching_parquet_writer : public data_writer {
     std::filesystem::path _output_file_path;
     ss::file _output_file;
     ss::output_stream<char> _output_stream;
-    data_writer_result _result;
+    coordinator::data_file _result;
 };
 
+class batching_parquet_writer_factory : public data_writer_factory {
+public:
+    batching_parquet_writer_factory(
+      std::filesystem::path local_directory,
+      ss::sstring file_name_prefix,
+      size_t row_count_threshold,
+      size_t byte_count_threshold);
+
+    ss::future<result<ss::shared_ptr<data_writer>, data_writer_error>>
+    create_writer(iceberg::struct_type schema) override;
+
+private:
+    std::filesystem::path _local_directory;
+    ss::sstring _file_name_prefix;
+    size_t _row_count_threshold;
+    size_t _byte_count_treshold;
+};
+
 } // namespace datalake
```

Review comment (on the new factory class): optional: could have also made this a nested class so it would be

Reply: given the number of things stacked above this, and how pervasive a change that would be, I'd rather make it a separate PR.
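The reviewer's comment is truncated in the page, but the apparent suggestion is a nested factory spelled `batching_parquet_writer::factory`. For illustration, that shape would read roughly as follows (a compilable toy with stand-in types, not code from the PR):

```cpp
#include <memory>

// Stand-in base class; the real one is datalake::data_writer.
struct data_writer {
    virtual ~data_writer() = default;
};

class batching_parquet_writer : public data_writer {
public:
    class factory;  // callers would spell it batching_parquet_writer::factory
};

class batching_parquet_writer::factory {
public:
    std::unique_ptr<data_writer> create_writer() {
        return std::make_unique<batching_parquet_writer>();
    }
};
```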
datalake/data_writer_interface.h
```diff
@@ -10,6 +10,7 @@
 #pragma once
 
 #include "base/outcome.h"
+#include "coordinator/data_file.h"
 #include "datalake/schemaless_translator.h"
 #include "iceberg/datatypes.h"
 #include "iceberg/values.h"
```

Review comment (on the new include): datalake/coordinator/data_file.h
```diff
@@ -23,19 +24,7 @@
     ok = 0,
     parquet_conversion_error,
     file_io_error,
+    no_data,
 };
 
-struct data_writer_result
-  : serde::envelope<
-      data_writer_result,
-      serde::version<0>,
-      serde::compat_version<0>> {
-    ss::sstring file_path = "";
-    size_t record_count = 0;
-    size_t file_size_bytes = 0;
-
-    auto serde_fields() { return std::tie(record_count); }
-};
-
 struct data_writer_error_category : std::error_category {
```
```diff
@@ -49,6 +38,8 @@
         return "Parquet Conversion Error";
     case data_writer_error::file_io_error:
         return "File IO Error";
+    case data_writer_error::no_data:
+        return "No data";
     }
 }
```
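For readers less familiar with `std::error_category`: the machinery that lets an enum like `data_writer_error` interoperate with `std::error_code` typically looks like the sketch below. This is the standard-library pattern in generic form, not necessarily the PR's exact wiring, and the `ok` message string is illustrative:

```cpp
#include <string>
#include <system_error>

enum class data_writer_error {
    ok = 0,
    parquet_conversion_error,
    file_io_error,
    no_data,
};

struct data_writer_error_category final : std::error_category {
    const char* name() const noexcept override { return "data_writer_error"; }
    std::string message(int ev) const override {
        switch (static_cast<data_writer_error>(ev)) {
        case data_writer_error::ok:
            return "Ok";
        case data_writer_error::parquet_conversion_error:
            return "Parquet Conversion Error";
        case data_writer_error::file_io_error:
            return "File IO Error";
        case data_writer_error::no_data:
            return "No data";
        }
        return "Unknown";
    }
};

// With make_error_code and the is_error_code_enum opt-in below,
// `std::error_code ec = data_writer_error::no_data;` works as expected.
inline std::error_code make_error_code(data_writer_error e) noexcept {
    static const data_writer_error_category category;
    return {static_cast<int>(e), category};
}

template<>
struct std::is_error_code_enum<data_writer_error> : std::true_type {};
```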
```diff
@@ -70,15 +61,15 @@ class data_writer {
       iceberg::struct_value /* data */, int64_t /* approx_size */)
       = 0;
 
-    virtual ss::future<result<data_writer_result, data_writer_error>>
+    virtual ss::future<result<coordinator::data_file, data_writer_error>>
     finish() = 0;
 };
 
 class data_writer_factory {
 public:
     virtual ~data_writer_factory() = default;
 
-    virtual std::unique_ptr<data_writer>
+    virtual ss::future<result<ss::shared_ptr<data_writer>, data_writer_error>>
     create_writer(iceberg::struct_type /* schema */) = 0;
 };
```
Review comment: Architecturally, I believe it will be simpler to keep these parquet files in memory. AFAIK we're going to load these fully into memory already to send them, and it's fairly straightforward to have a semaphore to make sure we stay within the subsystem's memory budget. Writing to disk, especially with our trends of smaller and smaller disks, is going to come with a number of challenges: cleaning up zombie files, integrating with space management, etc. Now we can do this after the beta phase, but we should keep this in mind as we're structuring the codepaths. However

Reply: I agree that we do want to stream directly to s3 eventually. I'd rather stick with this for now so we can get an end-to-end test asap. FWIW, we can stream from disk to S3, we don't need to load it all into memory first. cloud_storage::remote::upload_controller_snapshot is an example of that.
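The reply's streaming point is easy to see in Seastar terms: a staged file can be fed to an uploader chunk by chunk without ever holding the whole thing in memory. A minimal sketch, where `upload_stream` and `upload_file` are hypothetical stand-ins (the real-world analog named in the review is `cloud_storage::remote::upload_controller_snapshot`):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/file-types.hh>
#include <seastar/core/fstream.hh>
#include <seastar/core/iostream.hh>
#include <seastar/core/seastar.hh>

#include <filesystem>

namespace ss = seastar;

// Hypothetical uploader standing in for an S3 client: drain the stream chunk
// by chunk, handing each buffer to the sender as it arrives.
ss::future<> upload_stream(ss::input_stream<char>& in) {
    while (true) {
        auto buf = co_await in.read();
        if (buf.empty()) {
            co_return;  // EOF
        }
        // ... pass `buf` to the HTTP/S3 client here ...
    }
}

// Stream a staged parquet file to the uploader; only one chunk at a time is
// resident in memory, regardless of file size.
ss::future<> upload_file(std::filesystem::path path) {
    auto f = co_await ss::open_file_dma(path.string(), ss::open_flags::ro);
    auto in = ss::make_file_input_stream(std::move(f));
    co_await upload_stream(in);
    co_await in.close();
}
```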
FWIW, we can stream from disk to S3, we don't need to load it all into memory first. cloud_storage::remote::upload_controller_snapshot is an example of that.