iceberg: add batching parquet writer factory #23683
Changes from 1 commit
@@ -22,6 +22,8 @@
 #include <seastar/coroutine/as_future.hh>
 
 #include <cstdint>
+#include <memory>
+#include <utility>
 
 namespace datalake {
 
@@ -159,4 +161,29 @@ ss::future<> batching_parquet_writer::abort() {
     }
 }
 
+batching_parquet_writer_factory::batching_parquet_writer_factory(
+  std::filesystem::path local_directory,
+  ss::sstring file_name_prefix,
+  size_t row_count_threshold,
+  size_t byte_count_threshold)
+  : _local_directory{std::move(local_directory)}
+  , _file_name_prefix{std::move(file_name_prefix)}
+  , _row_count_threshold{row_count_threshold}
+  , _byte_count_threshold{byte_count_threshold} {}
+
+ss::future<std::unique_ptr<data_writer>>
+batching_parquet_writer_factory::create_writer(iceberg::struct_type schema) {
+    auto ret = std::make_unique<batching_parquet_writer>(
+      std::move(schema), _row_count_threshold, _byte_count_threshold);
+    std::string filename = fmt::format(
+      "{}-{}.parquet", _file_name_prefix, uuid_t::create());
+    std::filesystem::path file_path = _local_directory / filename;
+    auto err = co_await ret->initialize(file_path);
+    if (err != data_writer_error::ok) {
+        // FIXME: This method should return a result and let the multiplexer
+        // deal with it appropriately
+        co_return nullptr;
+    }
+    co_return ret;
+}
 } // namespace datalake

Review comment (on local_directory): todo for future: as we chatted offline, this is internal to the writer, so it should self-manage this path and its lifecycle.

Review comment (on the FIXME): We should probably have thrown at this level instead of the next layer up.
Reply: I'm not sure what you mean by "this level"; do you mean throw an exception up to the multiplexer?
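The FIXME and the exchange above both point at returning a result instead of nullptr. A minimal sketch of that shape, assuming a simple variant-based wrapper; writer_result is hypothetical and not a type from this PR:

#include <memory>
#include <variant>

// Hypothetical wrapper; the codebase's real outcome/result type would be
// used instead of a bare std::variant.
using writer_result
  = std::variant<std::unique_ptr<data_writer>, data_writer_error>;

ss::future<writer_result>
batching_parquet_writer_factory::create_writer(iceberg::struct_type schema) {
    auto ret = std::make_unique<batching_parquet_writer>(
      std::move(schema), _row_count_threshold, _byte_count_threshold);
    std::string filename = fmt::format(
      "{}-{}.parquet", _file_name_prefix, uuid_t::create());
    auto err = co_await ret->initialize(_local_directory / filename);
    if (err != data_writer_error::ok) {
        // Hand the error back as a value; the multiplexer can inspect it
        // instead of checking for nullptr or catching an exception.
        co_return writer_result{err};
    }
    co_return writer_result{std::unique_ptr<data_writer>{std::move(ret)}};
}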
@@ -21,6 +21,7 @@
 #include <base/seastarx.h>
 
+#include <filesystem>
 #include <memory>
 
 namespace datalake {
 // batching_parquet_writer ties together the low-level components for iceberg to
 
@@ -75,4 +76,22 @@ class batching_parquet_writer : public data_writer {
     data_writer_result _result;
 };
 
+class batching_parquet_writer_factory : public data_writer_factory {
+public:
+    batching_parquet_writer_factory(
+      std::filesystem::path local_directory,
+      ss::sstring file_name_prefix,
+      size_t row_count_threshold,
+      size_t byte_count_threshold);
+
+    ss::future<std::unique_ptr<data_writer>>
+    create_writer(iceberg::struct_type schema) override;
+
+private:
+    std::filesystem::path _local_directory;
+    ss::sstring _file_name_prefix;
+    size_t _row_count_threshold;
+    size_t _byte_count_threshold;
+};
+
 } // namespace datalake

Review comment (on the factory class): optional: could have also made this a nested class, so it would be batching_parquet_writer::factory.
Reply: given the number of things stacked above this, and how pervasive a change that would be, I'd rather make it a separate PR.
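For orientation, a hedged usage sketch of the new factory, assuming the PR's headers are included; the directory, prefix, and threshold values below are illustrative, not defaults from the PR:

ss::future<> example(iceberg::struct_type schema) {
    datalake::batching_parquet_writer_factory factory(
      std::filesystem::path{"/tmp/datalake"}, // local scratch directory
      "example-topic",                        // file name prefix
      10000,                                  // row count threshold
      64 * 1024 * 1024);                      // byte count threshold
    auto writer = co_await factory.create_writer(std::move(schema));
    if (!writer) {
        // create_writer currently signals failure with nullptr (see the
        // FIXME above), so callers have to check.
        co_return;
    }
    // writer->add_data_struct(...) would follow, as in record_multiplexer.
}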
@@ -10,7 +10,9 @@
 #include "datalake/record_multiplexer.h"
 
+#include "datalake/data_writer_interface.h"
+#include "datalake/logger.h"
 #include "datalake/schemaless_translator.h"
 #include "datalake/tests/test_data_writer.h"
 #include "iceberg/values.h"
 #include "model/record.h"
 #include "storage/parser_utils.h"
 
@@ -49,13 +51,18 @@ record_multiplexer::operator()(model::record_batch batch) {
         iceberg::struct_value data = translator.translate_event(
           std::move(key), std::move(val), timestamp, offset);
         // Send it to the writer
-        auto& writer = get_writer();
-        data_writer_error writer_status = co_await writer.add_data_struct(
-          std::move(data), estimated_size);
-        if (writer_status != data_writer_error::ok) {
+        try {
+            auto& writer = co_await get_writer();
+            _writer_status = co_await writer.add_data_struct(
+              std::move(data), estimated_size);
+        } catch (const std::runtime_error& err) {
+            datalake_log.error("Failed to add data to writer");
+            _writer_status =data_writer_error::parquet_conversion_error;
+        }
+        if (_writer_status != data_writer_error::ok) {
             // If a write fails, the writer is left in an indeterminate state,
             // we cannot continue in this case.
-            _writer_status = writer_status;
             co_return ss::stop_iteration::yes;
         }
     }

Review comment (on the catch block): Also we should set the _writer_status here.
Reply: done
Review comment: nit: I think _writer_status should probably be initialized to a generic_error and be set to ok when it actually finished.
Review comment (on the assignment): nit: space after =

@@ -86,12 +93,18 @@ schemaless_translator& record_multiplexer::get_translator() {
     return _translator;
 }
 
-data_writer& record_multiplexer::get_writer() {
+ss::future<data_writer&> record_multiplexer::get_writer() {
    if (!_writer) {
         auto& translator = get_translator();
         auto schema = translator.get_schema();
-        _writer = _writer_factory->create_writer(std::move(schema));
+        _writer = co_await _writer_factory->create_writer(std::move(schema));
+        if (!_writer) {
+            // FIXME: modify create_writer to return a result and check that
+            // here. This method should also return a result. That is coming in
+            // one of the next commits. For now throw & catch.
+            throw std::runtime_error("Could not create data writer");
+        }
     }
-    return *_writer;
+    co_return *_writer;
 }
 } // namespace datalake
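One way to read the reviewer's initialization nit above, as a hedged sketch; data_writer_error::generic_error is assumed for illustration, since this diff only shows ok and parquet_conversion_error:

// Start pessimistic, flip to ok only when the write actually completes.
// NOTE: generic_error is a hypothetical enum value for this sketch.
_writer_status = data_writer_error::generic_error;
try {
    auto& writer = co_await get_writer();
    _writer_status = co_await writer.add_data_struct(
      std::move(data), estimated_size);
} catch (const std::runtime_error& err) {
    datalake_log.error("Failed to add data to writer");
    // _writer_status already holds generic_error, so no reset is needed.
}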
Review comment (on the PR): Architecturally, I believe it will be simpler to keep these parquet files in memory. AFAIK we're going to load these fully into memory already to send them, and it's fairly straightforward to have a semaphore to make sure we stay within the subsystem's memory budget. Writing to disk, especially with our trend toward smaller and smaller disks, is going to come with a number of challenges: cleaning up zombie files, integrating with space management, etc. We can do this after the beta phase, but we should keep it in mind as we're structuring the codepaths.
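For the memory-budget idea in this comment, a hedged sketch of bounding in-flight parquet buffers with a Seastar semaphore; the budget size and function are illustrative, not part of the PR:

#include <seastar/core/future.hh>
#include <seastar/core/semaphore.hh>

#include <cstdint>

// Hypothetical subsystem-wide budget: 128 MiB of in-memory parquet data.
static constexpr size_t datalake_memory_budget = 128 * 1024 * 1024;
static thread_local ss::semaphore datalake_memory_sem{datalake_memory_budget};

// Acquire units for a buffer before materializing it; the units return the
// memory to the budget automatically when they go out of scope.
ss::future<> write_file_in_memory(size_t file_size_bytes) {
    auto units = co_await ss::get_units(datalake_memory_sem, file_size_bytes);
    // ... build the parquet file in memory and upload it ...
    // `units` is released here, freeing budget for other writers.
}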
Reply (to the comment above): I agree that we do want to stream directly to S3 eventually. I'd rather stick with this for now so we can get an end-to-end test asap. FWIW, we can stream from disk to S3; we don't need to load it all into memory first. cloud_storage::remote::upload_controller_snapshot is an example of that.
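A hedged sketch of that disk-to-S3 streaming shape: read the finished file in fixed-size chunks rather than loading it whole. The upload_chunk callback is a hypothetical stand-in for the real uploader (e.g. cloud_storage::remote), whose actual API differs:

#include <seastar/core/file.hh>
#include <seastar/core/fstream.hh>
#include <seastar/core/future.hh>
#include <seastar/core/seastar.hh>
#include <seastar/core/temporary_buffer.hh>

#include <filesystem>
#include <functional>

// Stream a file from disk in 128 KiB chunks, handing each chunk to the
// uploader as it is read; peak memory use stays at one chunk.
ss::future<> stream_file_to_uploader(
  std::filesystem::path path,
  std::function<ss::future<>(ss::temporary_buffer<char>)> upload_chunk) {
    auto file = co_await ss::open_file_dma(path.string(), ss::open_flags::ro);
    auto in = ss::make_file_input_stream(std::move(file));
    for (;;) {
        auto buf = co_await in.read_up_to(128 * 1024);
        if (buf.empty()) {
            break; // end of file
        }
        co_await upload_chunk(std::move(buf));
    }
    co_await in.close();
}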