
Commit 42f27ab

GH-47978: [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus (#47979)
### Rationale for this change

1. Add more compression codecs to seed corpus
2. Tweak fuzz target to make fuzzing slightly faster (around ~30% locally according to my measurements), which will allow testing more mutations per day

### Are these changes tested?

Not specifically by CI, but hopefully they will make fuzzing more efficient.

### Are there any user-facing changes?

No.

* GitHub Issue: #47978

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent b2190db commit 42f27ab
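For context, the fuzz target referred to in the rationale is a libFuzzer harness that hands every mutated input to the `FuzzReader` entry point touched in `reader.cc` below. A minimal sketch of such a harness follows; the header path and the `internal` namespace are assumptions about how Arrow usually wires its Parquet fuzz target, and the harness file itself is not part of this commit.

```cpp
// Sketch of a libFuzzer entry point driving the Parquet Arrow reader.
// Assumption: FuzzReader(const uint8_t*, int64_t) is exported from
// parquet/arrow/reader.h under parquet::arrow::internal (not shown in this diff).
#include <cstdint>

#include "arrow/util/macros.h"
#include "parquet/arrow/reader.h"

extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  // Decoding errors are expected and ignored; the fuzzer only reports
  // crashes, hangs and sanitizer findings.
  auto status =
      ::parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
  ARROW_UNUSED(status);
  return 0;
}
```

The seed corpus generated by `generate_fuzz_corpus.cc` gives such a harness valid Parquet files, now covering several compression codecs and both data page versions, to start mutating from.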

File tree: 3 files changed (+73 lines, -24 lines)


cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 55 additions & 14 deletions
@@ -81,25 +81,68 @@ std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
 }
 
 std::vector<WriteConfig> GetWriteConfigurations() {
+  auto default_properties_builder = [] {
+    auto builder = WriterProperties::Builder();
+    // Override current default of 1MB
+    builder.data_pagesize(10'000);
+    // Reduce max dictionary page size so that less pages are dict-encoded.
+    builder.dictionary_pagesize_limit(1'000);
+    // Emit various physical types for decimal columns
+    builder.enable_store_decimal_as_integer();
+    // DataPageV2 has more interesting features such as selective compression
+    builder.data_page_version(parquet::ParquetDataPageVersion::V2);
+    return builder;
+  };
+
+  auto default_arrow_properties_builder = [] {
+    auto builder = ArrowWriterProperties::Builder();
+    // Store the Arrow schema so as to exercise more data types when reading
+    builder.store_schema();
+    return builder;
+  };
+
   // clang-format off
-  auto w_brotli = WriterProperties::Builder()
-      .disable_dictionary("no_dict")
-      ->compression("compressed", Compression::BROTLI)
-      // Override current default of 1MB
-      ->data_pagesize(20'000)
-      // Reduce max dictionary page size so that less pages are dict-encoded.
-      ->dictionary_pagesize_limit(1'000)
-      // Emit various physical types for decimal columns
-      ->enable_store_decimal_as_integer()
+  auto w_uncompressed = default_properties_builder()
+      .build();
+  // compressed columns with dictionary disabled
+  auto w_brotli = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::BROTLI)
+      ->build();
+  auto w_gzip = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::GZIP)
       ->build();
-  // Store the Arrow schema so as to exercise more data types when reading
-  auto a_default = ArrowWriterProperties::Builder{}
-      .store_schema()
+  auto w_lz4 = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::LZ4)
       ->build();
+  auto w_snappy = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::SNAPPY)
+      ->build();
+  auto w_zstd = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::ZSTD)
+      ->build();
+  // v1 data pages
+  auto w_pages_v1 = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::LZ4)
+      ->data_page_version(parquet::ParquetDataPageVersion::V1)
+      ->build();
+
+  auto a_default = default_arrow_properties_builder().build();
   // clang-format on
 
   std::vector<WriteConfig> configs;
+  configs.push_back({w_uncompressed, a_default});
   configs.push_back({w_brotli, a_default});
+  configs.push_back({w_gzip, a_default});
+  configs.push_back({w_lz4, a_default});
+  configs.push_back({w_snappy, a_default});
+  configs.push_back({w_zstd, a_default});
+  configs.push_back({w_pages_v1, a_default});
   return configs;
 }
 
@@ -255,8 +298,6 @@ Result<std::vector<Column>> ExampleColumns(int32_t length,
 
   // TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
 
-  // A non-dict-encoded column (see GetWriteConfigurations)
-  columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
   // A column that should be quite compressible (see GetWriteConfigurations)
   columns.push_back({"compressed", gen.Int64(length, -10, 10, null_probability)});
 
cpp/src/parquet/arrow/reader.cc

Lines changed: 18 additions & 7 deletions
@@ -1414,21 +1414,32 @@ Status FuzzReader(std::unique_ptr<FileReader> reader) {
 }  // namespace
 
 Status FuzzReader(const uint8_t* data, int64_t size) {
-  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
   Status st;
-  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 1, 13, 300}) {
-    auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
-    FileReaderBuilder builder;
+
+  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+  auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+  auto pool = ::arrow::default_memory_pool();
+
+  // Read Parquet file metadata only once, which will reduce iteration time slightly
+  std::shared_ptr<FileMetaData> pq_md;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS
+  pq_md = ParquetFileReader::Open(file)->metadata();
+  END_PARQUET_CATCH_EXCEPTIONS
+
+  // Note that very small batch sizes probably make fuzzing slower
+  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
     ArrowReaderProperties properties;
     if (batch_size) {
       properties.set_batch_size(batch_size.value());
     }
-    builder.properties(properties);
 
-    RETURN_NOT_OK(builder.Open(std::move(file)));
+    std::unique_ptr<ParquetFileReader> pq_file_reader;
+    BEGIN_PARQUET_CATCH_EXCEPTIONS
+    pq_file_reader = ParquetFileReader::Open(file, default_reader_properties(), pq_md);
+    END_PARQUET_CATCH_EXCEPTIONS
 
     std::unique_ptr<FileReader> reader;
-    RETURN_NOT_OK(builder.Build(&reader));
+    RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));
     st &= FuzzReader(std::move(reader));
   }
   return st;
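The `reader.cc` change above saves time by parsing the Parquet footer once and reusing the resulting `FileMetaData` for every batch-size configuration (it also drops the batch size of 1, which mostly slowed fuzzing down). Below is a hypothetical standalone sketch of the same footer-reuse pattern in ordinary application code; the helper name and batch sizes are illustrative, and unlike the fuzz target it does not wrap footer parsing in `BEGIN_PARQUET_CATCH_EXCEPTIONS`/`END_PARQUET_CATCH_EXCEPTIONS`, so a corrupt file would surface as a `ParquetException`.

```cpp
// Hypothetical example: parse the footer once, then build several Arrow
// readers over the same buffer without re-reading the metadata.
#include <memory>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"
#include "parquet/properties.h"

arrow::Status ReadWithSharedMetadata(std::shared_ptr<arrow::Buffer> buffer) {
  auto file = std::make_shared<arrow::io::BufferReader>(buffer);
  auto pool = arrow::default_memory_pool();

  // Parse the footer a single time; FileMetaData is immutable and shareable.
  std::shared_ptr<parquet::FileMetaData> metadata =
      parquet::ParquetFileReader::Open(file)->metadata();

  for (int64_t batch_size : {64, 1024}) {
    parquet::ArrowReaderProperties properties;
    properties.set_batch_size(batch_size);

    // Reopen against the same input, but skip footer parsing this time.
    auto pq_reader = parquet::ParquetFileReader::Open(
        file, parquet::default_reader_properties(), metadata);

    std::unique_ptr<parquet::arrow::FileReader> reader;
    ARROW_RETURN_NOT_OK(parquet::arrow::FileReader::Make(
        pool, std::move(pq_reader), properties, &reader));

    std::shared_ptr<arrow::Table> table;
    ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
  }
  return arrow::Status::OK();
}
```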

cpp/src/parquet/properties.h

Lines changed: 0 additions & 3 deletions
@@ -324,8 +324,6 @@ class PARQUET_EXPORT WriterProperties {
           content_defined_chunking_options_(
               properties.content_defined_chunking_options()) {}
 
-    virtual ~Builder() {}
-
     /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
     ///
     /// Optimize parquet files for content addressable storage (CAS) systems by writing
@@ -1198,7 +1196,6 @@ class PARQUET_EXPORT ArrowWriterProperties {
           use_threads_(kArrowDefaultUseThreads),
           executor_(NULLPTR),
           write_time_adjusted_to_utc_(false) {}
-    virtual ~Builder() = default;
 
     /// \brief Disable writing legacy int96 timestamps (default disabled).
     Builder* disable_deprecated_int96_timestamps() {
