Skip to content

Commit

Permalink
Fix for issue duckdb#14648 (duckdb#15409)
Browse files Browse the repository at this point in the history
  • Loading branch information
hannes authored Dec 19, 2024
2 parents e4e50f9 + c5666c6 commit ab8c909
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 8 deletions.
3 changes: 3 additions & 0 deletions data/csv/comments/14648.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
a,b
a#b
x,y
2 changes: 1 addition & 1 deletion src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace duckdb {
CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
CSVStateMachineCache &state_machine_cache_p, bool default_null_to_varchar_p)
: state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
default_null_to_varchar(default_null_to_varchar_p) {
lines_sniffed(0), default_null_to_varchar(default_null_to_varchar_p) {
// Initialize Format Candidates
for (const auto &format_template : format_template_candidates) {
auto &logical_type = format_template.first;
Expand Down
15 changes: 9 additions & 6 deletions src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,11 @@ string DialectCandidates::Print() {

DialectCandidates::DialectCandidates(const CSVStateMachineOptions &options) {
// assert that quotes, escapes, and quote rules have equal size
auto default_quote = GetDefaultQuote();
auto default_escape = GetDefaultEscape();
auto default_quote_rule = GetDefaultQuoteRule();
auto default_delimiter = GetDefaultDelimiter();
auto default_comment = GetDefaultComment();
const auto default_quote = GetDefaultQuote();
const auto default_escape = GetDefaultEscape();
const auto default_quote_rule = GetDefaultQuoteRule();
const auto default_delimiter = GetDefaultDelimiter();
const auto default_comment = GetDefaultComment();

D_ASSERT(default_quote.size() == default_quote_rule.size() && default_quote_rule.size() == default_escape.size());
// fill the escapes
Expand Down Expand Up @@ -187,6 +187,9 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc

// Returns true if a comment is acceptable
bool AreCommentsAcceptable(const ColumnCountResult &result, idx_t num_cols, bool comment_set_by_user) {
if (comment_set_by_user) {
return true;
}
// For a comment to be acceptable, we want a 3/5ths majority of unmatched in the columns
constexpr double min_majority = 0.6;
// detected comments are all lines that started with a comment character.
Expand All @@ -208,7 +211,7 @@ bool AreCommentsAcceptable(const ColumnCountResult &result, idx_t num_cols, bool
}
}
// If we do not encounter at least one full line comment, we do not consider this comment option.
if (valid_comments == 0 || (!has_full_line_comment && !comment_set_by_user)) {
if (valid_comments == 0 || !has_full_line_comment) {
// this is only valid if our comment character is \0
if (result.state_machine.state_machine_options.comment.GetValue() == '\0') {
return true;
Expand Down
36 changes: 35 additions & 1 deletion test/sql/copy/csv/test_comment_midline.test
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,39 @@
statement ok
PRAGMA enable_verification

query II
from read_csv('data/csv/comments/14648.csv',
header=false,
sep=',',
null_padding=true,
comment='#',
ignore_errors=true
);
----
a b
a NULL
x y

query II
from read_csv('data/csv/comments/14648.csv',
header=false,
sep=',',
comment='#',
ignore_errors=true
);
----
a b
x y

statement error
from read_csv('data/csv/comments/14648.csv',
header=false,
sep=',',
comment='#'
);
----
Error when sniffing file "data/csv/comments/14648.csv".

# If we only have midline comments, the comment must be explicitly given by the user
query I
FROM 'data/csv/comments/only_midline.csv';
Expand Down Expand Up @@ -95,4 +128,5 @@ FROM read_csv('data/csv/comments/simple_mid_line.csv', buffer_size = ${buffer_si
1 3
6 7

endloop
endloop

0 comments on commit ab8c909

Please sign in to comment.