Merged (changes from 14 commits)
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions src/query/formats/Cargo.toml
@@ -34,6 +34,7 @@ match-template = { workspace = true }
micromarshal = { workspace = true }
num = { workspace = true }
num-traits = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }

[dev-dependencies]
44 changes: 37 additions & 7 deletions src/query/formats/src/field_decoder/nested.rs
@@ -56,6 +56,9 @@ use databend_common_io::parse_bitmap;
use databend_common_io::parse_bytes_to_ewkb;
use jsonb::parse_owned_jsonb_with_buf;
use lexical_core::FromLexical;
use serde::Deserialize;
use serde_json::Deserializer;
use serde_json::value::RawValue;

use crate::FileFormatOptionsExt;
use crate::InputCommonSettings;
@@ -209,7 +212,7 @@ impl NestedValues {
column: &mut StringColumnBuilder,
reader: &mut Cursor<R>,
) -> Result<()> {
reader.read_quoted_text(&mut column.row_buffer, b'\'')?;
self.read_string_inner(reader, &mut column.row_buffer)?;
column.commit_row();
Ok(())
}
@@ -220,7 +223,7 @@
reader: &mut Cursor<R>,
) -> Result<()> {
let mut buf = Vec::new();
reader.read_quoted_text(&mut buf, b'\'')?;
self.read_string_inner(reader, &mut buf)?;
let decoded = decode_binary(&buf, self.common_settings.binary_format)?;
column.put_slice(&decoded);
column.commit_row();
@@ -232,7 +235,10 @@
reader: &mut Cursor<R>,
out_buf: &mut Vec<u8>,
) -> Result<()> {
reader.read_quoted_text(out_buf, b'\'')?;
if reader.read_quoted_text(out_buf, b'"').is_err() {
// Fall back to single-quoted text, for compatibility with the previous implementation
reader.read_quoted_text(out_buf, b'\'')?;
}
Ok(())
}

@@ -320,8 +326,13 @@ impl NestedValues {
column: &mut BinaryColumnBuilder,
reader: &mut Cursor<R>,
) -> Result<()> {
let mut buf = Vec::new();
self.read_string_inner(reader, &mut buf)?;
let buf = if let Ok(val) = self.read_json(reader) {
val.as_bytes().to_vec()
} else {
let mut buf = Vec::new();
reader.read_quoted_text(&mut buf, b'\'')?;
buf
};
match parse_owned_jsonb_with_buf(&buf, &mut column.data) {
Ok(_) => {
column.commit_row();
@@ -343,7 +354,12 @@
reader: &mut Cursor<R>,
) -> Result<()> {
let mut buf = Vec::new();
self.read_string_inner(reader, &mut buf)?;
if reader.read_quoted_text(&mut buf, b'"').is_err()
&& reader.read_quoted_text(&mut buf, b'\'').is_err()
{
let val = self.read_json(reader)?;
buf = val.as_bytes().to_vec();
}
let geom = parse_bytes_to_ewkb(&buf, None)?;
column.put_slice(geom.as_bytes());
column.commit_row();
@@ -356,13 +372,27 @@
reader: &mut Cursor<R>,
) -> Result<()> {
let mut buf = Vec::new();
self.read_string_inner(reader, &mut buf)?;
if reader.read_quoted_text(&mut buf, b'"').is_err()
&& reader.read_quoted_text(&mut buf, b'\'').is_err()
{
let val = self.read_json(reader)?;
buf = val.as_bytes().to_vec();
}
let geog = geography_from_ewkt_bytes(&buf)?;
column.put_slice(geog.as_bytes());
column.commit_row();
Ok(())
}

fn read_json<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<String> {
let start = reader.position() as usize;
let data = reader.get_ref().as_ref();
let mut deserializer = Deserializer::from_slice(&data[start..]);
let raw: Box<RawValue> = Box::<RawValue>::deserialize(&mut deserializer)?;
reader.set_position((start + raw.get().len()) as u64);
Ok(raw.to_string())
}

fn read_nullable<R: AsRef<[u8]>>(
&self,
column: &mut NullableColumnBuilder<AnyType>,
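For context, this is a minimal standalone sketch of the cursor technique the new `read_json` helper uses: deserialize one `serde_json::value::RawValue` from the unread portion of the buffer, then advance the cursor past exactly the bytes that value occupied. It assumes `serde_json` with the `raw_value` feature enabled; the function and variable names are illustrative, not the actual Databend APIs.

```rust
use std::io::Cursor;

use serde::Deserialize;
use serde_json::value::RawValue; // requires serde_json's "raw_value" feature
use serde_json::Deserializer;

// Read one raw JSON value starting at the cursor position and move the cursor
// to the first byte after it, so the caller can keep parsing the rest of the
// row (a trailing ',' or ')', for example).
fn read_json_value<R: AsRef<[u8]>>(reader: &mut Cursor<R>) -> serde_json::Result<String> {
    let start = reader.position() as usize;
    let data = reader.get_ref().as_ref();
    let mut de = Deserializer::from_slice(&data[start..]);
    let raw: Box<RawValue> = Box::<RawValue>::deserialize(&mut de)?;
    reader.set_position((start + raw.get().len()) as u64);
    Ok(raw.get().to_string())
}

fn main() -> serde_json::Result<()> {
    let mut cur = Cursor::new(br#"{"a":1,"b":[2,3]}, 42"#.to_vec());
    let json = read_json_value(&mut cur)?;
    assert_eq!(json, r#"{"a":1,"b":[2,3]}"#);
    // The cursor now points at the ',' following the JSON value.
    assert_eq!(cur.position() as usize, json.len());
    Ok(())
}
```

One caveat about the sketch: the advance is computed from `RawValue::get()`, which does not include any whitespace the deserializer skipped, so the input is assumed to start directly at the JSON value.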
6 changes: 4 additions & 2 deletions src/query/formats/src/field_encoder/csv.rs
@@ -93,7 +93,8 @@ impl FieldEncoderCSV {
binary_format: params.binary_format,
geometry_format: params.geometry_format,
},
quote_char: 0, // not used
escape_char: 0, // not used
quote_char: 0, // not used
},
string_formatter: StringFormatter::Csv {
quote_char: params.quote.as_bytes()[0],
@@ -116,7 +117,8 @@
binary_format: Default::default(),
geometry_format: Default::default(),
},
quote_char: 0, // not used
escape_char: 0, // not used
quote_char: 0, // not used
},
string_formatter: StringFormatter::Tsv {
record_delimiter: params.field_delimiter.as_bytes().to_vec()[0],
4 changes: 2 additions & 2 deletions src/query/formats/src/field_encoder/helpers/escape.rs
@@ -44,15 +44,15 @@ static ESCAPE: [u8; 256] = [
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
];

pub fn write_quoted_string(bytes: &[u8], buf: &mut Vec<u8>, quote: u8) {
pub fn write_quoted_string(bytes: &[u8], buf: &mut Vec<u8>, escape: u8, quote: u8) {
let mut start = 0;

for (i, &byte) in bytes.iter().enumerate() {
if byte == quote {
if start < i {
buf.extend_from_slice(&bytes[start..i]);
}
buf.push(quote);
buf.push(escape);
buf.push(quote);
start = i + 1;
}
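Below is a small self-contained sketch of the revised helper's behaviour, assuming the part of the function not shown in this hunk simply flushes the remaining bytes: a quote character found inside the payload is now prefixed with a caller-chosen escape byte, and passing `escape == quote` reproduces the old quote-doubling output.

```rust
// Sketch only: mirrors the signature change above, not the exact Databend code.
fn write_quoted_string(bytes: &[u8], buf: &mut Vec<u8>, escape: u8, quote: u8) {
    let mut start = 0;
    for (i, &byte) in bytes.iter().enumerate() {
        if byte == quote {
            if start < i {
                buf.extend_from_slice(&bytes[start..i]);
            }
            buf.push(escape); // previously this pushed `quote`, i.e. doubled it
            buf.push(quote);
            start = i + 1;
        }
    }
    // Assumed tail flush (not visible in the hunk above).
    if start < bytes.len() {
        buf.extend_from_slice(&bytes[start..]);
    }
}

fn main() {
    let mut out = Vec::new();
    write_quoted_string(br#"say "hi""#, &mut out, b'\\', b'"');
    assert_eq!(out, br#"say \"hi\""#.to_vec());

    // escape == quote falls back to the old doubling behaviour.
    let mut doubled = Vec::new();
    write_quoted_string(br#"say "hi""#, &mut doubled, b'"', b'"');
    assert_eq!(doubled, br#"say ""hi"""#.to_vec());
}
```

The extra parameter is what lets the VALUES encoder below emit backslash-escaped, double-quoted strings while keeping the old doubling behaviour reachable for callers that still want it.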
1 change: 1 addition & 0 deletions src/query/formats/src/field_encoder/json.rs
@@ -50,6 +50,7 @@ impl FieldEncoderJSON {
binary_format: Default::default(),
geometry_format: Default::default(),
},
escape_char: 0,
quote_char: 0,
},
quote_denormals: false,
43 changes: 28 additions & 15 deletions src/query/formats/src/field_encoder/values.rs
@@ -61,6 +61,7 @@ use crate::field_encoder::helpers::write_quoted_string;

pub struct FieldEncoderValues {
pub common_settings: OutputCommonSettings,
pub escape_char: u8,
pub quote_char: u8,
}

@@ -78,7 +79,8 @@ impl FieldEncoderValues {
binary_format: Default::default(),
geometry_format: Default::default(),
},
quote_char: b'\'',
escape_char: b'"',
quote_char: b'"',
}
}

@@ -99,7 +101,8 @@
binary_format: Default::default(),
geometry_format,
},
quote_char: b'\'',
escape_char: b'\\',
quote_char: b'"',
}
}

@@ -124,7 +127,8 @@
binary_format: Default::default(),
geometry_format,
},
quote_char: b'\'',
escape_char: b'\\',
quote_char: b'"',
}
}

@@ -163,7 +167,7 @@
Column::Timestamp(c) => self.write_timestamp(c, row_index, out_buf, in_nested),
Column::TimestampTz(c) => self.write_timestamp_tz(c, row_index, out_buf, in_nested),
Column::Bitmap(b) => self.write_bitmap(b, row_index, out_buf, in_nested),
Column::Variant(c) => self.write_variant(c, row_index, out_buf, in_nested),
Column::Variant(c) => self.write_variant(c, row_index, out_buf),
Column::Geometry(c) => self.write_geometry(c, row_index, out_buf, in_nested),
Column::Geography(c) => self.write_geography(c, row_index, out_buf, in_nested),

@@ -186,12 +190,13 @@
// so we do not expect the scalar literal to be used in sql.
// it is better to keep it simple: minimal escape.
// it makes the result easier to decode for csv, tsv and http handler results.
write_quoted_string(in_buf, out_buf, self.quote_char);
write_quoted_string(in_buf, out_buf, self.escape_char, self.quote_char);
out_buf.push(self.quote_char);
} else {
out_buf.extend_from_slice(in_buf);
}
}

fn write_bool(&self, column: &Bitmap, row_index: usize, out_buf: &mut Vec<u8>) {
let v = if column.get_bit(row_index) {
&self.common_settings().true_bytes
@@ -328,16 +333,10 @@
self.write_string_inner(bitmap_result, out_buf, in_nested);
}

fn write_variant(
&self,
column: &BinaryColumn,
row_index: usize,
out_buf: &mut Vec<u8>,
in_nested: bool,
) {
fn write_variant(&self, column: &BinaryColumn, row_index: usize, out_buf: &mut Vec<u8>) {
let v = unsafe { column.index_unchecked(row_index) };
let s = RawJsonb::new(v).to_string();
self.write_string_inner(s.as_bytes(), out_buf, in_nested);
out_buf.extend_from_slice(s.as_bytes());
}

fn write_geometry(
@@ -360,7 +359,14 @@
})
.unwrap_or_else(|_| v.to_vec());

self.write_string_inner(&s, out_buf, in_nested);
match self.common_settings().geometry_format {
GeometryDataType::GEOJSON => {
out_buf.extend_from_slice(&s);
}
_ => {
self.write_string_inner(&s, out_buf, in_nested);
}
}
}

fn write_geography(
Expand All @@ -383,7 +389,14 @@ impl FieldEncoderValues {
})
.unwrap_or_else(|_| v.0.to_vec());

self.write_string_inner(&s, out_buf, in_nested);
match self.common_settings().geometry_format {
GeometryDataType::GEOJSON => {
out_buf.extend_from_slice(&s);
}
_ => {
self.write_string_inner(&s, out_buf, in_nested);
}
}
}

fn write_array(&self, column: &ArrayColumn<AnyType>, row_index: usize, out_buf: &mut Vec<u8>) {
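The geometry and geography branches above now switch on the configured output format: GeoJSON text is already valid JSON and is written out unquoted, while the other formats still go through the usual quoted-string path; `write_variant` follows the same reasoning and always emits the raw JSON text. A rough self-contained sketch of that dispatch, with stand-in names rather than the real Databend types and with escaping elided:

```rust
// Illustrative stand-ins for GeometryDataType / write_string_inner.
enum GeometryOutput {
    GeoJson,
    Ewkt,
}

fn write_geometry_text(format: &GeometryOutput, s: &str, out_buf: &mut Vec<u8>, quote: u8) {
    match format {
        // Already JSON: writing it through untouched keeps nested VALUES/JSON
        // output parseable without double-encoding.
        GeometryOutput::GeoJson => out_buf.extend_from_slice(s.as_bytes()),
        // Text formats stay quoted like any other string field.
        GeometryOutput::Ewkt => {
            out_buf.push(quote);
            out_buf.extend_from_slice(s.as_bytes());
            out_buf.push(quote);
        }
    }
}

fn main() {
    let mut a = Vec::new();
    write_geometry_text(
        &GeometryOutput::GeoJson,
        r#"{"type":"Point","coordinates":[1.0,2.0]}"#,
        &mut a,
        b'"',
    );
    assert_eq!(a, br#"{"type":"Point","coordinates":[1.0,2.0]}"#.to_vec());

    let mut b = Vec::new();
    write_geometry_text(&GeometryOutput::Ewkt, "SRID=4326;POINT(1 2)", &mut b, b'"');
    assert_eq!(b, br#""SRID=4326;POINT(1 2)""#.to_vec());
}
```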
3 changes: 2 additions & 1 deletion src/tests/sqlsmith/src/sql_gen/dml.rs
@@ -579,7 +579,8 @@ impl<'a, R: Rng + 'a> SqlGenerator<'a, R> {
binary_format: Default::default(),
geometry_format: Default::default(),
},
quote_char: b'\'',
escape_char: b'\\',
quote_char: b'"',
};

for i in 0..row_count {
2 changes: 1 addition & 1 deletion tests/nox/noxfile.py
@@ -2,7 +2,7 @@
import os


PYTHON_DRIVER = ["0.28.1", "0.28.2"]
PYTHON_DRIVER = ["0.33.1"]


@nox.session
@@ -280,9 +280,9 @@ DROP TABLE t_str
query TTT rowsort
SELECT ( null, to_hour(to_timestamp(3501857592331)), number::Date) from numbers(3) group by all
----
(NULL,18,'1970-01-01')
(NULL,18,'1970-01-02')
(NULL,18,'1970-01-03')
(NULL,18,"1970-01-01")
(NULL,18,"1970-01-02")
(NULL,18,"1970-01-03")

query TTT rowsort
SELECT TRY_CAST('1900-12-30 12:00:00' AS TIMESTAMP) AS "TEMP(Test)(4058757556)(0)",
@@ -263,8 +263,8 @@ INSERT INTO t12 (id, arr) VALUES(3, ['1000000-01-01', '2000000-01-01'])
query IT
select * from t12
----
1 ['2021-01-01','2022-01-01']
2 ['1990-12-01','2030-01-12']
1 ["2021-01-01","2022-01-01"]
2 ["1990-12-01","2030-01-12"]

query TT
select arr[1], arr[2] from t12
@@ -287,8 +287,8 @@ INSERT INTO t13 (id, arr) VALUES(3, ['1000000-01-01 01:01:01', '2000000-01-01 01
query IT
select * from t13
----
1 ['2021-01-01 01:01:01.000000','2022-01-01 01:01:01.000000']
2 ['1990-12-01 10:11:12.000000','2030-01-12 22:00:00.000000']
1 ["2021-01-01 01:01:01.000000","2022-01-01 01:01:01.000000"]
2 ["1990-12-01 10:11:12.000000","2030-01-12 22:00:00.000000"]

query TT
select arr[1], arr[2] from t13
@@ -308,8 +308,8 @@ INSERT INTO t14 (id, arr) VALUES(1, ['aa', 'bb']), (2, ['cc', 'dd'])
query IT
select * from t14
----
1 ['aa','bb']
2 ['cc','dd']
1 ["aa","bb"]
2 ["cc","dd"]

query TT
select arr[1], arr[2] from t14
@@ -329,9 +329,9 @@ INSERT INTO t15 (id, arr) VALUES(1, ['aa', 'bb']), (2, ['cc', 'dd']), (3, [12, 3
query IT
select * from t15
----
1 ['aa','bb']
2 ['cc','dd']
3 ['12','34']
1 ["aa","bb"]
2 ["cc","dd"]
3 ["12","34"]

query TT
select arr[1], arr[2] from t15
@@ -380,10 +380,10 @@ INSERT INTO t17 (id, arr) VALUES(1, ['aa', 'bb']), (2, [null, 'cc']), (3, ['dd',
query IT
select * from t17
----
1 ['aa','bb']
2 [NULL,'cc']
3 ['dd',NULL]
4 ['ee','ff']
1 ["aa","bb"]
2 [NULL,"cc"]
3 ["dd",NULL]
4 ["ee","ff"]

query TT
select arr[1], arr[2] from t17
@@ -16,8 +16,8 @@ INSERT INTO t1 (id, t) VALUES(1, (true, 100, 12.34, 'abc', '2020-01-01', '2020-0
query IT
select * from t1
----
1 (1,100,12.34,'abc','2020-01-01','2020-01-01 00:00:00.000000')
2 (0,200,-25.73,'xyz','2022-06-01','2022-06-01 12:00:00.000000')
1 (1,100,12.34,"abc","2020-01-01","2020-01-01 00:00:00.000000")
2 (0,200,-25.73,"xyz","2022-06-01","2022-06-01 12:00:00.000000")

query BIFTTT
select t.1, t.2, t.3, t.4, t.5, t.6 from t1
@@ -46,8 +46,8 @@ INSERT INTO t2 (id, t) VALUES(1, (true, 10, 0.5, 'x', '2021-05-01', '2021-05-01
query IT
select * from t2
----
1 (1,10,0.5,'x','2021-05-01','2021-05-01 00:00:00.000000')
2 (0,-10,-0.9,'y','2022-10-01','2022-10-01 12:00:00.000000')
1 (1,10,0.5,"x","2021-05-01","2021-05-01 00:00:00.000000")
2 (0,-10,-0.9,"y","2022-10-01","2022-10-01 12:00:00.000000")

query BIFTTT
select t:a, t:b, t:c, t:d, t:e, t:f from t2