@@ -21,9 +21,8 @@ use arrow::array::types::Int32Type;
21
21
use arrow:: array:: { make_array, Array , AsArray , DictionaryArray } ;
22
22
use arrow:: array:: { ArrayRef , OffsetSizeTrait } ;
23
23
use arrow:: datatypes:: DataType ;
24
- use datafusion:: common:: { cast:: as_generic_string_array, DataFusionError , HashMap , ScalarValue } ;
24
+ use datafusion:: common:: { cast:: as_generic_string_array, DataFusionError , ScalarValue } ;
25
25
use datafusion:: physical_plan:: ColumnarValue ;
26
- use std:: fmt:: Write ;
27
26
use std:: sync:: Arc ;
28
27
29
28
/// Similar to DataFusion `rpad`, but does not truncate when the string is already longer than `length`
@@ -115,53 +114,26 @@ fn spark_read_side_padding_internal<T: OffsetSizeTrait>(
115
114
match rpad_argument {
116
115
RPadArgument :: ColArray ( array_int) => {
117
116
let int_pad_array = array_int. as_primitive :: < Int32Type > ( ) ;
118
- let mut str_pad_value_map = HashMap :: new ( ) ;
119
- for i in 0 ..string_array. len ( ) {
120
- if string_array. is_null ( i) || int_pad_array. is_null ( i) {
121
- continue ; // skip nulls
122
- }
123
- str_pad_value_map. insert ( string_array. value ( i) , int_pad_array. value ( i) ) ;
124
- }
125
117
126
118
let mut builder = GenericStringBuilder :: < T > :: with_capacity (
127
- str_pad_value_map . len ( ) ,
128
- str_pad_value_map . len ( ) * int_pad_array. len ( ) ,
119
+ string_array . len ( ) ,
120
+ string_array . len ( ) * int_pad_array. len ( ) ,
129
121
) ;
130
122
131
- for string in string_array. iter ( ) {
123
+ for ( string, length ) in string_array. iter ( ) . zip ( int_pad_array ) {
132
124
match string {
133
- Some ( string) => {
134
- // It looks Spark's UTF8String is closer to chars rather than graphemes
135
- // https://stackoverflow.com/a/46290728
136
- let char_len = string. chars ( ) . count ( ) ;
137
- let length: usize = 0 . max ( * str_pad_value_map. get ( string) . unwrap ( ) ) as usize ;
138
- let space_string = " " . repeat ( length) ;
139
- if length <= char_len {
140
- if truncate {
141
- let idx = string
142
- . char_indices ( )
143
- . nth ( length)
144
- . map ( |( i, _) | i)
145
- . unwrap_or ( string. len ( ) ) ;
146
- builder. append_value ( & string[ ..idx] ) ;
147
- } else {
148
- builder. append_value ( string) ;
149
- }
150
- } else {
151
- // write_str updates only the value buffer, not null nor offset buffer
152
- // This is convenient for concatenating str(s)
153
- builder. write_str ( string) ?;
154
- builder. append_value ( & space_string[ char_len..] ) ;
155
- }
156
- }
125
+ Some ( string) => builder. append_value ( add_padding_string (
126
+ string. parse ( ) . unwrap ( ) ,
127
+ length. unwrap ( ) as usize ,
128
+ truncate,
129
+ ) ) ,
157
130
_ => builder. append_null ( ) ,
158
131
}
159
132
}
160
133
Ok ( ColumnarValue :: Array ( Arc :: new ( builder. finish ( ) ) ) )
161
134
}
162
135
RPadArgument :: ConstLength ( length) => {
163
136
let length = 0 . max ( length) as usize ;
164
- let space_string = " " . repeat ( length) ;
165
137
166
138
let mut builder = GenericStringBuilder :: < T > :: with_capacity (
167
139
string_array. len ( ) ,
@@ -170,32 +142,36 @@ fn spark_read_side_padding_internal<T: OffsetSizeTrait>(
170
142
171
143
for string in string_array. iter ( ) {
172
144
match string {
173
- Some ( string) => {
174
- // It looks Spark's UTF8String is closer to chars rather than graphemes
175
- // https://stackoverflow.com/a/46290728
176
- let char_len = string. chars ( ) . count ( ) ;
177
- if length <= char_len {
178
- if truncate {
179
- let idx = string
180
- . char_indices ( )
181
- . nth ( length)
182
- . map ( |( i, _) | i)
183
- . unwrap_or ( string. len ( ) ) ;
184
- builder. append_value ( & string[ ..idx] ) ;
185
- } else {
186
- builder. append_value ( string) ;
187
- }
188
- } else {
189
- // write_str updates only the value buffer, not null nor offset buffer
190
- // This is convenient for concatenating str(s)
191
- builder. write_str ( string) ?;
192
- builder. append_value ( & space_string[ char_len..] ) ;
193
- }
194
- }
145
+ Some ( string) => builder. append_value ( add_padding_string (
146
+ string. parse ( ) . unwrap ( ) ,
147
+ length,
148
+ truncate,
149
+ ) ) ,
195
150
_ => builder. append_null ( ) ,
196
151
}
197
152
}
198
153
Ok ( ColumnarValue :: Array ( Arc :: new ( builder. finish ( ) ) ) )
199
154
}
200
155
}
201
156
}
157
+
158
/// Pads or truncates `string` so that it is `length` characters long.
///
/// Lengths are measured in Unicode scalar values (`char`s), not bytes or
/// graphemes: it looks like Spark's UTF8String is closer to chars than to
/// graphemes (https://stackoverflow.com/a/46290728).
///
/// * If `string` has fewer than `length` chars, ASCII spaces are appended
///   until it has exactly `length` chars.
/// * Otherwise, if `truncate` is `true`, only the first `length` chars are kept.
/// * Otherwise `string` is returned unchanged.
///
/// The input buffer is reused, so no allocation happens on the truncate and
/// pass-through paths, and only the missing padding is allocated when padding.
fn add_padding_string(mut string: String, length: usize, truncate: bool) -> String {
    let char_len = string.chars().count();
    if length > char_len {
        // Only build the spaces that are actually missing.
        string.push_str(&" ".repeat(length - char_len));
    } else if truncate {
        // Byte offset of the first char to drop; `None` means the string is
        // already exactly `length` chars long and nothing needs cutting.
        if let Some((idx, _)) = string.char_indices().nth(length) {
            string.truncate(idx);
        }
    }
    string
}
0 commit comments