Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions internal/pkg/pipeline/task/converter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ Convert a single line to the SSTable which could be stored on s3 or via file. It
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `sheets` | array | all sheets | Optional array of sheet names to process. If not specified, all sheets are processed |
| `skip_rows` | int | `0` | Number of rows to skip from the beginning of each sheet (e.g., for header rows) |
| `skip_rows_by_sheet` | map[string]int | - | Per-sheet overrides for row skipping. Keys are sheet names; values are the number of rows to skip on that sheet. An entry here takes precedence over `skip_rows`, including an explicit `0` |

**Important:** The XLSX converter emits **one record per sheet**. Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`.

Expand Down Expand Up @@ -113,6 +115,21 @@ tasks:
# Each record will have xlsx_sheet_name in context
```

### Excel to CSV with row skipping:
```yaml
tasks:
  - name: read_excel
    type: file
    path: report.xlsx
  - name: convert_excel
    type: converter
    format: xlsx
    skip_rows: 1        # Skip header row on all sheets
    skip_rows_by_sheet:
      Summary: 3        # Skip 3 rows on Summary sheet (overrides skip_rows)
      RawData: 0        # Don't skip any rows on RawData sheet
```

## Sample Pipelines

- `test/pipelines/convert_file.yaml` - File format conversion
Expand Down
24 changes: 21 additions & 3 deletions internal/pkg/pipeline/task/converter/xlsx.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ const (
)

// xlsx holds the configuration options of the XLSX converter.
type xlsx struct {
// Sheets is an optional list of sheet names to process.
Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"`
Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"`
// SkipRows is the number of rows to skip from the beginning of each sheet
// (e.g. header rows).
SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"`
// SkipRowsBySheet maps a sheet name to the number of rows to skip on that
// sheet; an entry here takes precedence over SkipRows.
SkipRowsBySheet map[string]int `yaml:"skip_rows_by_sheet,omitempty" json:"skip_rows_by_sheet,omitempty"`
}

func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
Expand All @@ -37,7 +39,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
outputs := make([]converterOutput, 0, len(sheets))

for _, sheet := range sheets {
output, err := readSheet(reader, sheet)
output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet))
if err != nil {
return nil, err
}
Expand All @@ -48,7 +50,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
return outputs, nil
}

func readSheet(reader *excelize.File, sheet string) (converterOutput, error) {
func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOutput, error) {
// Create buffer for this sheet
var buff bytes.Buffer
writer := csvEncoder.NewWriter(&buff)
Expand All @@ -61,7 +63,13 @@ func readSheet(reader *excelize.File, sheet string) (converterOutput, error) {
defer rows.Close()

// Write rows to buffer
i := 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use a more descriptive name here, e.g. `rowIndex`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's idiomatic in Go to use short names for variables with a small scope; a single-letter variable is common practice for an index.

for rows.Next() {
if i < rowsToSkip {
i++
continue
}

cols, err := rows.Columns()
if err != nil {
return converterOutput{}, err
Expand All @@ -82,3 +90,13 @@ func readSheet(reader *excelize.File, sheet string) (converterOutput, error) {
},
}, nil
}

// getRowsToSkip returns the number of leading rows to skip for the given sheet.
//
// An entry for sheet in SkipRowsBySheet takes precedence, including an
// explicit 0; when no entry exists, the global SkipRows value is returned.
func (x *xlsx) getRowsToSkip(sheet string) int {
	// Indexing a nil map is safe in Go and reports found == false, so no
	// separate nil guard is required before the lookup.
	if n, found := x.SkipRowsBySheet[sheet]; found {
		return n
	}

	return x.SkipRows
}
1 change: 1 addition & 0 deletions test/pipelines/converter/convert_xls.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ tasks:
type: converter
format: xlsx
sheets: ["Sheet1"]
skip_rows: 2
- name: write_csv
type: file
path: test-result.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
tasks:
  - name: read
    type: file
    path: ./test/pipelines/random_data.xlsx
  - name: convert_from_xls
    type: converter
    format: xlsx
    skip_rows_by_sheet:
      Sheet1: 1
      Sheet2: 2
      Sheet3: 3
  - name: write_csv
    type: file
    path: test-result_{{ macro "uuid" }}.csv
Loading