diff --git a/internal/pkg/pipeline/task/converter/README.md b/internal/pkg/pipeline/task/converter/README.md index 0a0cefe..57ff96d 100644 --- a/internal/pkg/pipeline/task/converter/README.md +++ b/internal/pkg/pipeline/task/converter/README.md @@ -47,6 +47,8 @@ Convert a single line to the SSTable which could be stored on s3 or via file. It | Field | Type | Default | Description | |-------|------|---------|-------------| | `sheets` | array | all sheets | Optional array of sheet names to process. If not specified, all sheets are processed | +| `skip_rows` | int | `0` | Number of rows to skip from the beginning of each sheet (e.g., for header rows). Negative values are treated as `0` | +| `skip_rows_by_sheet` | map[string]int | - | Per-sheet overrides for `skip_rows`. Keys are sheet names, values are the number of rows to skip on that sheet. For sheets listed here, the value takes precedence over `skip_rows`; negative values are treated as `0` | **Important:** The XLSX converter emits **one record per sheet**. Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`. 
@@ -113,6 +115,21 @@ tasks: # Each record will have xlsx_sheet_name in context ``` +### Excel to CSV with row skipping: +```yaml +tasks: + - name: read_excel + type: file + path: report.xlsx + - name: convert_excel + type: converter + format: xlsx + skip_rows: 1 # Skip header row on all sheets + skip_rows_by_sheet: + Summary: 3 # Skip 3 rows on Summary sheet (overrides skip_rows) + RawData: 0 # Don't skip any rows on RawData sheet +``` + ## Sample Pipelines - `test/pipelines/convert_file.yaml` - File format conversion diff --git a/internal/pkg/pipeline/task/converter/xlsx.go b/internal/pkg/pipeline/task/converter/xlsx.go index f641478..bd372f1 100644 --- a/internal/pkg/pipeline/task/converter/xlsx.go +++ b/internal/pkg/pipeline/task/converter/xlsx.go @@ -13,7 +13,9 @@ const ( ) type xlsx struct { - Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"` + Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"` + SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"` + SkipRowsBySheet map[string]int `yaml:"skip_rows_by_sheet,omitempty" json:"skip_rows_by_sheet,omitempty"` } func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) { @@ -37,7 +39,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) { outputs := make([]converterOutput, 0, len(sheets)) for _, sheet := range sheets { - output, err := readSheet(reader, sheet) + output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet)) if err != nil { return nil, err } @@ -48,7 +50,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) { return outputs, nil } -func readSheet(reader *excelize.File, sheet string) (converterOutput, error) { +func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOutput, error) { // Create buffer for this sheet var buff bytes.Buffer writer := csvEncoder.NewWriter(&buff) @@ -61,7 +63,13 @@ func readSheet(reader *excelize.File, sheet string) (converterOutput, error) { 
defer rows.Close() // Write rows to buffer + i := 0 for rows.Next() { + if i < rowsToSkip { + i++ + continue + } + cols, err := rows.Columns() if err != nil { return converterOutput{}, err @@ -82,3 +90,18 @@ func readSheet(reader *excelize.File, sheet string) (converterOutput, error) { }, }, nil } + +func (x *xlsx) getRowsToSkip(sheet string) int { + rowsToSkip := x.SkipRows + if x.SkipRowsBySheet != nil { + if val, found := x.SkipRowsBySheet[sheet]; found { + rowsToSkip = val + } + } + + if rowsToSkip < 0 { + rowsToSkip = 0 + } + + return rowsToSkip +} diff --git a/test/pipelines/converter/convert_xls.yaml b/test/pipelines/converter/convert_xls.yaml index 48a1949..90b44de 100644 --- a/test/pipelines/converter/convert_xls.yaml +++ b/test/pipelines/converter/convert_xls.yaml @@ -6,6 +6,7 @@ tasks: type: converter format: xlsx sheets: ["Sheet1"] + skip_rows: 2 - name: write_csv type: file path: test-result.csv diff --git a/test/pipelines/converter/convert_xls_with_skip_rows_by_sheets.yaml b/test/pipelines/converter/convert_xls_with_skip_rows_by_sheets.yaml new file mode 100644 index 0000000..7108623 --- /dev/null +++ b/test/pipelines/converter/convert_xls_with_skip_rows_by_sheets.yaml @@ -0,0 +1,14 @@ +tasks: + - name: read + type: file + path: ./test/pipelines/random_data.xlsx + - name: convert_from_xls + type: converter + format: xlsx + skip_rows_by_sheet: + Sheet1: 1 + Sheet2: 2 + Sheet3: 3 + - name: write_csv + type: file + path: test-result_{{ macro "uuid" }}.csv