-
Notifications
You must be signed in to change notification settings - Fork 2
Support for Archives (zip, tar) #34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
75badd8
ff59ec8
0a6eeca
a52ae4f
e735358
1e654d2
073193a
9e0cf2b
7a94408
7c136db
8496ea5
297abdb
c3a74fa
829ac05
018786a
309c1dc
5b8246a
972476b
f3d603c
d6d4331
8430e13
782ab23
e2370c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| # Archive Task | ||
|
|
||
| The `archive` task packages or extracts files using various archive formats, enabling efficient file bundling and extraction for data processing pipelines. | ||
|
|
||
| ## Function | ||
|
|
||
| The archive task can operate in two modes: | ||
| - **Pack mode**: Combines multiple files into a single archive | ||
| - **Unpack mode**: Extracts files from an archive | ||
|
|
||
| ## Behavior | ||
|
|
||
| The archive task processes data based on the `action` field: | ||
| - **Pack**: Receives individual files and creates an archive file containing them | ||
| - **Unpack**: Receives an archive file and extracts its contents, outputting each file individually | ||
|
|
||
| The task receives records from its input channel, applies the archiving operation, and sends the processed records to its output channel. | ||
|
|
||
| ## Configuration Fields | ||
|
|
||
| | Field | Type | Default | Description | | ||
| |-------|------|---------|-------------| | ||
| | `name` | string | - | Task name for identification | | ||
| | `type` | string | `archive` | Must be "archive" | | ||
| | `format` | string | `zip` | Archive format (zip, tar) | | ||
| | `action` | string | `pack` | Action type (pack or unpack) | | ||
|
|
||
| ## Supported Formats | ||
|
|
||
| The task supports the following archive formats: | ||
| - **zip**: Standard ZIP format, widely compatible | ||
| - **tar**: TAR format, commonly used in Unix/Linux environments | ||
|
|
||
| ## Example Configurations | ||
|
|
||
| ### Pack files into a ZIP archive: | ||
| ```yaml | ||
| tasks: | ||
| - name: create_zip | ||
| type: archive | ||
| format: zip | ||
| action: pack | ||
| ``` | ||
|
|
||
| ### Unpack a ZIP archive: | ||
| ```yaml | ||
| tasks: | ||
| - name: extract_zip | ||
| type: archive | ||
| format: zip | ||
| action: unpack | ||
| ``` | ||
|
|
||
| ### Pack files into a TAR archive: | ||
| ```yaml | ||
| tasks: | ||
| - name: create_tar | ||
| type: archive | ||
| format: tar | ||
| action: pack | ||
| ``` | ||
|
|
||
| ### Unpack a TAR archive: | ||
| ```yaml | ||
| tasks: | ||
| - name: extract_tar | ||
| type: archive | ||
| format: tar | ||
| action: unpack | ||
| ``` | ||
|
|
||
| ## Sample Pipelines | ||
|
|
||
| - `test/pipelines/zip_pack_test.yaml` - ZIP packing example | ||
| - `test/pipelines/zip_unpack_test.yaml` - ZIP unpacking example | ||
| - `test/pipelines/tar_unpack_multifile_test.yaml` - TAR unpacking with multiple files | ||
|
|
||
| ## Use Cases | ||
|
|
||
| - **File bundling**: Package multiple files into a single archive for distribution | ||
| - **Data consolidation**: Combine separate data files into archives for storage | ||
| - **Archive extraction**: Extract files from archives for processing | ||
| - **Backup operations**: Create archives of processed data for backup | ||
| - **Format conversion**: Convert between archive formats | ||
| - **Multi-file handling**: Process multiple files as a single archive unit |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| package archive | ||
|
|
||
| import ( | ||
| "fmt" | ||
|
|
||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/record" | ||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/task" | ||
| ) | ||
|
|
||
// actionType enumerates the operations the archive task can perform.
type actionType string

const (
	actionPack   actionType = `pack`   // bundle incoming file records into one archive
	actionUnpack actionType = `unpack` // expand an archive record into individual files
)

const (
	defaultFormat = `zip`  // archive format used when none is configured
	defaultAction = `pack` // action used when none is configured
)
|
|
||
| type archiver interface { | ||
| Read() | ||
| Write() | ||
| } | ||
|
|
||
| type core struct { | ||
| task.Base `yaml:",inline" json:",inline"` | ||
| Format string `yaml:"format,omitempty" json:"format,omitempty"` | ||
| Action actionType `yaml:"action,omitempty" json:"action,omitempty"` | ||
| } | ||
|
|
||
| func New() (task.Task, error) { | ||
| return &core{ | ||
| Format: defaultFormat, | ||
| Action: defaultAction, | ||
| }, nil | ||
| } | ||
|
|
||
| func (c *core) UnmarshalYAML(unmarshal func(interface{}) error) error { | ||
| type raw core | ||
| obj := raw{ | ||
| Format: defaultFormat, | ||
| Action: defaultAction, | ||
| } | ||
| if err := unmarshal(&obj); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if obj.Action != actionPack && obj.Action != actionUnpack { | ||
| return fmt.Errorf("invalid action: %s (must be 'pack' or 'unpack')", obj.Action) | ||
| } | ||
|
|
||
| *c = core(obj) | ||
|
|
||
| return nil | ||
| } | ||
|
Comment on lines
+67
to
+84
|
||
|
|
||
| func (c *core) Run(input <-chan *record.Record, output chan<- *record.Record) (err error) { | ||
|
|
||
| if input == nil { | ||
| return task.ErrNilInput | ||
| } | ||
|
|
||
| var archiv archiver | ||
|
|
||
| switch c.Format { | ||
ma-gk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| case "tar": | ||
| archiv = &tarArchive{ | ||
| Base: &c.Base, | ||
| OutputChan: output, | ||
| InputChan: input, | ||
| } | ||
| case "zip": | ||
| archiv = &zipArchive{ | ||
| Base: &c.Base, | ||
| OutputChan: output, | ||
| InputChan: input, | ||
| } | ||
| default: | ||
| return fmt.Errorf("unsupported format: %s", c.Format) | ||
| } | ||
|
|
||
| switch c.Action { | ||
| case actionPack: | ||
| archiv.Write() | ||
| case actionUnpack: | ||
| archiv.Read() | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
Comment on lines
+1
to
+103
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| package archive | ||
|
|
||
| import ( | ||
| "archive/tar" | ||
| "bytes" | ||
| "io" | ||
| "log" | ||
| "strings" | ||
|
|
||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/record" | ||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/task" | ||
| ) | ||
|
|
||
| type tarArchive struct { | ||
| *task.Base | ||
| OutputChan chan<- *record.Record | ||
| InputChan <-chan *record.Record | ||
| } | ||
|
|
||
| func (t *tarArchive) Read() { | ||
|
|
||
| for { | ||
| rc, ok := t.GetRecord(t.InputChan) | ||
| if !ok { | ||
| break | ||
| } | ||
|
|
||
| if len(rc.Data) == 0 { | ||
| continue | ||
| } | ||
|
|
||
| b := rc.Data | ||
|
|
||
| r := tar.NewReader(bytes.NewReader(b)) | ||
|
|
||
| for { | ||
| header, err := r.Next() | ||
| if err == io.EOF { | ||
| break | ||
| } | ||
| if err != nil { | ||
| log.Fatal(err) | ||
|
||
| } | ||
|
|
||
| // check the file type is regular file | ||
| if header.Typeflag == tar.TypeReg { | ||
| buf := make([]byte, header.Size) | ||
| if _, err := io.ReadFull(r, buf); err != nil && err != io.EOF { | ||
| log.Fatal(err) | ||
| } | ||
| rc.SetContextValue("CATERPILLER_FILE_PATH_READ", header.Name) | ||
ma-gk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| t.SendData(rc.Context, buf, t.OutputChan) | ||
| } | ||
|
|
||
| } | ||
| } | ||
| } | ||
|
|
||
| func (t *tarArchive) Write() { | ||
|
|
||
| var buf bytes.Buffer | ||
| tw := tar.NewWriter(&buf) | ||
| var rc record.Record | ||
|
|
||
| for { | ||
| rec, ok := t.GetRecord(t.InputChan) | ||
| if !ok { | ||
| break | ||
| } | ||
| b := rec.Data | ||
|
|
||
| if len(b) == 0 { | ||
| continue | ||
| } | ||
|
|
||
| filePath, found := rec.GetContextValue("CATERPILLER_FILE_PATH") | ||
| if !found { | ||
| log.Fatal("filepath not set in context") | ||
| } | ||
|
|
||
| if filePath == "" { | ||
| log.Fatal("file_name is required when filepath is not in context") | ||
| } | ||
|
|
||
| filePath = strings.ReplaceAll(filePath, "\\", "/") | ||
|
|
||
| header := &tar.Header{ | ||
| Name: filePath, | ||
| Mode: 0600, | ||
| Size: int64(len(b)), | ||
| } | ||
| if err := tw.WriteHeader(header); err != nil { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| if _, err := tw.Write(b); err != nil { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| rc.Context = rec.Context | ||
| } | ||
|
|
||
| if err := tw.Close(); err != nil { | ||
| log.Fatal(err) | ||
ma-gk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| t.SendData(rc.Context, buf.Bytes(), t.OutputChan) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| package archive | ||
|
|
||
| import ( | ||
| "archive/zip" | ||
| "bytes" | ||
| "io" | ||
| "log" | ||
| "strings" | ||
|
|
||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/record" | ||
| "github.com/patterninc/caterpillar/internal/pkg/pipeline/task" | ||
| ) | ||
|
|
||
| type zipArchive struct { | ||
| *task.Base | ||
| OutputChan chan<- *record.Record | ||
| InputChan <-chan *record.Record | ||
| } | ||
|
|
||
| func (z *zipArchive) Read() { | ||
| for { | ||
| rc, ok := z.GetRecord(z.InputChan) | ||
| if !ok { | ||
| break | ||
| } | ||
|
|
||
| if len(rc.Data) == 0 { | ||
| continue | ||
| } | ||
|
|
||
| b := rc.Data | ||
|
|
||
| r, err := zip.NewReader(bytes.NewReader(b), int64(len(b))) | ||
| if err != nil { | ||
| log.Fatal(err) | ||
| } | ||
| for _, f := range r.File { | ||
|
|
||
| // check the file type is regular file | ||
| if f.FileInfo().Mode().IsRegular() { | ||
|
|
||
| rc.SetContextValue("CATERPILLER_FILE_PATH_READ", f.Name) | ||
|
|
||
| fs, err := f.Open() | ||
| if err != nil { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| buf := make([]byte, f.FileInfo().Size()) | ||
|
|
||
| _, err = fs.Read(buf) | ||
| if err != nil && err != io.EOF { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| fs.Close() | ||
|
|
||
| z.SendData(rc.Context, buf, z.OutputChan) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (z *zipArchive) Write() { | ||
|
|
||
| zipBuf := new(bytes.Buffer) | ||
| zipWriter := zip.NewWriter(zipBuf) | ||
| var rc record.Record | ||
|
|
||
| for { | ||
| rec, ok := z.GetRecord(z.InputChan) | ||
| if !ok { | ||
| break | ||
| } | ||
|
|
||
| filePath, found := rec.GetContextValue("CATERPILLER_FILE_PATH") | ||
| if !found { | ||
| log.Fatal("filepath not set in context") | ||
| } | ||
|
|
||
| if filePath == "" { | ||
| log.Fatal("file_name is required when filepath is not in context") | ||
| } | ||
|
|
||
| filePath = strings.ReplaceAll(filePath, "\\", "/") | ||
|
|
||
| w, err := zipWriter.Create(filePath) | ||
| if err != nil { | ||
| log.Fatal(err) | ||
| } | ||
| _, err = w.Write(rec.Data) | ||
| if err != nil { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| rc.Context = rec.Context | ||
| } | ||
|
|
||
| if err := zipWriter.Close(); err != nil { | ||
| log.Fatal(err) | ||
| } | ||
|
|
||
| // Send the complete ZIP archive | ||
| z.SendData(rc.Context, zipBuf.Bytes(), z.OutputChan) | ||
|
|
||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The archiver interface methods Read and Write don't return errors, which prevents proper error handling. These methods should return error values so that errors can be propagated back through the Run method to the caller. Compare with the compress task where compress/decompress methods return errors.