Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions internal/pkg/pipeline/task/file/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,7 @@ func (f *file) readFile(output chan<- *record.Record) error {

for _, path := range paths {

readerCloser, err := reader.read(path)
if err != nil {
return err
}
defer readerCloser.Close()

content, err := io.ReadAll(readerCloser)
content, err := readFileContent(reader, path)
if err != nil {
return err
}
Expand All @@ -147,6 +141,23 @@ func (f *file) readFile(output chan<- *record.Record) error {

}

// readFileContent opens the file at path through the supplied reader,
// slurps its entire contents into memory, and guarantees the underlying
// handle is closed before returning. It exists as a standalone helper so
// the caller's per-path loop releases each handle immediately — a defer
// placed directly inside the loop would only fire when the whole loop's
// enclosing function returned.
func readFileContent(r reader, path string) ([]byte, error) {
	rc, err := r.read(path)
	if err != nil {
		return nil, err
	}
	defer rc.Close()

	data, err := io.ReadAll(rc)
	if err != nil {
		return nil, err
	}
	return data, nil
}

func (f *file) writeFile(input <-chan *record.Record) error {

for {
Expand Down
65 changes: 39 additions & 26 deletions internal/pkg/pipeline/task/http/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"io"
"net/http"
"strings"
"sync"
"time"

"github.com/patterninc/caterpillar/internal/pkg/config"
Expand All @@ -27,7 +28,11 @@ const (
)

var (
ctx = context.Background()
byteBufferPool = sync.Pool{
New: func() any {
return new(bytes.Buffer)
},
}
)

type oauth struct {
Expand Down Expand Up @@ -63,7 +68,7 @@ type httpCore struct {
}

type result struct {
Data string `json:"data"`
Data []byte `json:"data"`
Headers map[string][]string `json:"headers"`
}

Expand Down Expand Up @@ -184,7 +189,7 @@ func (h *httpCore) processItem(rc *record.Record, output chan<- *record.Record)
rc.SetMetaValue(contextKey, strings.Join(headerValues, "; "))
}

h.SendData(rc.Meta, []byte(result.Data), output)
h.SendData(rc.Meta, result.Data, output)
}

// if we do not have a way to define the next page, we bail...
Expand All @@ -199,13 +204,24 @@ func (h *httpCore) processItem(rc *record.Record, output chan<- *record.Record)
if err != nil {
return err
}
nextPageInput, err := json.Marshal(result)
if err != nil {

// Use pooled buffer for temporary JSON encoding
buf := byteBufferPool.Get().(*bytes.Buffer)
buf.Reset()
if err := json.NewEncoder(buf).Encode(result); err != nil {
byteBufferPool.Put(buf)
return err
}
nextPageInput := buf.Bytes()
// trim trailing newline added by encoder
if len(nextPageInput) > 0 && nextPageInput[len(nextPageInput)-1] == '\n' {
nextPageInput = nextPageInput[:len(nextPageInput)-1]
}

nextPageData, err := nextPage.Execute(nextPageInput, map[string]any{
`page_id`: pageID,
})
byteBufferPool.Put(buf) // Return buffer after use

if err != nil {
return err
Expand Down Expand Up @@ -256,6 +272,21 @@ func (h *httpCore) processItem(rc *record.Record, output chan<- *record.Record)

func (h *httpCore) call(endpoint string) (*result, error) {

// Create HTTP client once, reuse for all retries to enable connection pooling
client := &http.Client{
Timeout: time.Duration(h.Timeout),
}

// Do we use proxy for this one?
if h.Proxy != nil {
transport, err := h.Proxy.getTransport()
if err != nil {
fmt.Printf("error configuring proxy: %s\n", err)
return nil, err
}
client.Transport = transport
}

var lastErr error
for attempt := 1; attempt <= h.MaxRetries; attempt++ {

Expand Down Expand Up @@ -292,24 +323,6 @@ func (h *httpCore) call(endpoint string) (*result, error) {
}
}

// Create HTTP client with proxy configuration if specified
client := &http.Client{
Timeout: time.Duration(h.Timeout),
}

// Do we use proxy for this one?
if h.Proxy != nil {
transport, err := h.Proxy.getTransport()
if err != nil {
lastErr = err
if attempt < h.MaxRetries {
continue
}
break
}
client.Transport = transport
}

response, err := client.Do(request)
if err != nil {
lastErr = err
Expand All @@ -320,9 +333,9 @@ func (h *httpCore) call(endpoint string) (*result, error) {
break
}

defer response.Body.Close()

// Read body and close immediately
body, err := io.ReadAll(response.Body)
response.Body.Close()
if err != nil {
lastErr = err
if attempt < h.MaxRetries {
Expand All @@ -343,7 +356,7 @@ func (h *httpCore) call(endpoint string) (*result, error) {
}

return &result{
Data: string(body),
Data: body,
Headers: response.Header,
}, nil
}
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/pipeline/task/xpath/xpath.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ func (x *xpath) Run(ctx context.Context, input <-chan *record.Record, output cha
x.SendData(r.Meta, data, output)
}
}

// Release HTML data reference to allow GC of the large response body.
// The parsed html.Node tree holds pointers into r.Data, but after extraction
// we no longer need the original bytes.
r.Data = nil
}

return nil
Expand Down
Loading