Skip to content

Commit

Permalink
fix next page support
Browse files Browse the repository at this point in the history
  • Loading branch information
bjesus committed Sep 6, 2024
1 parent 5bc8666 commit 9a6db7c
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 56 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode
specs
personal
/erol
/erol
/aur
2 changes: 1 addition & 1 deletion common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package common

type Block struct {
Type string
Command string
Command interface{}
Queries []string
NextPage string
}
Expand Down
68 changes: 29 additions & 39 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ import (
"bufio"
"fmt"
"log"
"net/url"
"os"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/bjesus/pipet/common"
"github.com/bjesus/pipet/parsers"
"github.com/tidwall/gjson"
"github.com/google/shlex"
)

func ParseSpecFile(e *common.PipetApp, filename string) error {
Expand Down Expand Up @@ -48,7 +48,7 @@ func ParseSpecFile(e *common.PipetApp, filename string) error {
} else {
if strings.HasPrefix(line, "> ") {

currentBlock.NextPage = strings.TrimPrefix(line, "> ")
currentBlock.NextPage = strings.TrimPrefix(line, ">")
} else {
currentBlock.Queries = append(currentBlock.Queries, line)
}
Expand All @@ -67,10 +67,11 @@ func ExecuteBlocks(e *common.PipetApp) error {
for _, block := range e.Blocks {
var data interface{}
var err error
var nextPageURL string

for page := 0; page < e.MaxPages; page++ {
if block.Type == "curl" {
data, err = parsers.ExecuteCurlBlock(block)
data, nextPageURL, err = parsers.ExecuteCurlBlock(block)
} else if block.Type == "playwright" {
data, err = parsers.ExecutePlaywrightBlock(block)
}
Expand All @@ -81,52 +82,41 @@ func ExecuteBlocks(e *common.PipetApp) error {

e.Data = append(e.Data, data)

if block.NextPage == "" {
break
var parts []string
switch cmd := block.Command.(type) {
case string:
parts, _ = shlex.Split(cmd)
case []string:
parts = cmd
default:
}

nextURL, err := getNextPageURL(block, data)
if err != nil {
return err
for i, u := range parts {
if len(u) >= 4 && u[:4] == "http" {
parts[i] = concatenateURLs(parts[i], nextPageURL)
break
}
}

block.Command = strings.Replace(block.Command, block.Command[strings.Index(block.Command, " ")+1:], nextURL, 1)
block.Command = parts
}
}

return nil
}

func getNextPageURL(block common.Block, data interface{}) (string, error) {
parts := strings.Split(block.NextPage, "|")
selector := strings.TrimSpace(parts[0])

var nextURL string

if block.Type == "curl" {
if strings.HasPrefix(selector, ".") {
// JSON mode
nextURL = gjson.Get(fmt.Sprintf("%v", data), selector).String()
} else {
// HTML mode
doc, err := goquery.NewDocumentFromReader(strings.NewReader(fmt.Sprintf("%v", data)))
if err != nil {
return "", err
}
nextURL, _ = doc.Find(selector).Attr("href")
}
} else if block.Type == "playwright" {
// TODO: Implement Playwright next page logic
return "", fmt.Errorf("playwright next page not implemented")
func concatenateURLs(base, ref string) string {
baseURL, err := url.Parse(base)
if err != nil {
panic(err)
}

if len(parts) > 1 {
pipedURL, err := parsers.ExecutePipe(nextURL, strings.TrimSpace(parts[1]))
if err != nil {
return "", err
}
nextURL = strings.TrimSpace(pipedURL)
refURL, err := url.Parse(ref)
if err != nil {
panic(err)
}

return nextURL, nil
// Resolve reference URL relative to the base URL
fullURL := baseURL.ResolveReference(refURL)

return fullURL.String()
}
13 changes: 9 additions & 4 deletions parsers/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ import (
"github.com/PuerkitoBio/goquery"
)

func ParseHTMLQueries(htmlData []byte, queries []string) (interface{}, error) {
func ParseHTMLQueries(htmlData []byte, queries []string, nextPage string) (interface{}, string, error) {

result := []interface{}{}

// get new HTML
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlData)))
if err != nil {
return nil, err
return nil, "", err
}

for i, line := range queries {
Expand All @@ -39,7 +39,7 @@ func ParseHTMLQueries(htmlData []byte, queries []string) (interface{}, error) {
elements.Each(func(subi int, subdoc *goquery.Selection) {
html, _ := goquery.OuterHtml(subdoc)

value2, _ := ParseHTMLQueries([]byte(html), lines)
value2, _, _ := ParseHTMLQueries([]byte(html), lines, "")
subresult = append(subresult, value2)
})

Expand Down Expand Up @@ -69,5 +69,10 @@ func ParseHTMLQueries(htmlData []byte, queries []string) (interface{}, error) {
}

}
return result, nil

nextPageURL := ""
if nextPage != "" {
nextPageURL = doc.Find(nextPage).First().AttrOr("href", "")
}
return result, nextPageURL, nil
}
6 changes: 3 additions & 3 deletions parsers/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/tidwall/gjson"
)

func ParseJSONQueries(jsonData []byte, queries []string) (interface{}, error) {
func ParseJSONQueries(jsonData []byte, queries []string) (interface{}, string, error) {

result := []interface{}{}

Expand All @@ -33,7 +33,7 @@ func ParseJSONQueries(jsonData []byte, queries []string) (interface{}, error) {
elements.ForEach(func(subi gjson.Result, subdoc gjson.Result) bool {
html := subdoc.String()

value2, _ := ParseJSONQueries([]byte(html), lines)
value2, _, _ := ParseJSONQueries([]byte(html), lines)
subresult = append(subresult, value2)
return true
})
Expand Down Expand Up @@ -72,5 +72,5 @@ func ParseJSONQueries(jsonData []byte, queries []string) (interface{}, error) {
}

}
return result, nil
return result, "", nil
}
9 changes: 8 additions & 1 deletion parsers/playwright.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@ func ExecutePlaywrightBlock(block common.Block) (interface{}, error) {
return nil, fmt.Errorf("failed to create new page: %w", err)
}

url := strings.TrimPrefix(block.Command, "playwright ")
var url string

switch cmd := block.Command.(type) {
case string:
url = strings.TrimPrefix(cmd, "playwright ")
default:
}

_, err = page.Goto(url, playwright.PageGotoOptions{
WaitUntil: playwright.WaitUntilStateNetworkidle,
})
Expand Down
20 changes: 13 additions & 7 deletions parsers/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,24 @@ import (
"github.com/google/shlex"
)

func ExecuteCurlBlock(block common.Block) (interface{}, error) {
func ExecuteCurlBlock(block common.Block) (interface{}, string, error) {
if !commandExists("curl") {
return nil, fmt.Errorf("curl command not found. Please install curl and try again")
return nil, "", fmt.Errorf("curl command not found. Please install curl and try again")
}

var parts []string
switch cmd := block.Command.(type) {
case string:
parts, _ = shlex.Split(cmd)
case []string:
parts = cmd
default:
}

// Split the command into curl and its arguments
parts, _ := shlex.Split(block.Command)
log.Println("Execute curl:", block.Command)
cmd := exec.Command(parts[0], parts[1:]...)
output, err := cmd.Output()
if err != nil {
return nil, fmt.Errorf("curl command failed: %w\nOutput: %s", err, string(output))
return nil, "", fmt.Errorf("curl command failed: %w\nOutput: %s", err, string(output))
}

isJSON := json.Valid(bytes.TrimSpace(output))
Expand All @@ -33,7 +39,7 @@ func ExecuteCurlBlock(block common.Block) (interface{}, error) {
return ParseJSONQueries(output, block.Queries)
} else {
log.Println("HTML detected")
return ParseHTMLQueries(output, block.Queries)
return ParseHTMLQueries(output, block.Queries, block.NextPage)
}
}

Expand Down

0 comments on commit 9a6db7c

Please sign in to comment.