Skip to content

Commit 3c3a7ff

Browse files
authored
# enhancement of web scraping functionality and data structuring in Excel output
1 parent 26e49fb commit 3c3a7ff

File tree

1 file changed

+247
-29
lines changed

1 file changed

+247
-29
lines changed

main.go

+247-29
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,312 @@
1+
12
package main
23

34
import (
5+
"encoding/json"
46
"fmt"
57
"log"
68
"net/http"
79
"strings"
10+
"time"
811

912
"github.com/PuerkitoBio/goquery"
1013
"github.com/xuri/excelize/v2"
1114
)
1215

1316
// ResponseData is one extracted page element. It is both the JSON payload
// item and the source for one row of the generated spreadsheet.
type ResponseData struct {
	// Text is the element's visible text (alt text for images, the content
	// attribute for metadata entries).
	Text string `json:"text"`
	// URL is the link or image target; empty for non-link elements.
	URL string `json:"url"`
	// Type is the coarse content category, e.g. "title", "heading", "link".
	Type string `json:"type"`
	// Tag is the HTML tag the item came from (or the meta key for metadata).
	Tag string `json:"tag"`
	// MetaData is a free-form extra column; the scraper currently leaves it empty.
	MetaData string `json:"metadata"`
	// Date is the scrape date formatted as YYYY-MM-DD.
	Date string `json:"date"`
}
1724

1825
func scrapeHandler(w http.ResponseWriter, r *http.Request) {
26+
if r.Method != http.MethodGet {
27+
http.Error(w, "Method not supported", http.StatusMethodNotAllowed)
28+
return
29+
}
30+
1931
url := r.URL.Query().Get("url")
2032
if url == "" {
21-
http.Error(w, "URL параметр отсутствует", http.StatusBadRequest)
33+
http.Error(w, "URL parameter is missing", http.StatusBadRequest)
2234
return
2335
}
2436

37+
if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {
38+
url = "https://" + url
39+
}
40+
2541
response, err := http.Get(url)
2642
if err != nil {
27-
http.Error(w, fmt.Sprintf("Ошибка при HTTP-запросе: %v", err), http.StatusInternalServerError)
43+
http.Error(w, fmt.Sprintf("Error during HTTP request: %v", err), http.StatusInternalServerError)
2844
return
2945
}
3046
defer response.Body.Close()
3147

3248
if response.StatusCode != http.StatusOK {
33-
http.Error(w, fmt.Sprintf("Не удалось получить доступ к странице, статус код %d", response.StatusCode), http.StatusInternalServerError)
49+
http.Error(w, fmt.Sprintf("Failed to access the page, status code %d", response.StatusCode), http.StatusInternalServerError)
3450
return
3551
}
3652

3753
doc, err := goquery.NewDocumentFromReader(response.Body)
3854
if err != nil {
39-
http.Error(w, fmt.Sprintf("Ошибка при чтении HTML-документа: %v", err), http.StatusInternalServerError)
55+
http.Error(w, fmt.Sprintf("Error reading HTML document: %v", err), http.StatusInternalServerError)
4056
return
4157
}
4258

4359
var data []ResponseData
44-
doc.Find("body").Each(func(i int, body *goquery.Selection) {
45-
textData := strings.TrimSpace(body.Text())
46-
lines := strings.Split(textData, "\n")
47-
for _, line := range lines {
48-
cleanLine := strings.TrimSpace(line)
49-
if cleanLine != "" {
50-
data = append(data, ResponseData{Text: cleanLine, URL: ""})
60+
61+
metaData := make(map[string]string)
62+
doc.Find("meta").Each(func(i int, s *goquery.Selection) {
63+
name, _ := s.Attr("name")
64+
property, _ := s.Attr("property")
65+
content, _ := s.Attr("content")
66+
67+
key := name
68+
if key == "" {
69+
key = property
70+
}
71+
72+
if key != "" && content != "" {
73+
metaData[key] = content
74+
}
75+
})
76+
77+
title := doc.Find("title").Text()
78+
if title != "" {
79+
data = append(data, ResponseData{
80+
Text: title,
81+
URL: "",
82+
Type: "title",
83+
Tag: "title",
84+
MetaData: "",
85+
Date: time.Now().Format("2006-01-02"),
86+
})
87+
}
88+
89+
doc.Find("h1, h2, h3, h4, h5, h6").Each(func(i int, s *goquery.Selection) {
90+
text := strings.TrimSpace(s.Text())
91+
if text != "" {
92+
headingType := s.Get(0).Data // h1, h2, etc.
93+
data = append(data, ResponseData{
94+
Text: text,
95+
URL: "",
96+
Type: "heading",
97+
Tag: headingType,
98+
MetaData: "",
99+
Date: time.Now().Format("2006-01-02"),
100+
})
101+
}
102+
})
103+
104+
doc.Find("p").Each(func(i int, s *goquery.Selection) {
105+
text := strings.TrimSpace(s.Text())
106+
if text != "" {
107+
data = append(data, ResponseData{
108+
Text: text,
109+
URL: "",
110+
Type: "paragraph",
111+
Tag: "p",
112+
MetaData: "",
113+
Date: time.Now().Format("2006-01-02"),
114+
})
115+
}
116+
})
117+
118+
doc.Find("ul, ol").Each(func(i int, s *goquery.Selection) {
119+
listType := s.Get(0).Data // ul or ol
120+
s.Find("li").Each(func(j int, li *goquery.Selection) {
121+
text := strings.TrimSpace(li.Text())
122+
if text != "" {
123+
data = append(data, ResponseData{
124+
Text: text,
125+
URL: "",
126+
Type: "list-item",
127+
Tag: listType + "-li",
128+
MetaData: "",
129+
Date: time.Now().Format("2006-01-02"),
130+
})
51131
}
132+
})
133+
})
134+
135+
doc.Find("a").Each(func(i int, s *goquery.Selection) {
136+
href, exists := s.Attr("href")
137+
text := strings.TrimSpace(s.Text())
138+
139+
if exists && href != "" {
140+
if strings.HasPrefix(href, "/") {
141+
baseURL := getBaseURL(url)
142+
href = baseURL + href
143+
}
144+
145+
data = append(data, ResponseData{
146+
Text: text,
147+
URL: href,
148+
Type: "link",
149+
Tag: "a",
150+
MetaData: "",
151+
Date: time.Now().Format("2006-01-02"),
152+
})
52153
}
53-
body.Find("a").Each(func(j int, a *goquery.Selection) {
54-
href, exists := a.Attr("href")
55-
if exists {
56-
data = append(data, ResponseData{Text: "", URL: href})
154+
})
155+
156+
// Extract images
157+
doc.Find("img").Each(func(i int, s *goquery.Selection) {
158+
src, exists := s.Attr("src")
159+
alt, _ := s.Attr("alt")
160+
161+
if exists && src != "" {
162+
// Convert relative URLs to absolute
163+
if strings.HasPrefix(src, "/") {
164+
baseURL := getBaseURL(url)
165+
src = baseURL + src
57166
}
167+
168+
data = append(data, ResponseData{
169+
Text: alt,
170+
URL: src,
171+
Type: "image",
172+
Tag: "img",
173+
MetaData: "",
174+
Date: time.Now().Format("2006-01-02"),
175+
})
176+
}
177+
})
178+
179+
// Extract tables
180+
doc.Find("table").Each(func(tableIdx int, table *goquery.Selection) {
181+
tableData := ""
182+
183+
table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
184+
if rowIdx > 0 {
185+
tableData += "\n"
186+
}
187+
188+
row.Find("th, td").Each(func(colIdx int, cell *goquery.Selection) {
189+
if colIdx > 0 {
190+
tableData += " | "
191+
}
192+
tableData += strings.TrimSpace(cell.Text())
193+
})
58194
})
195+
196+
if tableData != "" {
197+
data = append(data, ResponseData{
198+
Text: tableData,
199+
URL: "",
200+
Type: "table",
201+
Tag: "table",
202+
MetaData: "",
203+
Date: time.Now().Format("2006-01-02"),
204+
})
205+
}
59206
})
60207

208+
// Add metadata as separate entries
209+
for key, value := range metaData {
210+
data = append(data, ResponseData{
211+
Text: value,
212+
URL: "",
213+
Type: "metadata",
214+
Tag: key,
215+
MetaData: "",
216+
Date: time.Now().Format("2006-01-02"),
217+
})
218+
}
219+
220+
acceptHeader := r.Header.Get("Accept")
221+
if strings.Contains(acceptHeader, "application/json") {
222+
w.Header().Set("Content-Type", "application/json")
223+
if err := json.NewEncoder(w).Encode(data); err != nil {
224+
http.Error(w, fmt.Sprintf("Error encoding JSON: %v", err), http.StatusInternalServerError)
225+
}
226+
return
227+
}
228+
229+
// Create Excel file
61230
f := excelize.NewFile()
62231
index := f.NewSheet("Sheet1")
63232

233+
// Set headers
234+
headers := []string{"Content Type", "HTML Tag", "Text", "URL", "Metadata", "Date"}
235+
for i, header := range headers {
236+
cell := fmt.Sprintf("%c1", 'A'+i)
237+
f.SetCellValue("Sheet1", cell, header)
238+
}
239+
240+
// Style headers
241+
headerStyle, err := f.NewStyle(&excelize.Style{
242+
Font: &excelize.Font{Bold: true, Size: 12},
243+
Fill: excelize.Fill{Type: "pattern", Color: []string{"#DDDDDD"}, Pattern: 1},
244+
})
245+
if err == nil {
246+
f.SetCellStyle("Sheet1", "A1", string(rune('A'+len(headers)-1))+"1", headerStyle)
247+
}
248+
249+
// Populate data
64250
for i, item := range data {
65-
if item.Text != "" {
66-
cell := fmt.Sprintf("A%d", i+1)
67-
f.SetCellValue("Sheet1", cell, item.Text)
68-
} else if item.URL != "" {
69-
cell := fmt.Sprintf("B%d", i+1)
70-
f.SetCellValue("Sheet1", cell, item.URL)
71-
}
251+
rowIdx := i + 2 // +2 because headers are at row 1
252+
f.SetCellValue("Sheet1", fmt.Sprintf("A%d", rowIdx), item.Type)
253+
f.SetCellValue("Sheet1", fmt.Sprintf("B%d", rowIdx), item.Tag)
254+
f.SetCellValue("Sheet1", fmt.Sprintf("C%d", rowIdx), item.Text)
255+
f.SetCellValue("Sheet1", fmt.Sprintf("D%d", rowIdx), item.URL)
256+
f.SetCellValue("Sheet1", fmt.Sprintf("E%d", rowIdx), item.MetaData)
257+
f.SetCellValue("Sheet1", fmt.Sprintf("F%d", rowIdx), item.Date)
72258
}
73259

260+
// Auto column width
261+
f.SetColWidth("Sheet1", "A", "A", 15)
262+
f.SetColWidth("Sheet1", "B", "B", 15)
263+
f.SetColWidth("Sheet1", "C", "C", 60)
264+
f.SetColWidth("Sheet1", "D", "D", 40)
265+
f.SetColWidth("Sheet1", "E", "E", 20)
266+
f.SetColWidth("Sheet1", "F", "F", 15)
267+
74268
f.SetActiveSheet(index)
75269

270+
// Set headers for download
271+
fileName := getCleanDomainName(url) + "_data.xlsx"
76272
w.Header().Set("Content-Type", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
77-
w.Header().Set("Content-Disposition", "attachment;filename=results.xlsx")
78-
w.Header().Set("File-Name", "results.xlsx")
273+
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s", fileName))
79274
w.Header().Set("Content-Transfer-Encoding", "binary")
275+
w.Header().Set("Expires", "0")
80276

277+
// Write Excel file
81278
if err := f.Write(w); err != nil {
82-
http.Error(w, fmt.Sprintf("Ошибка при записи Excel файла: %v", err), http.StatusInternalServerError)
279+
http.Error(w, fmt.Sprintf("Error writing Excel file: %v", err), http.StatusInternalServerError)
83280
}
84281
}
85282

283+
// getBaseURL returns the scheme-and-host prefix of url, e.g.
// "https://example.com" for "https://example.com/a/b?x=1". Everything from
// the first '/', '?' or '#' after the host is dropped, so a query or fragment
// never leaks into the base even when the URL has no path. A URL that is
// already just scheme and host is returned unchanged.
func getBaseURL(url string) string {
	hostStart := 0
	// Treat "://" as the scheme separator only when it precedes any path,
	// query or fragment; otherwise it belongs to an embedded URL.
	if i := strings.Index(url, "://"); i >= 0 && !strings.ContainsAny(url[:i], "/?#") {
		hostStart = i + len("://")
	}
	if end := strings.IndexAny(url[hostStart:], "/?#"); end >= 0 {
		return url[:hostStart+end]
	}
	return url
}
290+
291+
// getCleanDomainName derives a file-name-safe label from url: the scheme and
// a leading "www." are stripped, everything from the first '/', '?' or '#' is
// dropped, and every character other than an ASCII letter, digit, '-' or '_'
// is replaced by '_'. For example "https://www.example.com/path" becomes
// "example_com".
//
// The result is embedded in a Content-Disposition header, so characters such
// as ':', '"' or '@' must not survive.
func getCleanDomainName(url string) string {
	url = strings.TrimPrefix(url, "http://")
	url = strings.TrimPrefix(url, "https://")
	url = strings.TrimPrefix(url, "www.")

	domain := url
	if end := strings.IndexAny(domain, "/?#"); end >= 0 {
		domain = domain[:end]
	}

	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_':
			return r
		default:
			return '_'
		}
	}, domain)
}
302+
86303
func main() {
87304
http.HandleFunc("/scrape", scrapeHandler)
88305

89306
fs := http.FileServer(http.Dir("./"))
90307
http.Handle("/", fs)
91308

92-
log.Println("Сервер запущен на http://localhost:8080")
93-
log.Fatal(http.ListenAndServe(":8080", nil))
309+
port := ":8080"
310+
log.Printf("Server started at http://0.0.0.0%s", port)
311+
log.Fatal(http.ListenAndServe("0.0.0.0"+port, nil))
94312
}

0 commit comments

Comments
 (0)