|
| 1 | + |
1 | 2 | package main
|
2 | 3 |
|
import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"sort"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/xuri/excelize/v2"
)
|
12 | 15 |
|
// ResponseData is a single scraped item extracted from a page: a title,
// heading, paragraph, list item, link, image, table, or <meta> entry.
// It is serialized to JSON directly or written as one row of the Excel export.
type ResponseData struct {
	Text     string `json:"text"`     // visible text content (alt text for images, "" for bare links)
	URL      string `json:"url"`      // href/src for links and images; empty for other types
	Type     string `json:"type"`     // content category, e.g. "heading", "link", "table", "metadata"
	Tag      string `json:"tag"`      // originating HTML tag, e.g. "h1", "p", "ul-li", or meta key
	MetaData string `json:"metadata"` // reserved; the scraper currently always sets this to ""
	Date     string `json:"date"`     // scrape date, formatted YYYY-MM-DD
}
|
17 | 24 |
|
18 | 25 | func scrapeHandler(w http.ResponseWriter, r *http.Request) {
|
| 26 | + if r.Method != http.MethodGet { |
| 27 | + http.Error(w, "Method not supported", http.StatusMethodNotAllowed) |
| 28 | + return |
| 29 | + } |
| 30 | + |
19 | 31 | url := r.URL.Query().Get("url")
|
20 | 32 | if url == "" {
|
21 |
| - http.Error(w, "URL параметр отсутствует", http.StatusBadRequest) |
| 33 | + http.Error(w, "URL parameter is missing", http.StatusBadRequest) |
22 | 34 | return
|
23 | 35 | }
|
24 | 36 |
|
| 37 | + if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") { |
| 38 | + url = "https://" + url |
| 39 | + } |
| 40 | + |
25 | 41 | response, err := http.Get(url)
|
26 | 42 | if err != nil {
|
27 |
| - http.Error(w, fmt.Sprintf("Ошибка при HTTP-запросе: %v", err), http.StatusInternalServerError) |
| 43 | + http.Error(w, fmt.Sprintf("Error during HTTP request: %v", err), http.StatusInternalServerError) |
28 | 44 | return
|
29 | 45 | }
|
30 | 46 | defer response.Body.Close()
|
31 | 47 |
|
32 | 48 | if response.StatusCode != http.StatusOK {
|
33 |
| - http.Error(w, fmt.Sprintf("Не удалось получить доступ к странице, статус код %d", response.StatusCode), http.StatusInternalServerError) |
| 49 | + http.Error(w, fmt.Sprintf("Failed to access the page, status code %d", response.StatusCode), http.StatusInternalServerError) |
34 | 50 | return
|
35 | 51 | }
|
36 | 52 |
|
37 | 53 | doc, err := goquery.NewDocumentFromReader(response.Body)
|
38 | 54 | if err != nil {
|
39 |
| - http.Error(w, fmt.Sprintf("Ошибка при чтении HTML-документа: %v", err), http.StatusInternalServerError) |
| 55 | + http.Error(w, fmt.Sprintf("Error reading HTML document: %v", err), http.StatusInternalServerError) |
40 | 56 | return
|
41 | 57 | }
|
42 | 58 |
|
43 | 59 | var data []ResponseData
|
44 |
| - doc.Find("body").Each(func(i int, body *goquery.Selection) { |
45 |
| - textData := strings.TrimSpace(body.Text()) |
46 |
| - lines := strings.Split(textData, "\n") |
47 |
| - for _, line := range lines { |
48 |
| - cleanLine := strings.TrimSpace(line) |
49 |
| - if cleanLine != "" { |
50 |
| - data = append(data, ResponseData{Text: cleanLine, URL: ""}) |
| 60 | + |
| 61 | + metaData := make(map[string]string) |
| 62 | + doc.Find("meta").Each(func(i int, s *goquery.Selection) { |
| 63 | + name, _ := s.Attr("name") |
| 64 | + property, _ := s.Attr("property") |
| 65 | + content, _ := s.Attr("content") |
| 66 | + |
| 67 | + key := name |
| 68 | + if key == "" { |
| 69 | + key = property |
| 70 | + } |
| 71 | + |
| 72 | + if key != "" && content != "" { |
| 73 | + metaData[key] = content |
| 74 | + } |
| 75 | + }) |
| 76 | + |
| 77 | + title := doc.Find("title").Text() |
| 78 | + if title != "" { |
| 79 | + data = append(data, ResponseData{ |
| 80 | + Text: title, |
| 81 | + URL: "", |
| 82 | + Type: "title", |
| 83 | + Tag: "title", |
| 84 | + MetaData: "", |
| 85 | + Date: time.Now().Format("2006-01-02"), |
| 86 | + }) |
| 87 | + } |
| 88 | + |
| 89 | + doc.Find("h1, h2, h3, h4, h5, h6").Each(func(i int, s *goquery.Selection) { |
| 90 | + text := strings.TrimSpace(s.Text()) |
| 91 | + if text != "" { |
| 92 | + headingType := s.Get(0).Data // h1, h2, etc. |
| 93 | + data = append(data, ResponseData{ |
| 94 | + Text: text, |
| 95 | + URL: "", |
| 96 | + Type: "heading", |
| 97 | + Tag: headingType, |
| 98 | + MetaData: "", |
| 99 | + Date: time.Now().Format("2006-01-02"), |
| 100 | + }) |
| 101 | + } |
| 102 | + }) |
| 103 | + |
| 104 | + doc.Find("p").Each(func(i int, s *goquery.Selection) { |
| 105 | + text := strings.TrimSpace(s.Text()) |
| 106 | + if text != "" { |
| 107 | + data = append(data, ResponseData{ |
| 108 | + Text: text, |
| 109 | + URL: "", |
| 110 | + Type: "paragraph", |
| 111 | + Tag: "p", |
| 112 | + MetaData: "", |
| 113 | + Date: time.Now().Format("2006-01-02"), |
| 114 | + }) |
| 115 | + } |
| 116 | + }) |
| 117 | + |
| 118 | + doc.Find("ul, ol").Each(func(i int, s *goquery.Selection) { |
| 119 | + listType := s.Get(0).Data // ul or ol |
| 120 | + s.Find("li").Each(func(j int, li *goquery.Selection) { |
| 121 | + text := strings.TrimSpace(li.Text()) |
| 122 | + if text != "" { |
| 123 | + data = append(data, ResponseData{ |
| 124 | + Text: text, |
| 125 | + URL: "", |
| 126 | + Type: "list-item", |
| 127 | + Tag: listType + "-li", |
| 128 | + MetaData: "", |
| 129 | + Date: time.Now().Format("2006-01-02"), |
| 130 | + }) |
51 | 131 | }
|
| 132 | + }) |
| 133 | + }) |
| 134 | + |
| 135 | + doc.Find("a").Each(func(i int, s *goquery.Selection) { |
| 136 | + href, exists := s.Attr("href") |
| 137 | + text := strings.TrimSpace(s.Text()) |
| 138 | + |
| 139 | + if exists && href != "" { |
| 140 | + if strings.HasPrefix(href, "/") { |
| 141 | + baseURL := getBaseURL(url) |
| 142 | + href = baseURL + href |
| 143 | + } |
| 144 | + |
| 145 | + data = append(data, ResponseData{ |
| 146 | + Text: text, |
| 147 | + URL: href, |
| 148 | + Type: "link", |
| 149 | + Tag: "a", |
| 150 | + MetaData: "", |
| 151 | + Date: time.Now().Format("2006-01-02"), |
| 152 | + }) |
52 | 153 | }
|
53 |
| - body.Find("a").Each(func(j int, a *goquery.Selection) { |
54 |
| - href, exists := a.Attr("href") |
55 |
| - if exists { |
56 |
| - data = append(data, ResponseData{Text: "", URL: href}) |
| 154 | + }) |
| 155 | + |
| 156 | + // Extract images |
| 157 | + doc.Find("img").Each(func(i int, s *goquery.Selection) { |
| 158 | + src, exists := s.Attr("src") |
| 159 | + alt, _ := s.Attr("alt") |
| 160 | + |
| 161 | + if exists && src != "" { |
| 162 | + // Convert relative URLs to absolute |
| 163 | + if strings.HasPrefix(src, "/") { |
| 164 | + baseURL := getBaseURL(url) |
| 165 | + src = baseURL + src |
57 | 166 | }
|
| 167 | + |
| 168 | + data = append(data, ResponseData{ |
| 169 | + Text: alt, |
| 170 | + URL: src, |
| 171 | + Type: "image", |
| 172 | + Tag: "img", |
| 173 | + MetaData: "", |
| 174 | + Date: time.Now().Format("2006-01-02"), |
| 175 | + }) |
| 176 | + } |
| 177 | + }) |
| 178 | + |
| 179 | + // Extract tables |
| 180 | + doc.Find("table").Each(func(tableIdx int, table *goquery.Selection) { |
| 181 | + tableData := "" |
| 182 | + |
| 183 | + table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) { |
| 184 | + if rowIdx > 0 { |
| 185 | + tableData += "\n" |
| 186 | + } |
| 187 | + |
| 188 | + row.Find("th, td").Each(func(colIdx int, cell *goquery.Selection) { |
| 189 | + if colIdx > 0 { |
| 190 | + tableData += " | " |
| 191 | + } |
| 192 | + tableData += strings.TrimSpace(cell.Text()) |
| 193 | + }) |
58 | 194 | })
|
| 195 | + |
| 196 | + if tableData != "" { |
| 197 | + data = append(data, ResponseData{ |
| 198 | + Text: tableData, |
| 199 | + URL: "", |
| 200 | + Type: "table", |
| 201 | + Tag: "table", |
| 202 | + MetaData: "", |
| 203 | + Date: time.Now().Format("2006-01-02"), |
| 204 | + }) |
| 205 | + } |
59 | 206 | })
|
60 | 207 |
|
| 208 | + // Add metadata as separate entries |
| 209 | + for key, value := range metaData { |
| 210 | + data = append(data, ResponseData{ |
| 211 | + Text: value, |
| 212 | + URL: "", |
| 213 | + Type: "metadata", |
| 214 | + Tag: key, |
| 215 | + MetaData: "", |
| 216 | + Date: time.Now().Format("2006-01-02"), |
| 217 | + }) |
| 218 | + } |
| 219 | + |
| 220 | + acceptHeader := r.Header.Get("Accept") |
| 221 | + if strings.Contains(acceptHeader, "application/json") { |
| 222 | + w.Header().Set("Content-Type", "application/json") |
| 223 | + if err := json.NewEncoder(w).Encode(data); err != nil { |
| 224 | + http.Error(w, fmt.Sprintf("Error encoding JSON: %v", err), http.StatusInternalServerError) |
| 225 | + } |
| 226 | + return |
| 227 | + } |
| 228 | + |
| 229 | + // Create Excel file |
61 | 230 | f := excelize.NewFile()
|
62 | 231 | index := f.NewSheet("Sheet1")
|
63 | 232 |
|
| 233 | + // Set headers |
| 234 | + headers := []string{"Content Type", "HTML Tag", "Text", "URL", "Metadata", "Date"} |
| 235 | + for i, header := range headers { |
| 236 | + cell := fmt.Sprintf("%c1", 'A'+i) |
| 237 | + f.SetCellValue("Sheet1", cell, header) |
| 238 | + } |
| 239 | + |
| 240 | + // Style headers |
| 241 | + headerStyle, err := f.NewStyle(&excelize.Style{ |
| 242 | + Font: &excelize.Font{Bold: true, Size: 12}, |
| 243 | + Fill: excelize.Fill{Type: "pattern", Color: []string{"#DDDDDD"}, Pattern: 1}, |
| 244 | + }) |
| 245 | + if err == nil { |
| 246 | + f.SetCellStyle("Sheet1", "A1", string(rune('A'+len(headers)-1))+"1", headerStyle) |
| 247 | + } |
| 248 | + |
| 249 | + // Populate data |
64 | 250 | for i, item := range data {
|
65 |
| - if item.Text != "" { |
66 |
| - cell := fmt.Sprintf("A%d", i+1) |
67 |
| - f.SetCellValue("Sheet1", cell, item.Text) |
68 |
| - } else if item.URL != "" { |
69 |
| - cell := fmt.Sprintf("B%d", i+1) |
70 |
| - f.SetCellValue("Sheet1", cell, item.URL) |
71 |
| - } |
| 251 | + rowIdx := i + 2 // +2 because headers are at row 1 |
| 252 | + f.SetCellValue("Sheet1", fmt.Sprintf("A%d", rowIdx), item.Type) |
| 253 | + f.SetCellValue("Sheet1", fmt.Sprintf("B%d", rowIdx), item.Tag) |
| 254 | + f.SetCellValue("Sheet1", fmt.Sprintf("C%d", rowIdx), item.Text) |
| 255 | + f.SetCellValue("Sheet1", fmt.Sprintf("D%d", rowIdx), item.URL) |
| 256 | + f.SetCellValue("Sheet1", fmt.Sprintf("E%d", rowIdx), item.MetaData) |
| 257 | + f.SetCellValue("Sheet1", fmt.Sprintf("F%d", rowIdx), item.Date) |
72 | 258 | }
|
73 | 259 |
|
| 260 | + // Auto column width |
| 261 | + f.SetColWidth("Sheet1", "A", "A", 15) |
| 262 | + f.SetColWidth("Sheet1", "B", "B", 15) |
| 263 | + f.SetColWidth("Sheet1", "C", "C", 60) |
| 264 | + f.SetColWidth("Sheet1", "D", "D", 40) |
| 265 | + f.SetColWidth("Sheet1", "E", "E", 20) |
| 266 | + f.SetColWidth("Sheet1", "F", "F", 15) |
| 267 | + |
74 | 268 | f.SetActiveSheet(index)
|
75 | 269 |
|
| 270 | + // Set headers for download |
| 271 | + fileName := getCleanDomainName(url) + "_data.xlsx" |
76 | 272 | w.Header().Set("Content-Type", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
77 |
| - w.Header().Set("Content-Disposition", "attachment;filename=results.xlsx") |
78 |
| - w.Header().Set("File-Name", "results.xlsx") |
| 273 | + w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s", fileName)) |
79 | 274 | w.Header().Set("Content-Transfer-Encoding", "binary")
|
| 275 | + w.Header().Set("Expires", "0") |
80 | 276 |
|
| 277 | + // Write Excel file |
81 | 278 | if err := f.Write(w); err != nil {
|
82 |
| - http.Error(w, fmt.Sprintf("Ошибка при записи Excel файла: %v", err), http.StatusInternalServerError) |
| 279 | + http.Error(w, fmt.Sprintf("Error writing Excel file: %v", err), http.StatusInternalServerError) |
83 | 280 | }
|
84 | 281 | }
|
85 | 282 |
|
// getBaseURL returns the scheme-plus-host origin of rawURL, i.e. everything
// up to (but not including) the third "/". When the input has fewer than two
// slashes it is returned unchanged.
func getBaseURL(rawURL string) string {
	segments := strings.SplitN(rawURL, "/", 4)
	if len(segments) < 3 {
		return rawURL
	}
	return strings.Join(segments[:3], "/")
}
| 290 | + |
// getCleanDomainName reduces rawURL to a filesystem-friendly identifier:
// it strips the scheme and a leading "www.", drops any path, and replaces
// dots with underscores (e.g. "https://www.example.com/p" -> "example_com").
func getCleanDomainName(rawURL string) string {
	for _, prefix := range []string{"http://", "https://", "www."} {
		rawURL = strings.TrimPrefix(rawURL, prefix)
	}
	domain, _, _ := strings.Cut(rawURL, "/")
	return strings.ReplaceAll(domain, ".", "_")
}
| 302 | + |
86 | 303 | func main() {
|
87 | 304 | http.HandleFunc("/scrape", scrapeHandler)
|
88 | 305 |
|
89 | 306 | fs := http.FileServer(http.Dir("./"))
|
90 | 307 | http.Handle("/", fs)
|
91 | 308 |
|
92 |
| - log.Println("Сервер запущен на http://localhost:8080") |
93 |
| - log.Fatal(http.ListenAndServe(":8080", nil)) |
| 309 | + port := ":8080" |
| 310 | + log.Printf("Server started at http://0.0.0.0%s", port) |
| 311 | + log.Fatal(http.ListenAndServe("0.0.0.0"+port, nil)) |
94 | 312 | }
|
0 commit comments