Week 3 upload sw-develop #12

Open
wants to merge 13 commits into main
17 changes: 17 additions & 0 deletions sewon/home.html
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>Go Jobs</title>
</head>
<body>
<h1>Go Jobs</h1>
<h3>Indeed.com scraper</h3>
<form method="POST" action="/scrape">
<input placeholder="What job do you want?" name="term" />
<button>Search</button>
</form>
</body>
</html>
56 changes: 21 additions & 35 deletions sewon/main.go
@@ -1,42 +1,28 @@
package main // declares which package this file belongs to; required for main.go to compile
package main

import (
"errors"
"fmt"
"net/http"
"github.com/labstack/echo"
"github.com/sw-develop/learngo/golang_study/sewon/scrapper"
"os"
"strings"
)

var errRequestFailed = errors.New("request failed")
const fileName string = "jobs.csv"

func main() {
var results = make(map[string]string) // create and initialize a map; make() is a built-in func
urls := []string{
"https://www.airbnb.com/",
"https://www.google.com/",
"https://www.amazon.com/",
"https://www.reddit.com/",
"https://www.google.com/",
"https://soundcloud.com/",
"https://www.facebook.com/",
"https://www.instagram.com/",
}
for _, url := range urls {
result := "OK"
err := hitURL(url)
if err != nil {
result = "FAILED"
}
results[url] = result
}
for url, result := range results {
fmt.Println(url, result)
}
func handleHome(c echo.Context) error {
return c.File("home.html")
}

func hitURL(url string) error {
fmt.Println("Checking: ", url)
res, err := http.Get(url)
if err != nil || res.StatusCode >= 400 {
return errRequestFailed
}
return nil
func handleScrape(c echo.Context) error {
defer os.Remove(fileName)
term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
scrapper.Scrape(term) // run Scrape()
return c.Attachment(fileName, fileName)
}

func main() {
e := echo.New()
e.GET("/", handleHome)
e.POST("/scrape", handleScrape)
e.Logger.Fatal(e.Start(":1323"))
}
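
For reviewers who want to exercise the new handlers locally: below is a minimal client sketch, not part of this PR, that posts the same `term` form field home.html submits. It assumes the server above is already running on :1323; the search term "python" is only a placeholder.

package main

import (
	"io"
	"net/http"
	"net/url"
	"os"
)

func main() {
	// POST the "term" form field, exactly as the form in home.html does.
	res, err := http.PostForm("http://localhost:1323/scrape", url.Values{"term": {"python"}})
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	// The handler responds with jobs.csv as an attachment; save it locally.
	out, err := os.Create("jobs.csv")
	if err != nil {
		panic(err)
	}
	defer out.Close()

	if _, err := io.Copy(out, res.Body); err != nil {
		panic(err)
	}
}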
166 changes: 166 additions & 0 deletions sewon/scrapper/scrapper.go
@@ -0,0 +1,166 @@
package scrapper

import (
"encoding/csv"
"fmt"
"github.com/PuerkitoBio/goquery"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
"time"
)

type extractedJob struct {
id string
title string
companyName string
location string
}

func timeTrack(start time.Time, name string) {
elapsed := time.Since(start)
log.Printf("%s took %s", name, elapsed)
}

func Scrape(term string) {
defer timeTrack(time.Now(), "JobScrapper")

var baseURL = "https://kr.indeed.com/jobs?q=" + term + "&limit=50"

var jobs []extractedJob
mainChannel := make(chan []extractedJob)
totalPages := getPages(baseURL)

for i := 0; i < totalPages; i++ {
go getPage(i, baseURL, mainChannel)
}

for i := 0; i < totalPages; i++ {
extractedJob := <-mainChannel
jobs = append(jobs, extractedJob...)
}

writeJobs(jobs)
fmt.Println("Done, extracted : ", len(jobs))
}

func getPage(pageNum int, baseURL string, mainChannel chan<- []extractedJob) { // fetch one paginated result page
var jobs []extractedJob

channel := make(chan extractedJob) // create a channel

pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50)
fmt.Println("Requesting: ", pageURL)

res, err := http.Get(pageURL)
checkErr(err)
checkHttpStatus(res)

defer func(body io.ReadCloser) { // close the response body; log any error from Close
	if err := body.Close(); err != nil {
		log.Println(err)
	}
}(res.Body)

doc, err := goquery.NewDocumentFromReader(res.Body)
checkErr(err)

searchJobs := doc.Find(".tapItem")

searchJobs.Each(func(i int, selection *goquery.Selection) {
go extractJob(selection, channel) // extract each job card in its own goroutine
})

for i := 0; i < searchJobs.Length(); i++ {
job := <-channel
jobs = append(jobs, job)
}

mainChannel <- jobs
}

func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // chan<- : send-only channel
id, _ := selection.Attr("data-jk")
title := CleanString(selection.Find("h2>span").Text())
companyName := CleanString(selection.Find(".companyName").Text())
location := CleanString(selection.Find(".companyLocation").Text())

channel <- extractedJob{ // send to channel
id: id,
title: title,
companyName: companyName,
location: location}
}

func getPages(baseURL string) int { // return the number of pagination pages
pages := 0
res, err := http.Get(baseURL)
checkErr(err)
checkHttpStatus(res)

defer func(body io.ReadCloser) { // close the response body; log any error from Close
	if err := body.Close(); err != nil {
		log.Println(err)
	}
}(res.Body)

// Load the HTML document
doc, err := goquery.NewDocumentFromReader(res.Body)
checkErr(err)

doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) {
// For each item found, count pages
pages = selection.Find("a").Length() // count the <a> tags
})

return pages
}

func writeJobs(jobs []extractedJob) {
file, err := os.Create("jobs.csv")
checkErr(err)

// write a UTF-8 BOM so Hangul (Korean) text is decoded correctly
utf8bom := []byte{0xEF, 0xBB, 0xBF}
_, encodingErr := file.Write(utf8bom)
checkErr(encodingErr)

w := csv.NewWriter(file)
defer w.Flush() // defer: deferred execution (runs when the function returns)

headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"}

writeErr := w.Write(headers)
checkErr(writeErr)

for _, job := range jobs { // _ : index, job : element value
jobSlice := []string{
"https://kr.indeed.com/viewjob?jk=" + job.id,
job.title,
job.companyName,
job.location}
jobWriteErr := w.Write(jobSlice)
checkErr(jobWriteErr)
}
}

func checkErr(err error) { // error handling
if err != nil {
log.Fatalln(err)
}
}

func checkHttpStatus(res *http.Response) {
if res.StatusCode != 200 {
log.Fatalln("Request failed with Status:", res.StatusCode)
}
}

func CleanString(str string) string { // e.g. "  a:   b:  c:  " -> ["a:", "b:", "c:"] -> "a: b: c:" (result of Join())
return strings.Join(strings.Fields(strings.TrimSpace(str)), " ")
}