diff --git a/sewon/home.html b/sewon/home.html
new file mode 100644
index 0000000..94c50a2
--- /dev/null
+++ b/sewon/home.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Go Jobs</title>
+</head>
+<body>
+<div>
+    <h1>Go Jobs</h1>
+</div>
+<h3>Indeed.com scrapper</h3>
+<form method="POST" action="/scrape">
+    <input name="term">
+    <input type="submit">
+</form>
+</body>
+</html>
\ No newline at end of file
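Note: the form above submits a single `term` field via POST to `/scrape`; the handler in main.go below passes it to the scrapper and returns jobs.csv as an attachment. A minimal client sketch for exercising that endpoint outside the browser (assuming the server below is running on localhost:1323; the search term "python" is just an example value):

```go
// Sketch of a client for the /scrape endpoint (not part of this PR).
// Assumes the echo server from main.go is running on localhost:1323.
package main

import (
	"io"
	"net/http"
	"net/url"
	"os"
)

func main() {
	// Post the same form field the HTML page submits ("python" is an example term).
	res, err := http.PostForm("http://localhost:1323/scrape", url.Values{"term": {"python"}})
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	// The handler answers with jobs.csv as an attachment; save it locally.
	out, err := os.Create("jobs.csv")
	if err != nil {
		panic(err)
	}
	defer out.Close()

	if _, err := io.Copy(out, res.Body); err != nil {
		panic(err)
	}
}
```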
diff --git a/sewon/main.go b/sewon/main.go
index 90c6a8a..6ba05ec 100644
--- a/sewon/main.go
+++ b/sewon/main.go
@@ -1,42 +1,28 @@
-package main // states which package this file belongs to; required for main.go to compile
+package main
+
 import (
-	"errors"
-	"fmt"
-	"net/http"
+	"github.com/labstack/echo"
+	"github.com/sw-develop/learngo/golang_study/sewon/scrapper"
+	"os"
+	"strings"
 )

-var errRequestFailed = errors.New("request failed")
+const fileName string = "jobs.csv"

-func main() {
-	var results = make(map[string]string) // create and initialize the map; make() is a built-in func
-	urls := []string{
-		"https://www.airbnb.com/",
-		"https://www.google.com/",
-		"https://www.amazon.com/",
-		"https://www.reddit.com/",
-		"https://www.google.com/",
-		"https://soundcloud.com/",
-		"https://www.facebook.com/",
-		"https://www.instagram.com/",
-	}
-	for _, url := range urls {
-		result := "OK"
-		err := hitURL(url)
-		if err != nil {
-			result = "FAILED"
-		}
-		results[url] = result
-	}
-	for url, result := range results {
-		fmt.Println(url, result)
-	}
+func handleHome(c echo.Context) error {
+	return c.File("home.html")
 }

-func hitURL(url string) error {
-	fmt.Println("Checking: ", url)
-	res, err := http.Get(url)
-	if err != nil || res.StatusCode >= 400 {
-		return errRequestFailed
-	}
-	return nil
+func handleScrape(c echo.Context) error {
+	defer os.Remove(fileName) // clean up the generated file after the response is sent
+	term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
+	scrapper.Scrape(term) // run Scrape()
+	return c.Attachment(fileName, fileName)
+}
+
+func main() {
+	e := echo.New()
+	e.GET("/", handleHome)
+	e.POST("/scrape", handleScrape)
+	e.Logger.Fatal(e.Start(":1323"))
 }
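Because the handlers are plain funcs over echo.Context, they can be driven with the standard httptest package. A sketch of what a unit test could look like (illustrative only, no test file is part of this PR; echo's NewContext is the framework's testing helper):

```go
// Sketch: unit test for handleHome using echo.NewContext and httptest
// (illustrative; not included in this PR).
package main

import (
	"net/http"
	"net/http/httptest"
	"testing"

	"github.com/labstack/echo"
)

func TestHandleHome(t *testing.T) {
	e := echo.New()
	req := httptest.NewRequest(http.MethodGet, "/", nil)
	rec := httptest.NewRecorder()
	c := e.NewContext(req, rec) // wrap request/recorder in an echo.Context

	if err := handleHome(c); err != nil {
		t.Fatal(err)
	}
	if rec.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", rec.Code)
	}
}
```

Note also that in handleScrape the `defer os.Remove(fileName)` fires only after `c.Attachment` has streamed the file, so the CSV is safely cleaned up after each download.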
diff --git a/sewon/scrapper/scrapper.go b/sewon/scrapper/scrapper.go
new file mode 100644
index 0000000..fd2cb66
--- /dev/null
+++ b/sewon/scrapper/scrapper.go
@@ -0,0 +1,166 @@
+package scrapper
+
+import (
+	"encoding/csv"
+	"fmt"
+	"github.com/PuerkitoBio/goquery"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+)
+
+type extractedJob struct {
+	id          string
+	title       string
+	companyName string
+	location    string
+}
+
+func timeTrack(start time.Time, name string) {
+	elapsed := time.Since(start)
+	log.Printf("%s took %s", name, elapsed)
+}
+
+func Scrape(term string) {
+	defer timeTrack(time.Now(), "JobScrapper")
+
+	var baseURL = "https://kr.indeed.com/jobs?q=" + term + "&limit=50"
+
+	var jobs []extractedJob
+	mainChannel := make(chan []extractedJob)
+	totalPages := getPages(baseURL)
+
+	for i := 0; i < totalPages; i++ {
+		go getPage(i, baseURL, mainChannel) // fan out: one goroutine per result page
+	}
+
+	for i := 0; i < totalPages; i++ {
+		extractedJobs := <-mainChannel // fan in: collect one batch per page
+		jobs = append(jobs, extractedJobs...)
+	}
+
+	writeJobs(jobs)
+	fmt.Println("Done, extracted: ", len(jobs))
+}
+
+func getPage(pageNum int, baseURL string, mainChannel chan<- []extractedJob) { // fetches one paginated result page
+	var jobs []extractedJob
+
+	channel := make(chan extractedJob) // channel receiving jobs extracted from this page
+
+	pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50)
+	fmt.Println("Requesting: ", pageURL)
+
+	res, err := http.Get(pageURL)
+	checkErr(err)
+	checkHttpStatus(res)
+
+	defer func(Body io.ReadCloser) { // close the response body, logging any error
+		if err := Body.Close(); err != nil {
+			log.Println(err)
+		}
+	}(res.Body)
+
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	checkErr(err)
+
+	searchJobs := doc.Find(".tapItem")
+
+	searchJobs.Each(func(i int, selection *goquery.Selection) {
+		go extractJob(selection, channel) // extract each job card in its own goroutine
+	})
+
+	for i := 0; i < searchJobs.Length(); i++ {
+		job := <-channel
+		jobs = append(jobs, job)
+	}
+
+	mainChannel <- jobs
+}
+
+func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // chan<- : send-only channel
+	id, _ := selection.Attr("data-jk")
+	title := CleanString(selection.Find("h2>span").Text())
+	companyName := CleanString(selection.Find(".companyName").Text())
+	location := CleanString(selection.Find(".companyLocation").Text())
+
+	channel <- extractedJob{ // send the result to the channel
+		id:          id,
+		title:       title,
+		companyName: companyName,
+		location:    location}
+}
+
+func getPages(baseURL string) int { // returns the number of result pages
+	pages := 0
+	res, err := http.Get(baseURL)
+	checkErr(err)
+	checkHttpStatus(res)
+
+	defer func(Body io.ReadCloser) { // close the response body, logging any error
+		if err := Body.Close(); err != nil {
+			log.Println(err)
+		}
+	}(res.Body)
+
+	// Load the HTML document
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	checkErr(err)
+
+	doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) {
+		// For each pagination block found, count the <a> tags
+		pages = selection.Find("a").Length()
+	})
+
+	return pages
+}
+
+func writeJobs(jobs []extractedJob) {
+	file, err := os.Create("jobs.csv")
+	checkErr(err)
+
+	// write a UTF-8 BOM so the Korean text is decoded correctly
+	utf8bom := []byte{0xEF, 0xBB, 0xBF}
+	_, encodingErr := file.Write(utf8bom)
+	checkErr(encodingErr)
+
+	w := csv.NewWriter(file)
+	defer w.Flush() // defer: flushes buffered rows when writeJobs returns
+
+	headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"}
+
+	writeErr := w.Write(headers)
+	checkErr(writeErr)
+
+	for _, job := range jobs { // _ is the index, job is the element value
+		jobSlice := []string{
+			"https://kr.indeed.com/viewjob?jk=" + job.id,
+			job.title,
+			job.companyName,
+			job.location}
+		jobWriteErr := w.Write(jobSlice)
+		checkErr(jobWriteErr)
+	}
+}
+
+func checkErr(err error) { // error handling: abort on failure
+	if err != nil {
+		log.Fatalln(err)
+	}
+}
+
+func checkHttpStatus(res *http.Response) {
+	if res.StatusCode != 200 {
+		log.Fatalln("Request failed with Status:", res.StatusCode)
+	}
+}
+
+// e.g. "  a:  b:  c: " -> ["a:", "b:", "c:"] -> "a: b: c:" (result of Join)
+func CleanString(str string) string {
+	return strings.Join(strings.Fields(strings.TrimSpace(str)), " ")
+}
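The concurrency shape in Scrape and getPage is a two-level fan-out/fan-in: one goroutine per page, each of which fans out again with one goroutine per job card, and each level collects results by receiving a known count from its channel. Reduced to a self-contained sketch (ints standing in for extractedJob values; not part of the PR):

```go
// Self-contained sketch of the fan-out/fan-in pattern used above.
package main

import "fmt"

// worker plays the role of getPage: it produces one batch per "page".
func worker(page int, out chan<- []int) {
	out <- []int{page * 10, page*10 + 1}
}

func main() {
	const totalPages = 3
	out := make(chan []int)

	// Fan out: one goroutine per page, all sending to the same channel.
	for i := 0; i < totalPages; i++ {
		go worker(i, out)
	}

	// Fan in: receive exactly totalPages batches, so every sender unblocks
	// and no result is dropped, regardless of completion order.
	var all []int
	for i := 0; i < totalPages; i++ {
		batch := <-out
		all = append(all, batch...)
	}
	fmt.Println("collected:", all)
}
```

Because the receive count equals the number of goroutines launched, unbuffered channels suffice here and no sync.WaitGroup is needed.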