From 8ed8c4bd23ba47bdbf68ffeebbc91f68a516b4d0 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Mon, 11 Oct 2021 22:32:30 +0900 Subject: [PATCH 01/13] =?UTF-8?q?[=EC=88=98=EC=A0=95]=20goroutine=EA=B3=BC?= =?UTF-8?q?=20channel=EB=A5=BC=20=EC=82=AC=EC=9A=A9=ED=95=B4=20http=20requ?= =?UTF-8?q?est=20=EB=B3=B4=EB=82=B4=EA=B3=A0=20=EC=9D=91=EB=8B=B5=20?= =?UTF-8?q?=EC=B6=9C=EB=A0=A5=ED=95=98=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 90c6a8a..2c19654 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -1,14 +1,19 @@ -package main // 어떤 패키지를 사용하는지 명시해줌, main.go 파일의 경우 컴파일을 위해서 필요한 것임(필수) +package main + import ( - "errors" "fmt" "net/http" ) -var errRequestFailed = errors.New("request failed") +type requestResult struct { // http 요청 값을 저장할 구조체 + url string + status string +} func main() { - var results = make(map[string]string) // map 생성 및 초기화, make() : built-in func + results := make(map[string]string) + channel := make(chan requestResult) + urls := []string{ "https://www.airbnb.com/", "https://www.google.com/", @@ -20,23 +25,25 @@ func main() { "https://www.instagram.com/", } for _, url := range urls { - result := "OK" - err := hitURL(url) - if err != nil { - result = "FAILED" - } - results[url] = result + go hitURL(url, channel) // using goroutine } - for url, result := range results { - fmt.Println(url, result) + + for i := 0; i < len(urls); i++ { + result := <-channel + results[result.url] = result.status } + + for url, status := range results { + fmt.Println(url, status) + } + } -func hitURL(url string) error { - fmt.Println("Checking: ", url) +func hitURL(url string, channel chan<- requestResult) { // chan<- : Send Only res, err := http.Get(url) + status := "OK" if err != nil || res.StatusCode >= 400 { - return errRequestFailed + status = "FAILED" } - return nil + channel <- requestResult{url: url, status: status} } From bd2b870bca3b13779e3925e68f7bdfbd4cd496f2 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 01:01:52 +0900 Subject: [PATCH 02/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20goquery=EB=A5=BC?= =?UTF-8?q?=20=EC=82=AC=EC=9A=A9=ED=95=B4=20pagination=EB=90=9C=20?= =?UTF-8?q?=EA=B0=81=20=ED=8E=98=EC=9D=B4=EC=A7=80=EC=9D=98=20job=EB=93=A4?= =?UTF-8?q?=EC=9D=98=20id,=20title,=20location=20=EC=A0=95=EB=B3=B4=20?= =?UTF-8?q?=EA=B0=80=EC=A0=B8=EC=98=A4=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 106 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 2c19654..6232a0c 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -2,48 +2,92 @@ package main import ( "fmt" + "github.com/PuerkitoBio/goquery" + "io" + "log" "net/http" + "strconv" ) -type requestResult struct { // http 요청 값을 저장할 구조체 - url string - status string +type extractedJob struct { + id string + title string + location string + salary string + summary string } +var baseURL string = "https://kr.indeed.com/jobs?q=python&limit=50" + func main() { - results := make(map[string]string) - channel := make(chan requestResult) - - urls := []string{ - "https://www.airbnb.com/", - "https://www.google.com/", - "https://www.amazon.com/", - "https://www.reddit.com/", - "https://www.google.com/", - "https://soundcloud.com/", - "https://www.facebook.com/", - "https://www.instagram.com/", - } - for _, url := range urls { - go hitURL(url, channel) // using goroutine - } + totalPages := getPages() - for i := 0; i < len(urls); i++ { - result := <-channel - results[result.url] = result.status + for i := 0; i < totalPages; i++ { + getPage(i) } +} - for url, status := range results { - fmt.Println(url, status) - } +func getPage(pageNum int) { // pagination된 페이지 호출 + pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50) + fmt.Println("Requesting: ", pageURL) + + res, err := http.Get(pageURL) + checkErr(err) + checkHttpStatus(res) + + defer func(Body io.ReadCloser) { // error handling for io.ReadCloser + err := Body.Close() + if err != nil { + + } + }(res.Body) + + doc, err := goquery.NewDocumentFromReader(res.Body) + checkErr(err) + + searchJobs := doc.Find(".tapItem") + searchJobs.Each(func(i int, selection *goquery.Selection) { + id, _ := selection.Attr("data-jk") + title := selection.Find("h2>span").Text() + location := selection.Find(".companyLocation").Text() + fmt.Println(id, title, location) + }) +} + +func getPages() int { // pagination 수 반환 + pages := 0 + res, err := http.Get(baseURL) + checkErr(err) + checkHttpStatus(res) + + defer func(Body io.ReadCloser) { + err := Body.Close() + if err != nil { + + } + }(res.Body) + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + checkErr(err) + + doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) { + // For each item found, count pages + pages = selection.Find("a").Length() // 태그 찾기 + }) + + return pages +} + +func checkErr(err error) { // 에러 처리 + if err != nil { + log.Fatalln(err) + } } -func hitURL(url string, channel chan<- requestResult) { // chan<- : Send Only - res, err := http.Get(url) - status := "OK" - if err != nil || res.StatusCode >= 400 { - status = "FAILED" +func checkHttpStatus(res *http.Response) { + if res.StatusCode != 200 { + log.Fatalln("Request failed with Status:", res.StatusCode) } - channel <- requestResult{url: url, status: status} } From 8e348dbe054476f8d79ed761abf1f9c1ce8a3a0c Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 01:36:44 +0900 Subject: [PATCH 03/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20strings=20?= =?UTF-8?q?=EB=9D=BC=EC=9D=B4=EB=B8=8C=EB=9F=AC=EB=A6=AC=EC=9D=98=20Join,?= =?UTF-8?q?=20Fields,=20TrimSpace=20=EB=A9=94=EC=84=9C=EB=93=9C=EB=A5=BC?= =?UTF-8?q?=20=ED=99=9C=EC=9A=A9=ED=95=B4=20=EA=B3=B5=EB=B0=B1=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0=20/=20=EA=B5=AC=EC=A1=B0=EC=B2=B4=20=EB=B0=B0?= =?UTF-8?q?=EC=97=B4=EC=97=90=20=EC=9B=90=EC=86=8C=20=EB=8B=B4=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 6232a0c..259ed12 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -7,27 +7,32 @@ import ( "log" "net/http" "strconv" + "strings" ) type extractedJob struct { id string title string location string - salary string - summary string } var baseURL string = "https://kr.indeed.com/jobs?q=python&limit=50" func main() { + var jobs []extractedJob totalPages := getPages() for i := 0; i < totalPages; i++ { - getPage(i) + extractedJobs := getPage(i) + jobs = append(jobs, extractedJobs...) } + + fmt.Println(jobs) } -func getPage(pageNum int) { // pagination된 페이지 호출 +func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 + var jobs []extractedJob + pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50) fmt.Println("Requesting: ", pageURL) @@ -48,11 +53,22 @@ func getPage(pageNum int) { // pagination된 페이지 호출 searchJobs := doc.Find(".tapItem") searchJobs.Each(func(i int, selection *goquery.Selection) { - id, _ := selection.Attr("data-jk") - title := selection.Find("h2>span").Text() - location := selection.Find(".companyLocation").Text() - fmt.Println(id, title, location) + job := extractJob(selection) + jobs = append(jobs, job) // slice }) + + return jobs +} + +func extractJob(selection *goquery.Selection) extractedJob { + id, _ := selection.Attr("data-jk") + title := cleanString(selection.Find("h2>span").Text()) + location := cleanString(selection.Find(".companyLocation").Text()) + + return extractedJob{ + id: id, + title: title, + location: location} } func getPages() int { // pagination 수 반환 @@ -91,3 +107,7 @@ func checkHttpStatus(res *http.Response) { log.Fatalln("Request failed with Status:", res.StatusCode) } } + +func cleanString(str string) string { // ex) a: b: c: -> "a:", "b:", "c:" -> a: b: c: (Join()의 결과) + return strings.Join(strings.Fields(strings.TrimSpace(str)), " ") +} From 004eebe53bf23476691dfc5e6b9b727e25861a53 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 11:45:35 +0900 Subject: [PATCH 04/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20encoding/csv=20pac?= =?UTF-8?q?kage=EB=A5=BC=20=EC=82=AC=EC=9A=A9=ED=95=B4=20csv=ED=8C=8C?= =?UTF-8?q?=EC=9D=BC=EC=97=90=20=EC=93=B0=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 259ed12..224ffa2 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -1,11 +1,13 @@ package main import ( + "encoding/csv" "fmt" "github.com/PuerkitoBio/goquery" "io" "log" "net/http" + "os" "strconv" "strings" ) @@ -16,7 +18,7 @@ type extractedJob struct { location string } -var baseURL string = "https://kr.indeed.com/jobs?q=python&limit=50" +var baseURL = "https://kr.indeed.com/jobs?q=python&limit=50" func main() { var jobs []extractedJob @@ -27,7 +29,21 @@ func main() { jobs = append(jobs, extractedJobs...) } - fmt.Println(jobs) + writeJobs(jobs) + fmt.Println("Done, extracted : ", len(jobs)) +} + +func writeJobs(jobs []extractedJob) { + file, err := os.Create("jobs.csv") + checkErr(err) + + w := csv.NewWriter(file) + defer w.Flush() + + headers := []string{"ID", "TITLE", "LOCATION"} + + writeErr := w.Write(headers) + checkErr(writeErr) } func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 From b8ba2bf3dd1daf8141aab39dace6241b1d9f46b9 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 12:02:13 +0900 Subject: [PATCH 05/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20=ED=95=9C=EA=B8=80?= =?UTF-8?q?=20=EC=9D=B8=EC=BD=94=EB=94=A9,=20=EC=8A=A4=ED=81=AC=EB=9E=A9?= =?UTF-8?q?=ED=95=B4=EC=98=A8=20job=EB=93=A4=EC=9D=84=20csv=20=ED=8C=8C?= =?UTF-8?q?=EC=9D=BC=EC=97=90=20=EC=93=B0=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sewon/main.go b/sewon/main.go index 224ffa2..3509ac7 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -37,13 +37,27 @@ func writeJobs(jobs []extractedJob) { file, err := os.Create("jobs.csv") checkErr(err) + // 한글 인코딩 + utf8bom := []byte{0xEF, 0xBB, 0xBF} + _, encodingErr := file.Write(utf8bom) + checkErr(encodingErr) + w := csv.NewWriter(file) - defer w.Flush() + defer w.Flush() // defer : 지연실행 headers := []string{"ID", "TITLE", "LOCATION"} writeErr := w.Write(headers) checkErr(writeErr) + + for _, job := range jobs { // _ : index, job : 요소 값 + jobSlice := []string{ + "https://kr.indeed.com/viewjob?jk=" + job.id, + job.title, + job.location} + jobWriteErr := w.Write(jobSlice) + checkErr(jobWriteErr) + } } func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 From 38b23d05dcc0cd58bae2917c222f906ca88a57dc Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 12:15:50 +0900 Subject: [PATCH 06/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20companyName=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 3509ac7..16ab0b7 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -13,9 +13,10 @@ import ( ) type extractedJob struct { - id string - title string - location string + id string + title string + companyName string + location string } var baseURL = "https://kr.indeed.com/jobs?q=python&limit=50" @@ -45,7 +46,7 @@ func writeJobs(jobs []extractedJob) { w := csv.NewWriter(file) defer w.Flush() // defer : 지연실행 - headers := []string{"ID", "TITLE", "LOCATION"} + headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"} writeErr := w.Write(headers) checkErr(writeErr) @@ -54,6 +55,7 @@ func writeJobs(jobs []extractedJob) { jobSlice := []string{ "https://kr.indeed.com/viewjob?jk=" + job.id, job.title, + job.companyName, job.location} jobWriteErr := w.Write(jobSlice) checkErr(jobWriteErr) @@ -93,12 +95,14 @@ func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 func extractJob(selection *goquery.Selection) extractedJob { id, _ := selection.Attr("data-jk") title := cleanString(selection.Find("h2>span").Text()) + companyName := cleanString(selection.Find(".companyName").Text()) location := cleanString(selection.Find(".companyLocation").Text()) return extractedJob{ - id: id, - title: title, - location: location} + id: id, + title: title, + companyName: companyName, + location: location} } func getPages() int { // pagination 수 반환 From c3edfdc12627735380040ca4294ce5c3a9cc8248 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 14:45:24 +0900 Subject: [PATCH 07/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20execution=20time?= =?UTF-8?q?=20=EC=B8=A1=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sewon/main.go b/sewon/main.go index 16ab0b7..1db674a 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -10,6 +10,7 @@ import ( "os" "strconv" "strings" + "time" ) type extractedJob struct { @@ -21,7 +22,14 @@ type extractedJob struct { var baseURL = "https://kr.indeed.com/jobs?q=python&limit=50" +func timeTrack(start time.Time, name string) { + elapsed := time.Since(start) + log.Printf("%s took %s", name, elapsed) +} + func main() { + defer timeTrack(time.Now(), "JobScrapper") + var jobs []extractedJob totalPages := getPages() From 12137d0d33a1ed9980eb01bd35c9f0398fcc3942 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 14:59:55 +0900 Subject: [PATCH 08/13] =?UTF-8?q?[=EC=88=98=EC=A0=95]=20extractJob()?= =?UTF-8?q?=EC=97=90=20goroutine=20=EC=A0=81=EC=9A=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 1db674a..500715a 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -73,6 +73,8 @@ func writeJobs(jobs []extractedJob) { func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 var jobs []extractedJob + channel := make(chan extractedJob) // channel 생성 + pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50) fmt.Println("Requesting: ", pageURL) @@ -93,20 +95,24 @@ func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 searchJobs := doc.Find(".tapItem") searchJobs.Each(func(i int, selection *goquery.Selection) { - job := extractJob(selection) - jobs = append(jobs, job) // slice + go extractJob(selection, channel) // goroutine 적용 }) + for i := 0; i < searchJobs.Length(); i++ { + job := <-channel + jobs = append(jobs, job) + } + return jobs } -func extractJob(selection *goquery.Selection) extractedJob { +func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // <- : send only id, _ := selection.Attr("data-jk") title := cleanString(selection.Find("h2>span").Text()) companyName := cleanString(selection.Find(".companyName").Text()) location := cleanString(selection.Find(".companyLocation").Text()) - return extractedJob{ + channel <- extractedJob{ // send to channel id: id, title: title, companyName: companyName, From 346ea45cfb8a20672a9d433c605a506d3bbaec9e Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 15:34:33 +0900 Subject: [PATCH 09/13] =?UTF-8?q?[=EC=88=98=EC=A0=95]=20getPage()=EC=97=90?= =?UTF-8?q?=20goroutine=20=EC=A0=81=EC=9A=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 69 +++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 500715a..02c2078 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -31,46 +31,23 @@ func main() { defer timeTrack(time.Now(), "JobScrapper") var jobs []extractedJob + mainChannel := make(chan []extractedJob) totalPages := getPages() for i := 0; i < totalPages; i++ { - extractedJobs := getPage(i) - jobs = append(jobs, extractedJobs...) + go getPage(i, mainChannel) + } + + for i := 0; i < totalPages; i++ { + extractedJob := <-mainChannel + jobs = append(jobs, extractedJob...) } writeJobs(jobs) fmt.Println("Done, extracted : ", len(jobs)) } -func writeJobs(jobs []extractedJob) { - file, err := os.Create("jobs.csv") - checkErr(err) - - // 한글 인코딩 - utf8bom := []byte{0xEF, 0xBB, 0xBF} - _, encodingErr := file.Write(utf8bom) - checkErr(encodingErr) - - w := csv.NewWriter(file) - defer w.Flush() // defer : 지연실행 - - headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"} - - writeErr := w.Write(headers) - checkErr(writeErr) - - for _, job := range jobs { // _ : index, job : 요소 값 - jobSlice := []string{ - "https://kr.indeed.com/viewjob?jk=" + job.id, - job.title, - job.companyName, - job.location} - jobWriteErr := w.Write(jobSlice) - checkErr(jobWriteErr) - } -} - -func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 +func getPage(pageNum int, mainChannel chan<- []extractedJob) { // pagination된 페이지 호출 var jobs []extractedJob channel := make(chan extractedJob) // channel 생성 @@ -103,7 +80,7 @@ func getPage(pageNum int) []extractedJob { // pagination된 페이지 호출 jobs = append(jobs, job) } - return jobs + mainChannel <- jobs } func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // <- : send only @@ -144,6 +121,34 @@ func getPages() int { // pagination 수 반환 return pages } +func writeJobs(jobs []extractedJob) { + file, err := os.Create("jobs.csv") + checkErr(err) + + // 한글 인코딩 + utf8bom := []byte{0xEF, 0xBB, 0xBF} + _, encodingErr := file.Write(utf8bom) + checkErr(encodingErr) + + w := csv.NewWriter(file) + defer w.Flush() // defer : 지연실행 + + headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"} + + writeErr := w.Write(headers) + checkErr(writeErr) + + for _, job := range jobs { // _ : index, job : 요소 값 + jobSlice := []string{ + "https://kr.indeed.com/viewjob?jk=" + job.id, + job.title, + job.companyName, + job.location} + jobWriteErr := w.Write(jobSlice) + checkErr(jobWriteErr) + } +} + func checkErr(err error) { // 에러 처리 if err != nil { log.Fatalln(err) From b90a2fbab6c9d9a3817293c7a6f9fd7335b09dcd Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 16:16:38 +0900 Subject: [PATCH 10/13] =?UTF-8?q?[=EC=88=98=EC=A0=95]=20Job=20Scapper=20?= =?UTF-8?q?=ED=8C=A8=ED=82=A4=EC=A7=80=EB=A1=9C=20=EB=B9=BC=EB=91=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/scrapper/scrapper.go | 166 +++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 sewon/scrapper/scrapper.go diff --git a/sewon/scrapper/scrapper.go b/sewon/scrapper/scrapper.go new file mode 100644 index 0000000..7623794 --- /dev/null +++ b/sewon/scrapper/scrapper.go @@ -0,0 +1,166 @@ +package scrapper + +import ( + "encoding/csv" + "fmt" + "github.com/PuerkitoBio/goquery" + "io" + "log" + "net/http" + "os" + "strconv" + "strings" + "time" +) + +type extractedJob struct { + id string + title string + companyName string + location string +} + +func timeTrack(start time.Time, name string) { + elapsed := time.Since(start) + log.Printf("%s took %s", name, elapsed) +} + +func Scrape(term string) { + defer timeTrack(time.Now(), "JobScrapper") + + var baseURL = "https://kr.indeed.com/jobs?q=" + term + "&limit=50" + + var jobs []extractedJob + mainChannel := make(chan []extractedJob) + totalPages := getPages(baseURL) + + for i := 0; i < totalPages; i++ { + go getPage(i, baseURL, mainChannel) + } + + for i := 0; i < totalPages; i++ { + extractedJob := <-mainChannel + jobs = append(jobs, extractedJob...) + } + + writeJobs(jobs) + fmt.Println("Done, extracted : ", len(jobs)) +} + +func getPage(pageNum int, baseURL string, mainChannel chan<- []extractedJob) { // pagination된 페이지 호출 + var jobs []extractedJob + + channel := make(chan extractedJob) // channel 생성 + + pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50) + fmt.Println("Requesting: ", pageURL) + + res, err := http.Get(pageURL) + checkErr(err) + checkHttpStatus(res) + + defer func(Body io.ReadCloser) { // error handling for io.ReadCloser + err := Body.Close() + if err != nil { + + } + }(res.Body) + + doc, err := goquery.NewDocumentFromReader(res.Body) + checkErr(err) + + searchJobs := doc.Find(".tapItem") + + searchJobs.Each(func(i int, selection *goquery.Selection) { + go extractJob(selection, channel) // goroutine 적용 + }) + + for i := 0; i < searchJobs.Length(); i++ { + job := <-channel + jobs = append(jobs, job) + } + + mainChannel <- jobs +} + +func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // <- : send only + id, _ := selection.Attr("data-jk") + title := cleanString(selection.Find("h2>span").Text()) + companyName := cleanString(selection.Find(".companyName").Text()) + location := cleanString(selection.Find(".companyLocation").Text()) + + channel <- extractedJob{ // send to channel + id: id, + title: title, + companyName: companyName, + location: location} +} + +func getPages(baseURL string) int { // pagination 수 반환 + pages := 0 + res, err := http.Get(baseURL) + checkErr(err) + checkHttpStatus(res) + + defer func(Body io.ReadCloser) { + err := Body.Close() + if err != nil { + + } + }(res.Body) + + // Load the HTML document + doc, err := goquery.NewDocumentFromReader(res.Body) + checkErr(err) + + doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) { + // For each item found, count pages + pages = selection.Find("a").Length() // 태그 찾기 + }) + + return pages +} + +func writeJobs(jobs []extractedJob) { + file, err := os.Create("jobs.csv") + checkErr(err) + + // 한글 인코딩 + utf8bom := []byte{0xEF, 0xBB, 0xBF} + _, encodingErr := file.Write(utf8bom) + checkErr(encodingErr) + + w := csv.NewWriter(file) + defer w.Flush() // defer : 지연실행 + + headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"} + + writeErr := w.Write(headers) + checkErr(writeErr) + + for _, job := range jobs { // _ : index, job : 요소 값 + jobSlice := []string{ + "https://kr.indeed.com/viewjob?jk=" + job.id, + job.title, + job.companyName, + job.location} + jobWriteErr := w.Write(jobSlice) + checkErr(jobWriteErr) + } +} + +func checkErr(err error) { // 에러 처리 + if err != nil { + log.Fatalln(err) + } +} + +func checkHttpStatus(res *http.Response) { + if res.StatusCode != 200 { + log.Fatalln("Request failed with Status:", res.StatusCode) + } +} + +func cleanString(str string) string { // ex) a: b: c: -> "a:", "b:", "c:" -> a: b: c: (Join()의 결과) + return strings.Join(strings.Fields(strings.TrimSpace(str)), " ") +} From 14ad05854e3d5bc4e884580fce1cf487fee0967f Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 17:15:08 +0900 Subject: [PATCH 11/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20=EA=B2=80=EC=83=89?= =?UTF-8?q?=20=ED=8E=98=EC=9D=B4=EC=A7=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/home.html | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 sewon/home.html diff --git a/sewon/home.html b/sewon/home.html new file mode 100644 index 0000000..94c50a2 --- /dev/null +++ b/sewon/home.html @@ -0,0 +1,17 @@ + + + + + + + Go Jobs + + +

Go Jobs

+

Indeed.com scrapper

+
+ + +
+ + \ No newline at end of file From 067ef00e3c23ac718cfc720769a8c300c37d4059 Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 17:16:06 +0900 Subject: [PATCH 12/13] =?UTF-8?q?[=EC=B6=94=EA=B0=80]=20echo=20server=20GE?= =?UTF-8?q?T,=20POST?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/main.go | 168 +++++--------------------------------------------- 1 file changed, 15 insertions(+), 153 deletions(-) diff --git a/sewon/main.go b/sewon/main.go index 02c2078..6ba05ec 100644 --- a/sewon/main.go +++ b/sewon/main.go @@ -1,166 +1,28 @@ package main import ( - "encoding/csv" - "fmt" - "github.com/PuerkitoBio/goquery" - "io" - "log" - "net/http" + "github.com/labstack/echo" + "github.com/sw-develop/learngo/golang_study/sewon/scrapper" "os" - "strconv" "strings" - "time" ) -type extractedJob struct { - id string - title string - companyName string - location string -} - -var baseURL = "https://kr.indeed.com/jobs?q=python&limit=50" +const fileName string = "jobs.csv" -func timeTrack(start time.Time, name string) { - elapsed := time.Since(start) - log.Printf("%s took %s", name, elapsed) +func handleHome(c echo.Context) error { + return c.File("home.html") } -func main() { - defer timeTrack(time.Now(), "JobScrapper") - - var jobs []extractedJob - mainChannel := make(chan []extractedJob) - totalPages := getPages() - - for i := 0; i < totalPages; i++ { - go getPage(i, mainChannel) - } - - for i := 0; i < totalPages; i++ { - extractedJob := <-mainChannel - jobs = append(jobs, extractedJob...) - } - - writeJobs(jobs) - fmt.Println("Done, extracted : ", len(jobs)) +func handleScrape(c echo.Context) error { + defer os.Remove(fileName) + term := strings.ToLower(scrapper.CleanString(c.FormValue("term"))) + scrapper.Scrape(term) // Scrape() 실행 + return c.Attachment(fileName, fileName) } -func getPage(pageNum int, mainChannel chan<- []extractedJob) { // pagination된 페이지 호출 - var jobs []extractedJob - - channel := make(chan extractedJob) // channel 생성 - - pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50) - fmt.Println("Requesting: ", pageURL) - - res, err := http.Get(pageURL) - checkErr(err) - checkHttpStatus(res) - - defer func(Body io.ReadCloser) { // error handling for io.ReadCloser - err := Body.Close() - if err != nil { - - } - }(res.Body) - - doc, err := goquery.NewDocumentFromReader(res.Body) - checkErr(err) - - searchJobs := doc.Find(".tapItem") - - searchJobs.Each(func(i int, selection *goquery.Selection) { - go extractJob(selection, channel) // goroutine 적용 - }) - - for i := 0; i < searchJobs.Length(); i++ { - job := <-channel - jobs = append(jobs, job) - } - - mainChannel <- jobs -} - -func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // <- : send only - id, _ := selection.Attr("data-jk") - title := cleanString(selection.Find("h2>span").Text()) - companyName := cleanString(selection.Find(".companyName").Text()) - location := cleanString(selection.Find(".companyLocation").Text()) - - channel <- extractedJob{ // send to channel - id: id, - title: title, - companyName: companyName, - location: location} -} - -func getPages() int { // pagination 수 반환 - pages := 0 - res, err := http.Get(baseURL) - checkErr(err) - checkHttpStatus(res) - - defer func(Body io.ReadCloser) { - err := Body.Close() - if err != nil { - - } - }(res.Body) - - // Load the HTML document - doc, err := goquery.NewDocumentFromReader(res.Body) - checkErr(err) - - doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) { - // For each item found, count pages - pages = selection.Find("a").Length() //
태그 찾기 - }) - - return pages -} - -func writeJobs(jobs []extractedJob) { - file, err := os.Create("jobs.csv") - checkErr(err) - - // 한글 인코딩 - utf8bom := []byte{0xEF, 0xBB, 0xBF} - _, encodingErr := file.Write(utf8bom) - checkErr(encodingErr) - - w := csv.NewWriter(file) - defer w.Flush() // defer : 지연실행 - - headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"} - - writeErr := w.Write(headers) - checkErr(writeErr) - - for _, job := range jobs { // _ : index, job : 요소 값 - jobSlice := []string{ - "https://kr.indeed.com/viewjob?jk=" + job.id, - job.title, - job.companyName, - job.location} - jobWriteErr := w.Write(jobSlice) - checkErr(jobWriteErr) - } -} - -func checkErr(err error) { // 에러 처리 - if err != nil { - log.Fatalln(err) - } -} - -func checkHttpStatus(res *http.Response) { - if res.StatusCode != 200 { - log.Fatalln("Request failed with Status:", res.StatusCode) - } -} - -func cleanString(str string) string { // ex) a: b: c: -> "a:", "b:", "c:" -> a: b: c: (Join()의 결과) - return strings.Join(strings.Fields(strings.TrimSpace(str)), " ") +func main() { + e := echo.New() + e.GET("/", handleHome) + e.POST("/scrape", handleScrape) + e.Logger.Fatal(e.Start(":1323")) } From 05152c954bba607db9fd231fd0369c5a00f316bf Mon Sep 17 00:00:00 2001 From: sw-develop Date: Tue, 12 Oct 2021 17:16:38 +0900 Subject: [PATCH 13/13] =?UTF-8?q?[=EC=88=98=EC=A0=95]=20private=20method?= =?UTF-8?q?=20to=20public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sewon/scrapper/scrapper.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sewon/scrapper/scrapper.go b/sewon/scrapper/scrapper.go index 7623794..fd2cb66 100644 --- a/sewon/scrapper/scrapper.go +++ b/sewon/scrapper/scrapper.go @@ -85,9 +85,9 @@ func getPage(pageNum int, baseURL string, mainChannel chan<- []extractedJob) { / func extractJob(selection *goquery.Selection, channel chan<- extractedJob) { // <- : send only id, _ := selection.Attr("data-jk") - title := cleanString(selection.Find("h2>span").Text()) - companyName := cleanString(selection.Find(".companyName").Text()) - location := cleanString(selection.Find(".companyLocation").Text()) + title := CleanString(selection.Find("h2>span").Text()) + companyName := CleanString(selection.Find(".companyName").Text()) + location := CleanString(selection.Find(".companyLocation").Text()) channel <- extractedJob{ // send to channel id: id, @@ -161,6 +161,6 @@ func checkHttpStatus(res *http.Response) { } } -func cleanString(str string) string { // ex) a: b: c: -> "a:", "b:", "c:" -> a: b: c: (Join()의 결과) +func CleanString(str string) string { // ex) a: b: c: -> "a:", "b:", "c:" -> a: b: c: (Join()의 결과) return strings.Join(strings.Fields(strings.TrimSpace(str)), " ") }