diff --git a/sewon/home.html b/sewon/home.html
new file mode 100644
index 0000000..94c50a2
--- /dev/null
+++ b/sewon/home.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Go Jobs</title>
+</head>
+<body>
+<h1>Go Jobs</h1>
+<h3>Indeed.com scrapper</h3>
+
+<form method="POST" action="/scrape">
+    <input placeholder="What job do you want?" name="term"/>
+    <button>Search</button>
+</form>
+</body>
+</html>
\ No newline at end of file
diff --git a/sewon/main.go b/sewon/main.go
index 90c6a8a..6ba05ec 100644
--- a/sewon/main.go
+++ b/sewon/main.go
@@ -1,42 +1,28 @@
-package main // declares the package this file belongs to; required for compiling main.go
+package main
+
import (
- "errors"
- "fmt"
- "net/http"
+ "github.com/labstack/echo"
+ "github.com/sw-develop/learngo/golang_study/sewon/scrapper"
+ "os"
+ "strings"
)
-var errRequestFailed = errors.New("request failed")
+const fileName string = "jobs.csv"
-func main() {
- var results = make(map[string]string) // create and initialize a map; make() is a built-in func
- urls := []string{
- "https://www.airbnb.com/",
- "https://www.google.com/",
- "https://www.amazon.com/",
- "https://www.reddit.com/",
- "https://www.google.com/",
- "https://soundcloud.com/",
- "https://www.facebook.com/",
- "https://www.instagram.com/",
- }
- for _, url := range urls {
- result := "OK"
- err := hitURL(url)
- if err != nil {
- result = "FAILED"
- }
- results[url] = result
- }
- for url, result := range results {
- fmt.Println(url, result)
- }
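+// handleHome serves home.html, the search page.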
+func handleHome(c echo.Context) error {
+ return c.File("home.html")
}
-func hitURL(url string) error {
- fmt.Println("Checking: ", url)
- res, err := http.Get(url)
- if err != nil || res.StatusCode >= 400 {
- return errRequestFailed
- }
- return nil
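+// handleScrape scrapes the submitted search term and sends jobs.csv as a
+// download; the file is removed once the handler returns.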
+func handleScrape(c echo.Context) error {
+ defer os.Remove(fileName)
+ term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
+ scrapper.Scrape(term) // run the scraper
+ return c.Attachment(fileName, fileName)
+}
+
+func main() {
+ e := echo.New()
+ e.GET("/", handleHome)
+ e.POST("/scrape", handleScrape)
+ e.Logger.Fatal(e.Start(":1323"))
}
diff --git a/sewon/scrapper/scrapper.go b/sewon/scrapper/scrapper.go
new file mode 100644
index 0000000..fd2cb66
--- /dev/null
+++ b/sewon/scrapper/scrapper.go
@@ -0,0 +1,166 @@
+package scrapper
+
+import (
+ "encoding/csv"
+ "fmt"
+ "github.com/PuerkitoBio/goquery"
+ "io"
+ "log"
+ "net/http"
+ "os"
+ "strconv"
+ "strings"
+ "time"
+)
+
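+// extractedJob holds the fields scraped from a single Indeed job card.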
+type extractedJob struct {
+ id string
+ title string
+ companyName string
+ location string
+}
+
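+// timeTrack logs the elapsed time since start; call it with defer to time a
+// function.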
+func timeTrack(start time.Time, name string) {
+ elapsed := time.Since(start)
+ log.Printf("%s took %s", name, elapsed)
+}
+
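+// Scrape fans out one goroutine per result page, gathers the extracted jobs
+// from mainChannel, and writes them all to jobs.csv.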
+func Scrape(term string) {
+ defer timeTrack(time.Now(), "JobScrapper")
+
+ var baseURL = "https://kr.indeed.com/jobs?q=" + term + "&limit=50"
+
+ var jobs []extractedJob
+ mainChannel := make(chan []extractedJob)
+ totalPages := getPages(baseURL)
+
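+ // fan-out: one goroutine per page, all sending into mainChannel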
+ for i := 0; i < totalPages; i++ {
+ go getPage(i, baseURL, mainChannel)
+ }
+
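+ // fan-in: receive one []extractedJob per page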
+ for i := 0; i < totalPages; i++ {
+ extractedJob := <-mainChannel
+ jobs = append(jobs, extractedJob...)
+ }
+
+ writeJobs(jobs)
+ fmt.Println("Done, extracted : ", len(jobs))
+}
+
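+// getPage fetches one paginated result page (50 postings per page) and sends
+// the jobs it extracts to mainChannel.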
+func getPage(pageNum int, baseURL string, mainChannel chan<- []extractedJob) {
+ var jobs []extractedJob
+
+ channel := make(chan extractedJob) // create the channel for this page's jobs
+
+ pageURL := baseURL + "&start=" + strconv.Itoa(pageNum*50)
+ fmt.Println("Requesting: ", pageURL)
+
+ res, err := http.Get(pageURL)
+ checkErr(err)
+ checkHTTPStatus(res)
+
+ defer func(Body io.ReadCloser) { // close the body; log a failure instead of silently ignoring it
+ err := Body.Close()
+ if err != nil {
+ log.Println("failed to close response body:", err)
+ }
+ }(res.Body)
+
+ doc, err := goquery.NewDocumentFromReader(res.Body)
+ checkErr(err)
+
+ searchJobs := doc.Find(".tapItem")
+
+ searchJobs.Each(func(i int, selection *goquery.Selection) {
+ go extractJob(selection, channel) // extract each job card in its own goroutine
+ })
+
+ for i := 0; i < searchJobs.Length(); i++ {
+ job := <-channel
+ jobs = append(jobs, job)
+ }
+
+ mainChannel <- jobs
+}
+
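+// extractJob pulls id, title, company name and location out of one job card
+// and sends the result down the send-only channel.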
+func extractJob(selection *goquery.Selection, channel chan<- extractedJob) {
+ id, _ := selection.Attr("data-jk")
+ title := CleanString(selection.Find("h2>span").Text())
+ companyName := CleanString(selection.Find(".companyName").Text())
+ location := CleanString(selection.Find(".companyLocation").Text())
+
+ channel <- extractedJob{ // send to channel
+ id: id,
+ title: title,
+ companyName: companyName,
+ location: location}
+}
+
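+// getPages counts the links in the pagination bar to find out how many result
+// pages exist for the search term.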
+func getPages(baseURL string) int {
+ pages := 0
+ res, err := http.Get(baseURL)
+ checkErr(err)
+ checkHTTPStatus(res)
+
+ defer func(Body io.ReadCloser) {
+ err := Body.Close()
+ if err != nil {
+ log.Println("failed to close response body:", err)
+ }
+ }(res.Body)
+
+ // Load the HTML document
+ doc, err := goquery.NewDocumentFromReader(res.Body)
+ checkErr(err)
+
+ doc.Find(".pagination").Each(func(i int, selection *goquery.Selection) {
+ // For each item found, count pages
+ pages = selection.Find("a").Length() // count the <a> tags, one per page link
+ })
+
+ return pages
+}
+
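+// writeJobs writes the extracted jobs to jobs.csv, one row per posting.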
+func writeJobs(jobs []extractedJob) {
+ file, err := os.Create("jobs.csv")
+ checkErr(err)
+
+ // write a UTF-8 BOM first so Excel displays the Korean text correctly
+ utf8bom := []byte{0xEF, 0xBB, 0xBF}
+ _, encodingErr := file.Write(utf8bom)
+ checkErr(encodingErr)
+
+ w := csv.NewWriter(file)
+ defer w.Flush() // defer : 지연실행
+
+ headers := []string{"LINK", "TITLE", "COMPANY_NAME", "LOCATION"}
+
+ writeErr := w.Write(headers)
+ checkErr(writeErr)
+
+ for _, job := range jobs { // _ is the index, job is the element
+ jobSlice := []string{
+ "https://kr.indeed.com/viewjob?jk=" + job.id,
+ job.title,
+ job.companyName,
+ job.location}
+ jobWriteErr := w.Write(jobSlice)
+ checkErr(jobWriteErr)
+ }
+}
+
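+// checkErr aborts the program if err is non-nil.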
+func checkErr(err error) {
+ if err != nil {
+ log.Fatalln(err)
+ }
+}
+
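+// checkHTTPStatus aborts on any non-200 response.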
+func checkHTTPStatus(res *http.Response) {
+ if res.StatusCode != 200 {
+ log.Fatalln("Request failed with Status:", res.StatusCode)
+ }
+}
+
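+// CleanString collapses every run of whitespace into a single space and trims
+// the ends.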
+func CleanString(str string) string { // e.g. "  a:  b:  c:  " -> ["a:", "b:", "c:"] -> "a: b: c:"
+ return strings.Join(strings.Fields(strings.TrimSpace(str)), " ")
+}