# scraperBund.R
# Libraries ---------------------------------------------------------------
library(rvest) # for scraping
library(reshape2) # to turn list into data.frame object
# Fetch the first results page (100 hits per page); its pager text is used
# below to find out how many result pages there are.
url <- read_html(paste0(
  "http://www.bund.de/Content/DE/Stellen/Suche/Formular.",
  "html?nn=4642046&view=processForm&resultsPerPage=100"
))
# Get number of pages to navigate through ---------------------------------
# The first ".item-m p" node is expected to read "1 von <N>": drop the
# "1 von " prefix and parse what is left as the page count.
pager_text <- html_text(rvest::html_nodes(url, ".item-m p"), trim = TRUE)
pages <- suppressWarnings(as.numeric(gsub("1 von ", "", pager_text[1])))
# Fail early with a clear message instead of a cryptic error in later loops
# when the selector no longer matches or the text is not a number.
if (length(pages) != 1 || is.na(pages) || pages < 1) {
  stop(
    "Could not determine the number of result pages from the pager text: ",
    if (length(pager_text) > 0) pager_text[1] else "<no '.item-m p' node found>",
    call. = FALSE
  )
}
# Extract all urls --------------------------------------------------------
# Walk every result page and collect the link of each job posting.
# urls[[i]] is a character vector holding the links found on result page i.
urls <- vector("list", pages)
for (i in seq_len(pages)) {
  # progress
  cat("Processing page", i, "of", pages, "\n")
  # "gtp=4642266_list%253D<i>" selects result page i. The parameter was
  # restored from a mangled ">p=" (an HTML-unescaped "&gtp=").
  # NOTE(review): confirm the parameter name against the live site.
  page_url <- paste0(
    "http://www.bund.de/Content/DE/Stellen/",
    "Suche/Formular.html?nn=4642046&gtp=4642266_list%253D", i,
    "&view=processForm&resultsPerPage=100"
  )
  urls[[i]] <- read_html(page_url) %>%
    html_nodes(".result-list li") %>%
    # only anchors that are direct children of a result item
    html_nodes(xpath = "./a") %>%
    html_attr("href") %>%
    # prefix the host onto hrefs containing "IMPORTE"
    gsub("IMPORTE", "www.bund.de/IMPORTE", .)
}
# Extract Content ---------------------------------------------------------

# Scrape a single job posting.
#
# job_url: address of the job page, as collected in `urls`.
# Returns a one-row data.frame with one column per <dt>/<dd> pair on the page
# plus the columns `title` and `description`.
scrape_job <- function(job_url) {
  # Load the page once and reuse it for every extraction below (previously
  # each field triggered its own pair of requests). The jump_to("") step is
  # kept from the original request sequence.
  page <- html_session(job_url) %>%
    jump_to("")

  # key info: <dt> nodes hold the field labels, <dd> nodes the values
  fields <- page %>%
    html_nodes("dt") %>%
    html_text(trim = TRUE)
  # NOTE(review): the last gsub strips every space, including the ones just
  # substituted for newlines - possibly meant for a non-breaking or doubled
  # space; confirm the intended character.
  content <- page %>%
    html_nodes("dd") %>%
    html_text(trim = TRUE) %>%
    gsub("\n", " ", .) %>%
    gsub("\t", "", .) %>%
    gsub("Karte anschauen", "", .) %>%
    gsub(" ", "", .)
  df <- data.frame(t(content), stringsAsFactors = FALSE)
  names(df) <- fields

  # Jobtitle: try the <h1> directly following an <img>; when that matches
  # nothing, fall back to the <h1> directly following an <a>.
  title <- page %>%
    html_nodes("img+ h1") %>%
    html_text(trim = TRUE)
  if (length(title) == 0) {
    title <- page %>%
      html_nodes("a+ h1") %>%
      html_text(trim = TRUE) %>%
      gsub("\n", "", .)
  }
  # A one-row data.frame needs exactly one value: use the first match, or NA
  # when neither selector found a heading.
  df$title <- if (length(title) > 0) title[1] else NA_character_

  # job description: all paragraphs joined into one newline-separated string
  df$description <- page %>%
    html_nodes(".text section p") %>%
    html_text(trim = TRUE) %>%
    gsub("\t", "", .) %>%
    paste(collapse = "\n")

  df
}

# jobs is a named list: job url -> one-row data.frame for that posting.
jobs <- list()
for (i in seq_along(urls)) {
  # progress
  cat("Processing page", i, "of", length(urls), "\n")
  page_urls <- urls[[i]]
  for (j in seq_along(page_urls)) {
    # progress
    cat("Processing url", j, "of", length(page_urls), "\n")
    job_url <- page_urls[j]
    # A single broken page should not abort the whole run: report it and
    # carry on with the next posting.
    job <- tryCatch(
      scrape_job(job_url),
      error = function(e) {
        warning("Skipping ", job_url, ": ", conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    # add df to biglist
    if (!is.null(job)) {
      jobs[[job_url]] <- job
    }
  }
}
# Convert list into data.frame --------------------------------------------
# Hand the list of per-job data frames to reshape2's melt() to obtain a
# single data.frame.
jobsDf <- reshape2::melt(data = jobs)
# Save as csv -------------------------------------------------------------
# Written to the current working directory with write.csv() defaults.
utils::write.csv(x = jobsDf, file = "jobsBund.csv")