-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.cpp
234 lines (208 loc) · 7.42 KB
/
crawl.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
//============================================================================
// Name : OctaneCrawler.cpp
// Author : Berlin Brown (berlin dot brown at gmail.com)
// Version :
// Copyright : Copyright Berlin Brown 2012-2013
// License : BSD
// Description : This is the simplest possible web crawler in C++
// Uses boost_regex and boost_algorithm
//============================================================================
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <time.h>
#include <unistd.h>

#include <cctype>
#include <cstdarg>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <typeinfo>

#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>

using namespace std;
using namespace boost;
// Number of seconds passed to sleep() before each follow-up request in connect().
const int DELAY = 12;
// Size, in bytes, of the buffer handed to each recv() call in connect().
const int MAXRECV = 140 * 1024;
// Directory under which connect() writes every response it receives.
const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";
// Splits an href found in a crawled document into the host to contact and
// the page to request from it.
//
// Only absolute "http://" URLs (scheme matched case-insensitively) are
// recognised as carrying their own host; every other href is treated as a
// page on the host of the document it was found in.
class WebPage {
public:
    // Lower-cased host taken from the href, or the fallback host supplied by the caller.
    std::string hostname;
    // Page to request. For absolute URLs this is the text after the first '/'
    // that follows the host (no leading slash); parse() stores "/" when it is empty.
    std::string page;

    WebPage() : hostname(), page() {
    }

    // Returns the lower-cased host of an absolute "http://" URL, or an empty
    // string when `str` is not such a URL (or names no host).
    std::string parseHttp(const std::string str) {
        std::string host;
        std::string rest;
        if (splitHttpUrl(str, host, rest)) {
            return host;
        }
        return "";
    } // End of method //

    // Fills hostname/page from `str` when it is an absolute "http://" URL.
    // Otherwise falls back to `orig_host` with an empty page.
    void parseHref(const std::string orig_host, const std::string str) {
        std::string host;
        std::string rest;
        if (splitHttpUrl(str, host, rest)) {
            // Full URL: host up to the first '/', page is everything after it.
            hostname = host;
            page = rest;
        } else {
            // No usable URL: keep the caller-supplied host, no page.
            hostname = orig_host;
            page = "";
        } // End of the if - else //
    } // End of method //

    // Parses `hrf` as found in a document served by `orig_host`.
    // Absolute URLs replace the host; anything else is requested from
    // `orig_host` verbatim. An empty page is normalised to "/".
    void parse(const std::string orig_host, const std::string hrf) {
        const std::string hst = parseHttp(hrf);
        if (!hst.empty()) {
            parseHref(hst, hrf);
        } else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        } // End of the if //
    } // End of the method

private:
    // Splits "http://<host>[/<rest>]" into a lower-cased host and the text
    // after the first '/' following the host. Returns false (leaving the
    // outputs untouched) when the prefix is missing or the host is empty.
    // The host ends at the FIRST slash, so path segments never leak into it.
    static bool splitHttpUrl(const std::string& url, std::string& host, std::string& rest) {
        static const char prefix[] = "http://";
        const std::size_t prefix_len = sizeof(prefix) - 1;
        if (url.size() <= prefix_len) {
            return false;
        }
        for (std::size_t i = 0; i < prefix_len; ++i) {
            if (std::tolower(static_cast<unsigned char>(url[i])) != prefix[i]) {
                return false;
            }
        }
        const std::size_t slash = url.find('/', prefix_len);
        const std::size_t host_end = (slash == std::string::npos) ? url.size() : slash;
        if (host_end == prefix_len) {
            return false; // "http:///..." names no host
        }
        host.assign(url, prefix_len, host_end - prefix_len);
        for (std::size_t i = 0; i < host.size(); ++i) {
            host[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(host[i])));
        }
        if (slash == std::string::npos) {
            rest.clear();
        } else {
            rest.assign(url, slash + 1, std::string::npos);
        }
        return true;
    }
}; // End of the class
// printf-style formatting into a std::string.
//
// `fmt` is a printf format string; the variadic arguments must match its
// conversion specifiers. Returns the formatted text, or an empty string when
// vsnprintf reports a formatting error.
//
// `fmt` is taken by value because va_start on a reference parameter is
// undefined behaviour.
std::string string_format(const std::string fmt, ...) {
    std::string str;
    std::size_t size = 256;
    for (;;) {
        str.resize(size);
        va_list ap;
        va_start(ap, fmt);
        // Write into the string's own storage rather than through a
        // cast-away-const c_str().
        const int n = vsnprintf(&str[0], size, fmt.c_str(), ap);
        va_end(ap);
        if (n < 0) {
            // Formatting error: retrying with a larger buffer cannot succeed.
            return std::string();
        }
        if (static_cast<std::size_t>(n) < size) {
            str.resize(static_cast<std::size_t>(n));
            return str;
        }
        // Output was truncated; n is the length needed, excluding the NUL.
        size = static_cast<std::size_t>(n) + 1;
    } // End of the loop //
} // End of the function //
std::string request(std::string host, std::string path) {
std::string request = "GET ";
request.append(path);
request.append(" HTTP/1.1\r\n");
request.append("Host: ");
request.append(host);
request.append("\r\n");
request.append("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n");
request.append("User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n");
request.append("Connection: close\r\n");
request.append("\r\n");
return request;
} // End of the function //
// Turns "<host>/<path>" into a string that is safe to use as a file name:
// every character other than an ASCII letter or digit becomes '_'.
// The result is also echoed to standard output before it is returned.
//
// A plain scan is used so no regular expression is compiled on each call.
std::string clean_href(const std::string& host, const std::string& path) {
    std::string cleaned = host;
    cleaned += "/";
    cleaned += path;
    for (std::size_t i = 0; i < cleaned.size(); ++i) {
        const char c = cleaned[i];
        const bool keep = (c >= 'a' && c <= 'z')
                       || (c >= 'A' && c <= 'Z')
                       || (c >= '0' && c <= '9');
        if (!keep) {
            cleaned[i] = '_';
        }
    }
    std::cout << cleaned << std::endl;
    return cleaned;
}
int connect(const std::string host, const std::string path) {
const int port = 80;
// Setup the msock
int m_sock;
sockaddr_in m_addr;
memset(&m_addr, 0, sizeof(m_addr));
m_sock = socket(AF_INET, SOCK_STREAM, 0);
int on = 1;
if (setsockopt(m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*) &on, sizeof(on)) == -1) {
return false;
}
// Connect //
m_addr.sin_family = AF_INET;
m_addr.sin_port = htons(port);
int status = inet_pton(AF_INET, host.c_str(), &m_addr.sin_addr);
if (errno == EAFNOSUPPORT) {
return false;
}
status = ::connect(m_sock, (sockaddr *) &m_addr, sizeof(m_addr));
// HTTP/1.1 defines the "close" connection option for
// the sender to signal that the connection will be closed
// after completion of the response.
std::string req = request(host, path);
// End of building the request //
status = ::send(m_sock, req.c_str(), req.size(), MSG_NOSIGNAL);
char buf[MAXRECV];
cout << "Request: " << req << endl;
cout << "=========================" << endl;
std::string recv = "";
while (status != 0) {
memset(buf, 0, MAXRECV);
status = ::recv(m_sock, buf, MAXRECV, 0);
recv.append(buf);
} // End of the while //
cout << "Response:" << recv << endl;
cout << "---------------------------" << endl;
// Attempt to write to file //
const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str());
cout << "Writing to file : " << html_file_write << endl;
ofstream outfile(html_file_write.c_str());
outfile << recv << endl;
outfile.close();
// Parse the data //
try {
const boost::regex rmv_all("[\\r|\\n]");
const std::string s2 = boost::regex_replace(recv, rmv_all, "");
const std::string s = s2;
// Use this regex expression, allow for mixed-case
// Search for the anchor tag but not the '>'
// Where (.+?) match anything
//const boost::regex re("<a([^>]+) href='(.+?)'>");
const boost::regex re("<a\\s+href\\s*=\\s*(\"([^\"]*)\")|('([^']*)')\\s*>");
boost::cmatch matches;
// Using token iterator with sub-matches
const int subs[] = { 2, 4 };
boost::sregex_token_iterator i(s.begin(), s.end(), re, subs);
boost::sregex_token_iterator j;
for (; i != j; i++) {
// Iterate through the listed HREFs and
// move to next request //
const std::string href = *i;
if (href.length() != 0) {
WebPage* page = new WebPage();
page->parse(host, href);
const char* hrefc = page->page.c_str();
cout << "Connecting to HTTP server with : " << page->hostname << " page=" << hrefc << endl;
sleep(DELAY);
connect(page->hostname, string_format("/%s", hrefc));
delete page;
} // End of the if ///
} // End of the for //
} catch (boost::regex_error& e) {
cout << "Error: " << e.what() << "\n";
} // End of the try - catch //
return 1;
} // End of the function //
// Program entry point: announces start-up, crawls from the root page of
// "localhost", announces completion, and exits with status 0.
int main() {
    const std::string seed_host = "localhost";
    const std::string seed_path = "/";
    std::cout << "Launching program" << std::endl;
    connect(seed_host, seed_path);
    std::cout << "Done" << std::endl;
    return 0;
} // End of the function //