ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/HttpHandler.cpp
Revision: 25
Committed: 2002-12-22T23:32:58-08:00 (22 years, 6 months ago) by douglas
File size: 12714 byte(s)
Log Message:
Added "referer" handling to Indexer and HttpHandler.handle().

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine HTTP Handler
46 //
47 // Douglas Thrift
48 //
49 // HttpHandler.cpp
50
51 #include "HttpHandler.h"
52
53 HttpHandler::HttpHandler()
54 {
55 buffer = new char[BUFSIZ + 1];
56
57 #ifdef _WIN32
58 if (WSAStartup(MAKEWORD(2, 0), &data) != 0)
59 {
60 error(program + ": WSAStartup");
61 exit(1);
62 }
63 #endif // _WIN32
64
65 length = 0;
66 chunked = false;
67 }
68
69 HttpHandler::~HttpHandler()
70 {
71 delete [] buffer;
72
73 #ifdef _WIN32
74 WSACleanup();
75 #endif // _WIN32
76 }
77
78 bool HttpHandler::handle(URL &url, const string referer, bool head)
79 {
80 bool answer = false;
81
82 if ((http = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
83 {
84 error(program + ": Socket");
85 exit(1);
86 }
87
88 sockaddr_in address;
89 hostent* host;
90
91 address.sin_family = AF_INET;
92
93 if ((host = gethostbyname(url.getAddress().c_str())) == NULL)
94 {
95 error(program + ": Host: " + url.getAddress(), true);
96 return answer;
97 }
98
99 address.sin_addr = *((in_addr*)*host->h_addr_list);
100 address.sin_port = htons(url.getPort());
101
102 if (connect(http, (sockaddr*)&address, sizeof(sockaddr_in)) ==
103 SOCKET_ERROR)
104 {
105 error(program + ": Connect");
106 return answer;
107 }
108
109 if (head)
110 {
111 putline("HEAD " + url.getPath() + " HTTP/1.1");
112 }
113 else
114 {
115 putline("GET " + url.getPath() + " HTTP/1.1");
116 }
117
118 putline("Accept: text/html; text/plain");
119 putline("User-Agent: " + agent(true) + ' ' + platform());
120
121 if (url.getPort() == 80)
122 {
123 putline("Host: " + url.getAddress());
124 }
125 else
126 {
127 char* port = new char[1024];
128 sprintf(port, "%u", url.getPort());
129
130 putline("Host: " + url.getAddress() + ':' + port);
131
132 delete [] port;
133 }
134
135 if (referer != "")
136 {
137 putline("Referer: " + referer);
138 }
139
140 putline("Connection: close");
141 putline();
142
143 code response;
144 string line;
145
146 do
147 {
148 line = getline();
149
150 if (line.find("HTTP/") != 0)
151 {
152 return answer;
153 }
154
155 unsigned dot = line.find('.');
156 unsigned space = line.find(' ');
157
158 unsigned major = strtoul(line.substr(5, dot - 5).c_str(), 0, 10);
159 unsigned minor = strtoul(line.substr(dot + 1, space - dot - 1).c_str(),
160 0, 10);
161
162 if (major > 1)
163 {
164 cerr << program << ": Potentially Incompatible Server: HTTP/" <<
165 major << "." << minor << "\n";
166
167 return answer;
168 }
169
170 response = code(strtoul(line.substr(space + 1).c_str(), 0, 10));
171
172 if (response < ok) do line = getline(); while (line != "");
173 }
174 while (response < ok);
175
176 do
177 {
178 line = getline();
179
180 if (line != "")
181 {
182 unsigned colon = line.find(':');
183
184 string field = line.substr(0, colon);
185 string value = line.substr(colon + 1);
186
187 while (isspace(value[0])) value.erase(0, 1);
188
189 if (field == "Content-Type")
190 {
191 type = value;
192 }
193 else if (field == "Content-Length")
194 {
195 length = strtoul(value.c_str(), 0, 10);
196 }
197 else if (field == "Location")
198 {
199 location = value;
200 }
201 else if (field == "Transfer-Encoding")
202 {
203 chunked = value == "chunked";
204 }
205 }
206 }
207 while (line != "");
208
209 switch (response)
210 {
211 case ok:
212 if (debug) cerr << "response = " << response << "\n";
213 answer = true;
214 break;
215 case choices:
216 case moved:
217 case found:
218 if (debug) cerr << "response = " << response << "\n"
219 << "location = " << location << "\n";
220 location = getLink(location, url);
221 break;
222 case notfound:
223 case internal:
224 if (debug) cerr << "response = " << response << "\n";
225 break;
226 default:
227 if (debug) cerr << "response = " << response << "\n";
228 if (response <= 299)
229 {
230 answer = true;
231 }
232 else if (response <= 399)
233 {
234 location = getLink(location, url);
235 }
236 break;
237 }
238
239 if (!head && answer) populate();
240
241 return answer;
242 }
243
244 HttpHandler& HttpHandler::getline(string& line, char endline)
245 {
246 unsigned end = page.find(endline);
247 unsigned newline = page.find('\n');
248
249 if (newline < end || end == string::npos)
250 {
251 end = newline;
252 }
253
254 line = page.substr(0, end);
255 page.erase(0, (end == string::npos ? end : end + 1));
256
257 return *this;
258 }
259
260 void HttpHandler::clear()
261 {
262 closesocket(http);
263
264 type = "";
265 length = 0;
266 location = "";
267 page = "";
268 chunked = false;
269 }
270
271 void HttpHandler::populate()
272 {
273 if (!chunked)
274 {
275 unsigned left = length;
276
277 while (left > 0)
278 {
279 memset(buffer, 0, BUFSIZ + 1);
280
281 unsigned bytes = left > BUFSIZ ? BUFSIZ : left;
282 unsigned received;
283
284 if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR)
285 {
286 error(program + ": Recv");
287 exit(1);
288 }
289 else if (received != bytes)
290 {
291 left -= received;
292 page += buffer;
293
294 memset(buffer, 0, BUFSIZ + 1);
295
296 bytes -= received;
297 if (recv(http, buffer, bytes, 0) == SOCKET_ERROR)
298 {
299 error(program + ": Recv");
300 exit(1);
301 }
302 }
303
304 page += buffer;
305 left -= bytes;
306 }
307 }
308 else
309 {
310 unsigned chunk;
311
312 do
313 {
314 chunk = strtoul(getline().c_str(), 0, 16);
315
316 unsigned left = chunk;
317
318 while (left > 0)
319 {
320 memset(buffer, 0, BUFSIZ + 1);
321
322 unsigned bytes = left > BUFSIZ ? BUFSIZ : left;
323 unsigned received;
324
325 if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR)
326 {
327 error(program + ": Recv");
328 exit(1);
329 }
330 else if (received != bytes)
331 {
332 left -= received;
333 page += buffer;
334
335 memset(buffer, 0, BUFSIZ + 1);
336
337 bytes -= received;
338 if (recv(http, buffer, bytes, 0) == SOCKET_ERROR)
339 {
340 error(program + ": Recv");
341 exit(1);
342 }
343 }
344
345 page += buffer;
346 left -= bytes;
347 }
348
349 getline();
350 length += chunk;
351 }
352 while (chunk > 0);
353 }
354
355 for (unsigned index = 0; index < page.length(); index++)
356 {
357 if (page[index] == '\r' && (index + 1 < page.length()) ? page[index +
358 1] == '\n' : false)
359 {
360 page.erase(index, 1);
361 }
362 else if (page[index] == '\r')
363 {
364 page[index] = '\n';
365 }
366 }
367 }
368
369 void HttpHandler::putline(const string line)
370 {
371 sprintf(buffer, "%s\r\n", line.c_str());
372 if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR)
373 {
374 error(program + ": Send");
375 exit(1);
376 }
377 }
378
379 string HttpHandler::getline()
380 {
381 string line;
382 char byte;
383
384 do
385 {
386 if (recv(http, &byte, 1, 0) == SOCKET_ERROR)
387 {
388 error(program + ": Recv");
389 }
390
391 if (byte != '\r' && byte != '\n')
392 {
393 line += byte;
394 }
395 }
396 while (byte != '\n');
397
398 return line;
399 }
400
401 void HttpHandler::error(const string& prefix, bool host)
402 {
403 #ifdef _WIN32
404 string error;
405
406 switch (WSAGetLastError())
407 {
408 case WSAEACCES:
409 error = "Permission denied.";
410 break;
411 case WSAEADDRINUSE:
412 error = "Address already in use.";
413 break;
414 case WSAEADDRNOTAVAIL:
415 error = "Cannot assign requested address.";
416 break;
417 case WSAEAFNOSUPPORT:
418 error = "Address family not supported by protocol family.";
419 break;
420 case WSAEALREADY:
421 error = "Operation already in progress.";
422 break;
423 case WSAECONNABORTED:
424 error = "Software caused connection abort.";
425 break;
426 case WSAECONNREFUSED:
427 error = "Connection refused.";
428 break;
429 case WSAECONNRESET:
430 error = "Connection reset by peer.";
431 break;
432 case WSAEDESTADDRREQ:
433 error = "Destination address required.";
434 break;
435 case WSAEFAULT:
436 error = "Bad address.";
437 break;
438 case WSAEHOSTDOWN:
439 error = "Host is down.";
440 break;
441 case WSAEHOSTUNREACH:
442 error = "No route to host.";
443 break;
444 case WSAEINPROGRESS:
445 error = "Operation now in progress.";
446 break;
447 case WSAEINTR:
448 error = "Interrupted function call.";
449 break;
450 case WSAEINVAL:
451 error = "Invalid argument.";
452 break;
453 case WSAEISCONN:
454 error = "Socket is already connected.";
455 break;
456 case WSAEMFILE:
457 error = "Too many open files.";
458 break;
459 case WSAEMSGSIZE:
460 error = "Message too long.";
461 break;
462 case WSAENETDOWN:
463 error = "Network is down.";
464 break;
465 case WSAENETRESET:
466 error = "Network dropped connection on reset.";
467 break;
468 case WSAENETUNREACH:
469 error = "Network is unreachable.";
470 break;
471 case WSAENOBUFS:
472 error = "No buffer space available.";
473 break;
474 case WSAENOPROTOOPT:
475 error = "Bad protocol option.";
476 break;
477 case WSAENOTCONN:
478 error = "Socket is not connected.";
479 break;
480 case WSAENOTSOCK:
481 error = "Socket operation on non-socket.";
482 break;
483 case WSAEOPNOTSUPP:
484 error = "Operation not supported.";
485 break;
486 case WSAEPFNOSUPPORT:
487 error = "Protocol family not supported.";
488 break;
489 case WSAEPROCLIM:
490 error = "Too many processes.";
491 break;
492 case WSAEPROTONOSUPPORT:
493 error = "Protocol not supported.";
494 break;
495 case WSAEPROTOTYPE:
496 error = "Protocol wrong type for socket.";
497 break;
498 case WSAESHUTDOWN:
499 error = "Cannot send after socket shutdown.";
500 break;
501 case WSAESOCKTNOSUPPORT:
502 error = "Socket type not supported.";
503 break;
504 case WSAETIMEDOUT:
505 error = "Connection timed out.";
506 break;
507 case WSATYPE_NOT_FOUND:
508 error = "Class type not found.";
509 break;
510 case WSAEWOULDBLOCK:
511 error = "Resource temporarily unavailable.";
512 break;
513 case WSAHOST_NOT_FOUND:
514 error = "Host not found.";
515 break;
516 case WSA_INVALID_HANDLE:
517 error = "Specified event object handle is invalid.";
518 break;
519 case WSA_INVALID_PARAMETER:
520 error = "One or more parameters are invalid.";
521 break;
522 // case WSAINVALIDPROCTABLE:
523 // error = "Invalid procedure table from service provider.";
524 // break;
525 // case WSAINVALIDPROVIDER:
526 // error = "Invalid service provider version number.";
527 // break;
528 case WSA_IO_INCOMPLETE:
529 error = "Overlapped I/O event object not in signaled state.";
530 break;
531 case WSA_IO_PENDING:
532 error = "Overlapped operations will complete later.";
533 break;
534 case WSA_NOT_ENOUGH_MEMORY:
535 error = "Insufficient memory available.";
536 break;
537 case WSANOTINITIALISED:
538 error = "Successful WSAStartup not yet performed.";
539 break;
540 case WSANO_DATA:
541 error = "Valid name, no data record of requested type.";
542 break;
543 case WSANO_RECOVERY:
544 error = "This is a non-recoverable error.";
545 break;
546 // case WSAPROVIDERFAILEDINIT:
547 // error = "Unable to initialize a service provider.";
548 // break;
549 case WSASYSCALLFAILURE:
550 error = "System call failure.";
551 break;
552 case WSASYSNOTREADY:
553 error = "Network subsystem is unavailable.";
554 break;
555 case WSATRY_AGAIN:
556 error = "Non-authoritative host not found.";
557 break;
558 case WSAVERNOTSUPPORTED:
559 error = "WINSOCK.DLL version out of range.";
560 break;
561 case WSAEDISCON:
562 error = "Graceful shutdown in progress.";
563 break;
564 case WSA_OPERATION_ABORTED:
565 error = "Overlapped operation aborted.";
566 break;
567 default:
568 error = "Unknown error.";
569 break;
570 }
571
572 cerr << prefix << ": " << error << "\n";
573 #else
574 if (host)
575 {
576 herror(prefix.c_str());
577 }
578 else
579 {
580 perror(prefix.c_str());
581 }
582 #endif // _WIN32
583 }