/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine HTTP Handler // // Douglas Thrift // // $Id: HttpHandler.cpp,v 1.23 2003/07/20 04:00:25 douglas Exp $ #include "HttpHandler.h" // Lovely C Sockets! #ifndef _WIN32 // BSD Sockets #include #include #include #include #include inline int closesocket(SOCKET s) { return close(s); } #endif HttpHandler::HttpHandler() { buffer = new char[BUFSIZ + 1]; #ifdef _WIN32 if (WSAStartup(MAKEWORD(2, 0), &data) != 0) { error(program + ": WSAStartup"); exit(1); } #endif // _WIN32 binary = false; length = 0; chunked = false; #ifdef _OpenSSL_ tls = false; #endif } HttpHandler::~HttpHandler() { delete [] buffer; #ifdef _WIN32 WSACleanup(); #endif // _WIN32 } bool HttpHandler::handle(URL &url, const string referer, bool head) { bool answer = false; if ((http = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { error(program + ": Socket"); exit(1); } sockaddr_in address; hostent* host; address.sin_family = AF_INET; if ((host = gethostbyname(url.getAddress().c_str())) == NULL) { error(program + ": Host: " + url.getAddress(), true); return answer; } address.sin_addr = *((in_addr*)*host->h_addr_list); address.sin_port = htons(url.getPort()); if (connect(http, (sockaddr*)&address, sizeof(sockaddr_in)) == SOCKET_ERROR) { error(program + ": Connect"); return answer; } #ifdef _OpenSSL_ if (url.getTls()) { tls = true; if (!starttls()) return answer; } #endif if (head) { putline("HEAD " + url.getPath() + " HTTP/1.1"); } else { putline("GET " + url.getPath() + " HTTP/1.1"); } putline("Accept: text/html; text/plain"); #ifndef _OpenSSL_ putline("User-Agent: " + agent(true) + ' ' + platform()); if (url.getPort() == 80) #else putline("User-Agent: " + agent(true) + ' ' + platform() + ' ' + openssl(true)); if (url.getPort() == 80 && tls || url.getPort() == 443 && tls) #endif { putline("Host: " + url.getAddress()); } else { ostringstream port; port << url.getPort(); putline("Host: " + url.getAddress() + ':' + port.str()); } if (referer != "") { putline("Referer: " + referer); } putline("Connection: close"); putline(); code response; string line; do { line = getline(); if (line.find("HTTP/") != 0) { return answer; } unsigned dot = line.find('.'); unsigned space = line.find(' '); unsigned major; unsigned minor; istringstream number(line.substr(5, dot - 5) + " " + line.substr(dot + 1, space - dot - 1)); number >> major; number >> minor; if (major > 1) { cerr << program << ": Potentially Incompatible Server: HTTP/" << major << "." << minor << "\n"; return answer; } number.clear(); number.str(line.substr(space + 1, 3)); number >> response; if (response < ok) do line = getline(); while (line != ""); } while (response < ok); do { line = getline(); if (line != "") { unsigned colon = line.find(':'); string field = line.substr(0, colon); string value = line.substr(colon + 1); while (isspace(value[0])) value.erase(0, 1); if (field == "Content-Type") { type = value; } else if (field == "Content-Length") { istringstream number(value); number >> length; } else if (field == "Location") { location = value; } else if (field == "Transfer-Encoding") { chunked = value == "chunked"; } } } while (line != ""); switch (response) { case ok: if (debug) cerr << "response = " << response << "\n"; answer = true; break; case choices: case moved: case found: if (debug) cerr << "response = " << response << "\n" << "location = " << location << "\n"; location = getLink(location, url); break; case notfound: case internal: if (debug) cerr << "response = " << response << "\n"; break; default: if (debug) cerr << "response = " << response << "\n"; if (response <= 299) { answer = true; } else if (response <= 399) { location = getLink(location, url); } break; } if (!head && answer) populate(); return answer; } void HttpHandler::clear() { if (tls) { SSL_shutdown(ssl); SSL_free(ssl); SSL_CTX_free(ctx); } closesocket(http); type = ""; length = 0; location = ""; page.clear(); page.str(""); chunked = false; #ifdef _OpenSSL_ tls = false; #endif } void HttpHandler::populate() { if (!chunked) { unsigned left = length; while (left > 0) { memset(buffer, 0, BUFSIZ + 1); unsigned bytes = left > BUFSIZ ? BUFSIZ : left; long received; while (true) { #ifndef _OpenSSL_ if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR) { error(program + ": Recv"); exit(1); } #else if ((received = !tls ? recv(http, buffer, bytes, 0) : SSL_read(ssl, buffer, bytes)) <= 0) { !tls ? error(program + ": Recv") : error(program + ": SSL Read", int(received)); } #endif else if (received != bytes) { left -= received; page << buffer; memset(buffer, 0, BUFSIZ + 1); bytes -= received; } else { break; } } page << buffer; left -= bytes; } } else { unsigned chunk; do { istringstream number(getline()); number.setf(ios_base::hex, ios_base::basefield); number >> chunk; unsigned left = chunk; while (left > 0) { memset(buffer, 0, BUFSIZ + 1); unsigned bytes = left > BUFSIZ ? BUFSIZ : left; long received; while (true) { #ifndef _OpenSSL_ if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR) { error(program + ": Recv"); exit(1); } #else if ((received = !tls ? recv(http, buffer, bytes, 0) : SSL_read(ssl, buffer, bytes)) <= 0) { !tls ? error(program + ": Recv") : error(program + ": SSL Read", int(received)); exit(1); } #endif else if (received != bytes) { left -= received; page << buffer; memset(buffer, 0, BUFSIZ + 1); bytes -= received; } else { break; } } page << buffer; left -= bytes; } getline(); length += chunk; } while (chunk > 0); } if (!binary) { string page = this->page.str(); for (unsigned index = 0; index < page.length(); index++) { if (page[index] == '\r' && (index + 1 < page.length()) ? page[index + 1] == '\n' : false) { page.erase(index, 1); } else if (page[index] == '\r') { page[index] = '\n'; } } this->page.str(page); } } void HttpHandler::putline(const string line) { sprintf(buffer, "%s\r\n", line.c_str()); #ifndef _OpenSSL_ if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR) { error(program + ": Send"); exit(1); } #else if (!tls) { if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR) { error(program + ": Send"); exit(1); } } else { int number; if ((number = SSL_write(ssl, buffer, strlen(buffer))) <= 0) { error(program + ": SSL Write", number); exit(1); } } #endif } string HttpHandler::getline() { string line; char byte; do { #ifndef _OpenSSL_ if (recv(http, &byte, 1, 0) == SOCKET_ERROR) { error(program + ": Recv"); } #else if (!tls) { if (recv(http, &byte, 1, 0) == SOCKET_ERROR) { error(program + ": Recv"); } } else { int number; if ((number = SSL_read(ssl, &byte, 1)) <= 0) { error(program + ": SSL Read", number); } } #endif if (byte != '\r' && byte != '\n') { line += byte; } } while (byte != '\n'); return line; } void HttpHandler::error(const string& prefix, bool host) { #ifdef _WIN32 string error; switch (WSAGetLastError()) { case WSAEACCES: error = "Permission denied"; break; case WSAEADDRINUSE: error = "Address already in use"; break; case WSAEADDRNOTAVAIL: error = "Cannot assign requested address"; break; case WSAEAFNOSUPPORT: error = "Address family not supported by protocol family"; break; case WSAEALREADY: error = "Operation already in progress"; break; case WSAECONNABORTED: error = "Software caused connection abort"; break; case WSAECONNREFUSED: error = "Connection refused"; break; case WSAECONNRESET: error = "Connection reset by peer"; break; case WSAEDESTADDRREQ: error = "Destination address required"; break; case WSAEFAULT: error = "Bad address"; break; case WSAEHOSTDOWN: error = "Host is down"; break; case WSAEHOSTUNREACH: error = "No route to host"; break; case WSAEINPROGRESS: error = "Operation now in progress"; break; case WSAEINTR: error = "Interrupted function call"; break; case WSAEINVAL: error = "Invalid argument"; break; case WSAEISCONN: error = "Socket is already connected"; break; case WSAEMFILE: error = "Too many open files"; break; case WSAEMSGSIZE: error = "Message too long"; break; case WSAENETDOWN: error = "Network is down"; break; case WSAENETRESET: error = "Network dropped connection on reset"; break; case WSAENETUNREACH: error = "Network is unreachable"; break; case WSAENOBUFS: error = "No buffer space available"; break; case WSAENOPROTOOPT: error = "Bad protocol option"; break; case WSAENOTCONN: error = "Socket is not connected"; break; case WSAENOTSOCK: error = "Socket operation on non-socket"; break; case WSAEOPNOTSUPP: error = "Operation not supported"; break; case WSAEPFNOSUPPORT: error = "Protocol family not supported"; break; case WSAEPROCLIM: error = "Too many processes"; break; case WSAEPROTONOSUPPORT: error = "Protocol not supported"; break; case WSAEPROTOTYPE: error = "Protocol wrong type for socket"; break; case WSAESHUTDOWN: error = "Cannot send after socket shutdown"; break; case WSAESOCKTNOSUPPORT: error = "Socket type not supported"; break; case WSAETIMEDOUT: error = "Connection timed out"; break; case WSATYPE_NOT_FOUND: error = "Class type not found"; break; case WSAEWOULDBLOCK: error = "Resource temporarily unavailable"; break; case WSAHOST_NOT_FOUND: error = "Host not found"; break; case WSA_INVALID_HANDLE: error = "Specified event object handle is invalid"; break; case WSA_INVALID_PARAMETER: error = "One or more parameters are invalid"; break; // case WSAINVALIDPROCTABLE: // error = "Invalid procedure table from service provider"; // break; // case WSAINVALIDPROVIDER: // error = "Invalid service provider version number"; // break; case WSA_IO_INCOMPLETE: error = "Overlapped I/O event object not in signaled state"; break; case WSA_IO_PENDING: error = "Overlapped operations will complete later"; break; case WSA_NOT_ENOUGH_MEMORY: error = "Insufficient memory available"; break; case WSANOTINITIALISED: error = "Successful WSAStartup not yet performed"; break; case WSANO_DATA: error = "Valid name, no data record of requested type"; break; case WSANO_RECOVERY: error = "This is a non-recoverable error"; break; // case WSAPROVIDERFAILEDINIT: // error = "Unable to initialize a service provider"; // break; case WSASYSCALLFAILURE: error = "System call failure"; break; case WSASYSNOTREADY: error = "Network subsystem is unavailable"; break; case WSATRY_AGAIN: error = "Non-authoritative host not found"; break; case WSAVERNOTSUPPORTED: error = "WINSOCK.DLL version out of range"; break; case WSAEDISCON: error = "Graceful shutdown in progress"; break; case WSA_OPERATION_ABORTED: error = "Overlapped operation aborted"; break; default: error = "Unknown error"; break; } cerr << prefix << ": " << error << "\n"; #else if (host) { string error; switch (h_errno) { case HOST_NOT_FOUND: error = "Unknown host"; break; case TRY_AGAIN: error = "Host name lookup failure"; break; case NO_RECOVERY: error = "Unknown server error"; break; case NO_DATA: error = "No address associated with name"; break; default: error = "Unknown error"; break; } cerr << prefix << ": " << error << "\n"; } else { perror(prefix.c_str()); } #endif // _WIN32 } #ifdef _OpenSSL_ void HttpHandler::error(const string& prefix, int number) { string error; switch (SSL_get_error(ssl, number)) { case SSL_ERROR_NONE: error = "The TLS/SSL I/O operation completed"; break; case SSL_ERROR_ZERO_RETURN: error = "The TLS/SSL connection has been closed"; break; case SSL_ERROR_WANT_READ: case SSL_ERROR_WANT_WRITE: case SSL_ERROR_WANT_CONNECT: // case SSL_ERROR_WANT_ACCEPT: case SSL_ERROR_WANT_X509_LOOKUP: error = "The operation did not complete"; break; case SSL_ERROR_SYSCALL: if (int err = ERR_get_error() != 0) { error = ERR_reason_error_string(err); } else { switch (number) { case 0: error = "An EOF was observed that violates the protocol"; break; case -1: this->error(prefix); return; default: error = "Unknown error"; break; } } break; case SSL_ERROR_SSL: error = ERR_reason_error_string(ERR_get_error()); break; default: error = "Unknown error"; break; } cerr << prefix << ": " << error << "\n"; } bool HttpHandler::starttls() { SSL_load_error_strings(); SSL_library_init(); #ifndef _urandomdev_ int pid = getpid(); int now = time(NULL); unsigned seed = now > pid ? now - pid : pid - now; char* junk = new char[seed % 30 + 2]; junk[0] = pid; junk[seed % 30 + 1] = now; srand(seed); for (int index = 1; index < seed % 30 + 1; index++) { junk[index] = rand(); } if (debug) { cerr << "junk = {\n"; for (int index = 1; index < seed % 30 + 2; index++) { cerr << " [" << index << "] = " << int(junk[index]) << "\n"; } cerr << "}\n"; } RAND_seed(junk, seed % 30 + 2); delete junk; #endif ctx = SSL_CTX_new(TLSv1_client_method()); if (ctx == NULL) { cerr << program << ": SSL CTX New: " << ERR_reason_error_string(ERR_get_error()) << "\n"; return false; } ssl = SSL_new(ctx); if (SSL_set_fd(ssl, http) == 0) { cerr << program << ": SSL Set FD: " << ERR_reason_error_string(ERR_get_error()) << "\n"; return false; } int number; if ((number = SSL_connect(ssl)) <= 0) { error(program + ": SSL Connect", number); return false; } return true; } #endif istream& operator>>(istream& is, HttpHandler::code& data) { int number; is >> number; data = HttpHandler::code(number); return is; }