/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine HTTP Handler // // Douglas Thrift // // $Id: HttpHandler.cpp,v 1.18 2003/07/11 07:54:46 douglas Exp $ #include "HttpHandler.h" // Lovely C Sockets! #ifndef _WIN32 // BSD Sockets #include #include #include #include #include #define INVALID_SOCKET -1 #define SOCKET_ERROR -1 inline int closesocket(SOCKET s) { return close(s); } #endif HttpHandler::HttpHandler() { buffer = new char[BUFSIZ + 1]; #ifdef _WIN32 if (WSAStartup(MAKEWORD(2, 0), &data) != 0) { error(program + ": WSAStartup"); exit(1); } #endif // _WIN32 length = 0; chunked = false; } HttpHandler::~HttpHandler() { delete [] buffer; #ifdef _WIN32 WSACleanup(); #endif // _WIN32 } bool HttpHandler::handle(URL &url, const string referer, bool head) { bool answer = false; if ((http = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { error(program + ": Socket"); exit(1); } sockaddr_in address; hostent* host; address.sin_family = AF_INET; if ((host = gethostbyname(url.getAddress().c_str())) == NULL) { error(program + ": Host: " + url.getAddress(), true); return answer; } address.sin_addr = *((in_addr*)*host->h_addr_list); address.sin_port = htons(url.getPort()); if (connect(http, (sockaddr*)&address, sizeof(sockaddr_in)) == SOCKET_ERROR) { error(program + ": Connect"); return answer; } if (head) { putline("HEAD " + url.getPath() + " HTTP/1.1"); } else { putline("GET " + url.getPath() + " HTTP/1.1"); } putline("Accept: text/html; text/plain"); putline("User-Agent: " + agent(true) + ' ' + platform()); if (url.getPort() == 80) { putline("Host: " + url.getAddress()); } else { char* port = new char[1024]; sprintf(port, "%u", url.getPort()); putline("Host: " + url.getAddress() + ':' + port); delete [] port; } if (referer != "") { putline("Referer: " + referer); } putline("Connection: close"); putline(); code response; string line; do { line = getline(); if (line.find("HTTP/") != 0) { return answer; } unsigned dot = line.find('.'); unsigned space = line.find(' '); unsigned major = strtoul(line.substr(5, dot - 5).c_str(), 0, 10); unsigned minor = strtoul(line.substr(dot + 1, space - dot - 1).c_str(), 0, 10); if (major > 1) { cerr << program << ": Potentially Incompatible Server: HTTP/" << major << "." << minor << "\n"; return answer; } response = code(strtoul(line.substr(space + 1).c_str(), 0, 10)); if (response < ok) do line = getline(); while (line != ""); } while (response < ok); do { line = getline(); if (line != "") { unsigned colon = line.find(':'); string field = line.substr(0, colon); string value = line.substr(colon + 1); while (isspace(value[0])) value.erase(0, 1); if (field == "Content-Type") { type = value; } else if (field == "Content-Length") { length = strtoul(value.c_str(), 0, 10); } else if (field == "Location") { location = value; } else if (field == "Transfer-Encoding") { chunked = value == "chunked"; } } } while (line != ""); switch (response) { case ok: if (debug) cerr << "response = " << response << "\n"; answer = true; break; case choices: case moved: case found: if (debug) cerr << "response = " << response << "\n" << "location = " << location << "\n"; location = getLink(location, url); break; case notfound: case internal: if (debug) cerr << "response = " << response << "\n"; break; default: if (debug) cerr << "response = " << response << "\n"; if (response <= 299) { answer = true; } else if (response <= 399) { location = getLink(location, url); } break; } if (!head && answer) populate(); return answer; } HttpHandler& HttpHandler::getline(string& line, char endline) { unsigned end = page.find(endline); unsigned newline = page.find('\n'); if (newline < end || end == string::npos) { end = newline; } line = page.substr(0, end); page.erase(0, (end == string::npos ? end : end + 1)); return *this; } void HttpHandler::clear() { closesocket(http); type = ""; length = 0; location = ""; page = ""; chunked = false; } void HttpHandler::populate() { if (!chunked) { unsigned left = length; while (left > 0) { memset(buffer, 0, BUFSIZ + 1); unsigned bytes = left > BUFSIZ ? BUFSIZ : left; unsigned received; while (true) { if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR) { error(program + ": Recv"); exit(1); } else if (received != bytes) { left -= received; page += buffer; memset(buffer, 0, BUFSIZ + 1); bytes -= received; } else { break; } } page += buffer; left -= bytes; } } else { unsigned chunk; do { chunk = strtoul(getline().c_str(), 0, 16); unsigned left = chunk; while (left > 0) { memset(buffer, 0, BUFSIZ + 1); unsigned bytes = left > BUFSIZ ? BUFSIZ : left; unsigned received; while (true) { if ((received = recv(http, buffer, bytes, 0)) == SOCKET_ERROR) { error(program + ": Recv"); exit(1); } else if (received != bytes) { left -= received; page += buffer; memset(buffer, 0, BUFSIZ + 1); bytes -= received; } else { break; } } page += buffer; left -= bytes; } getline(); length += chunk; } while (chunk > 0); } for (unsigned index = 0; index < page.length(); index++) { if (page[index] == '\r' && (index + 1 < page.length()) ? page[index + 1] == '\n' : false) { page.erase(index, 1); } else if (page[index] == '\r') { page[index] = '\n'; } } } void HttpHandler::putline(const string line) { sprintf(buffer, "%s\r\n", line.c_str()); if (send(http, buffer, strlen(buffer), 0) == SOCKET_ERROR) { error(program + ": Send"); exit(1); } } string HttpHandler::getline() { string line; char byte; do { if (recv(http, &byte, 1, 0) == SOCKET_ERROR) { error(program + ": Recv"); } if (byte != '\r' && byte != '\n') { line += byte; } } while (byte != '\n'); return line; } void HttpHandler::error(const string& prefix, bool host) { #ifdef _WIN32 string error; switch (WSAGetLastError()) { case WSAEACCES: error = "Permission denied"; break; case WSAEADDRINUSE: error = "Address already in use"; break; case WSAEADDRNOTAVAIL: error = "Cannot assign requested address"; break; case WSAEAFNOSUPPORT: error = "Address family not supported by protocol family"; break; case WSAEALREADY: error = "Operation already in progress"; break; case WSAECONNABORTED: error = "Software caused connection abort"; break; case WSAECONNREFUSED: error = "Connection refused"; break; case WSAECONNRESET: error = "Connection reset by peer"; break; case WSAEDESTADDRREQ: error = "Destination address required"; break; case WSAEFAULT: error = "Bad address"; break; case WSAEHOSTDOWN: error = "Host is down"; break; case WSAEHOSTUNREACH: error = "No route to host"; break; case WSAEINPROGRESS: error = "Operation now in progress"; break; case WSAEINTR: error = "Interrupted function call"; break; case WSAEINVAL: error = "Invalid argument"; break; case WSAEISCONN: error = "Socket is already connected"; break; case WSAEMFILE: error = "Too many open files"; break; case WSAEMSGSIZE: error = "Message too long"; break; case WSAENETDOWN: error = "Network is down"; break; case WSAENETRESET: error = "Network dropped connection on reset"; break; case WSAENETUNREACH: error = "Network is unreachable"; break; case WSAENOBUFS: error = "No buffer space available"; break; case WSAENOPROTOOPT: error = "Bad protocol option"; break; case WSAENOTCONN: error = "Socket is not connected"; break; case WSAENOTSOCK: error = "Socket operation on non-socket"; break; case WSAEOPNOTSUPP: error = "Operation not supported"; break; case WSAEPFNOSUPPORT: error = "Protocol family not supported"; break; case WSAEPROCLIM: error = "Too many processes"; break; case WSAEPROTONOSUPPORT: error = "Protocol not supported"; break; case WSAEPROTOTYPE: error = "Protocol wrong type for socket"; break; case WSAESHUTDOWN: error = "Cannot send after socket shutdown"; break; case WSAESOCKTNOSUPPORT: error = "Socket type not supported"; break; case WSAETIMEDOUT: error = "Connection timed out"; break; case WSATYPE_NOT_FOUND: error = "Class type not found"; break; case WSAEWOULDBLOCK: error = "Resource temporarily unavailable"; break; case WSAHOST_NOT_FOUND: error = "Host not found"; break; case WSA_INVALID_HANDLE: error = "Specified event object handle is invalid"; break; case WSA_INVALID_PARAMETER: error = "One or more parameters are invalid"; break; // case WSAINVALIDPROCTABLE: // error = "Invalid procedure table from service provider"; // break; // case WSAINVALIDPROVIDER: // error = "Invalid service provider version number"; // break; case WSA_IO_INCOMPLETE: error = "Overlapped I/O event object not in signaled state"; break; case WSA_IO_PENDING: error = "Overlapped operations will complete later"; break; case WSA_NOT_ENOUGH_MEMORY: error = "Insufficient memory available"; break; case WSANOTINITIALISED: error = "Successful WSAStartup not yet performed"; break; case WSANO_DATA: error = "Valid name, no data record of requested type"; break; case WSANO_RECOVERY: error = "This is a non-recoverable error"; break; // case WSAPROVIDERFAILEDINIT: // error = "Unable to initialize a service provider"; // break; case WSASYSCALLFAILURE: error = "System call failure"; break; case WSASYSNOTREADY: error = "Network subsystem is unavailable"; break; case WSATRY_AGAIN: error = "Non-authoritative host not found"; break; case WSAVERNOTSUPPORTED: error = "WINSOCK.DLL version out of range"; break; case WSAEDISCON: error = "Graceful shutdown in progress"; break; case WSA_OPERATION_ABORTED: error = "Overlapped operation aborted"; break; default: error = "Unknown error"; break; } cerr << prefix << ": " << error << "\n"; #else if (host) { string error; switch (h_errno) { case HOST_NOT_FOUND: error = "Unknown host"; break; case TRY_AGAIN: error = "Host name lookup failure"; break; case NO_RECOVERY: error = "Unknown server error"; break; case NO_DATA: error = "No address associated with name"; break; default: error = "Unknown error"; break; } cerr << prefix << ": " << error << "\n"; } else { perror(prefix.c_str()); } #endif // _WIN32 }