/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Main // // Douglas Thrift // // $Id: Search.cpp,v 1.17 2003/07/15 07:22:06 douglas Exp $ #include "Search.h" #include "Indexer.h" #include "Searcher.h" #include "Outputer.h" #ifndef _WIN32 #include #else #include #endif #ifdef _OpenSSL_ #include inline string openssl(bool agent = false) { ostringstream openssl; openssl << "OpenSSL" << (agent ? "/" : " "); long version = SSLeay(); long major = version / 0x10000000; long minor = (version % 0x10000000) / 0x100000; long fix = (version % 0x100000) / 0x1000; long patch = (version % 0x1000); openssl << major << "." << minor << "." << fix; if (patch == 0x0) { openssl << "-dev"; } else if (patch >= 0x1 && patch <= 0xe) { openssl << "-beta" << patch; } else if (patch == 0xf) { // release } else if (patch > 0xf) { openssl << char('a' - 1 + (patch / 0x10) % 26); } openssl << " " << SSLeay_version(SSLEAY_PLATFORM); return openssl.str(); } #endif string program; string programName = "Douglas Thrift's Search Engine"; string programVersion = "1.2alpha"; bool debug = false; int main(int argc, char* argv[]) { program = argv[0]; bool indexMode = false; string indexURL; set indexDomains; set indexRestrictions; unsigned page = 1; string query; vector indices; string header = "header.html"; string body = "body.html"; string footer = "footer.html"; string notfound = "notfound.html"; string pages = "pages.html"; string email; for (int index = 1; index < argc; index++) { string arg(argv[index]); if (arg == "-help") { usage(); return 0; } else if (arg == "-version") { version(); return 0; } else if (arg == "-license") { license(); return 0; } else if (arg == "-P") { if (++index < argc) { page = strtoul(argv[index],0,0); } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-i") { indexMode = true; if (++index < argc) { indexURL = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-d") { if (++index < argc) { indexDomains.insert(argv[index]); } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-r") { if (++index < argc) { indexRestrictions.insert(argv[index]); } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-h") { if (++index < argc) { header = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-b") { if (++index < argc) { body = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-f") { if (++index < argc) { footer = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-n") { if (++index < argc) { notfound = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-p") { if (++index < argc) { pages = argv[index]; } else { cerr << program << ": Bad arguments\n"; usage(); return 1; } } else if (arg == "-D") { debug = true; cerr.setf(ios_base::boolalpha); } else { indices.push_back(arg); } } if (indices.size() < 1) { usage(); return 0; } if (indexMode) { if (indices.size() > 1) { cerr << program << ": Too many indices, can only build one index" << " at a time\n"; usage(); return 1; } if (indexDomains.size() < 1) { cerr << program << ": Must specify at least one domain\n"; usage(); return 1; } Indexer indexer(indices[0], indexDomains, indexRestrictions); indexer.index(indexURL); } else { string line; getline(cin, line); query = line; Searcher searcher(query); searcher.search(indices); Outputer outputer(header, body, footer, notfound, pages); outputer.output(searcher, page < 1 ? page : --page); } return 0; } string agent(bool version) { string agent = programName + (version ? ('/' + programVersion) : ""); return agent; } string platform() { string platform; string os; string version; string architecture; string marketing; #ifdef _WIN32 OSVERSIONINFO* computer = new OSVERSIONINFO; computer->dwOSVersionInfoSize = sizeof(OSVERSIONINFO); GetVersionEx(computer); os = computer->dwPlatformId == VER_PLATFORM_WIN32_NT ? "Windows NT" : "Windows"; unsigned major = computer->dwMajorVersion; unsigned minor = computer->dwMinorVersion; delete computer; SYSTEM_INFO* system = new SYSTEM_INFO; GetSystemInfo(system); switch (system->wProcessorArchitecture) { case PROCESSOR_ARCHITECTURE_INTEL: architecture = "ix86"; break; case PROCESSOR_ARCHITECTURE_MIPS: architecture = "mips"; break; case PROCESSOR_ARCHITECTURE_ALPHA: architecture = "alpha"; break; case PROCESSOR_ARCHITECTURE_PPC: architecture = "ppc"; break; case PROCESSOR_ARCHITECTURE_IA64: architecture = "ia64"; break; case PROCESSOR_ARCHITECTURE_IA32_ON_WIN64: architecture = "ix86_on_win64"; break; case PROCESSOR_ARCHITECTURE_AMD64: architecture = "amd64"; break; default: architecture = "unknown"; break; } char* cversion = new char[1024]; sprintf(cversion, "%u.%u", major, minor); version = cversion; delete [] cversion; if (major == 4 && minor <= 3 && os != "Windows NT") { marketing = " [Windows 95]"; } else if (major == 4 && minor == 10 && os != "Windows NT") { marketing = " [Windows 98]"; } else if (major == 5 && minor == 0 && os == "Windows NT") { marketing = " [Windows 2000]"; } else if (major == 4 && minor == 90 && os != "Windows NT") { marketing = " [Windows ME]"; } else if (major == 5 && minor == 1 && os == "Windows NT") { marketing = " [Windows XP]"; } else if (major == 5 && minor == 2 && os == "Windows NT") { marketing = " [Windows .NET Server]"; } #else // _WIN32 struct utsname* computer = new struct utsname; uname(computer); os = computer->sysname; version = computer->release; architecture = computer->machine; delete computer; #endif // _WIN32 platform = "(" + os + " " + version + marketing + " " + architecture + ")"; return platform; } void usage() { #ifdef _WIN32 OSVERSIONINFO* computer = new OSVERSIONINFO; computer->dwOSVersionInfoSize = sizeof(OSVERSIONINFO); GetVersionEx(computer); string program = ::program; if (computer->dwPlatformId != VER_PLATFORM_WIN32_NT) { program = "Search"; } delete computer; #endif // _WIN32 string tab(8 + program.length(), ' '); cout << "Usage: " << program << " [index ...] [-P page] [-h header] [-b bo" << "dy]\n" << tab << "[-f footer] [-n notfound] [-p pages]\n" << tab << "[-i begin] [-d domain ...] [-r restriction ...]\n" << tab << "[-D] [-version] [-help]\n" << "Options:\n" << " index Index file to use (can only use one file for i" << "ndexing)\n" << " -P page Page of search to display (defaults to 1)\n" << " -h header Header template to use (defaults to header.htm" << "l)\n" << " -b body Body template to use (defaults to body.html)\n" << " -f footer Footer template to use (defaults to footer.htm" << "l)\n" << " -n notfound Not found template to use (defaults to notfoun" << "d.html)\n" << " -p pages Pages template to use (defaults to pages.html)" << "\n" << " -i begin URL to begin indexing (causes indexing rather " << "than search)\n" << " -d domain Domain to include in indexing\n" << " -r restriction URL to restrict from indexing\n" << " -D Display debug information\n" << " -version Display version information and exit\n" << " -license Display license information and exit\n" << " -help Display this message and exit\n"; } void version() { cout << programName << " " << programVersion << " "<< platform() << "\n\n" << " Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.\n" << "\n" << " This product includes software developed by Douglas Thrift\n" << " (http://computers.douglasthrift.net/searchengine/).\n"; #ifdef _OpenSSL_ cout << "\n" << openssl() << "\n"; #endif } void license() { cout << "License:\n" << " Douglas Thrift's Search Engine License\n\n" << " Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.\n" << "\n" << " Redistribution and use in source and binary forms, with or with" << "out\n" << " modification, are permitted provided that the following conditi" << "ons are met:\n\n" << " 1. Redistributions of source code must retain the above copyrig" << "ht notice,\n" << " this list of conditions and the following disclaimer.\n\n" << " 2. Redistributions in binary form must reproduce the above copy" << "right notice,\n" << " this list of conditions and the following disclaimer in the " << "documentation\n" << " and/or other materials provided with the distribution.\n\n" << " 3. The end-user documentation included with the redistribution," << " if any, must\n" << " include the following acknowledgment:\n\n" << " \"This product includes software developed by Douglas Thr" << "ift\n" << " (http://computers.douglasthrift.net/searchengine/).\"\n\n" << " Alternately, this acknowledgment may appear in the software " << "itself, if\n" << " and wherever such third-party acknowledgments normally appea" << "r.\n\n" << " 4. The names \"Douglas Thrift\" and \"Douglas Thrift\'s Search " << "Engine\" must not\n" << " be used to endorse or promote products derived from this sof" << "tware without\n" << " specific prior written permission. For written permission, p" << "lease visit\n" << " http://www.douglasthrift.net/contact.cgi for contact inform" << "ation.\n\n" << " 5. Products derived from this software may not be called \"Doug" << "las Thrift\'s\n" << " Search Engine\", nor may \"Douglas Thrift\'s Search Engine\"" << " appear in their\n" << " name, without prior written permission.\n\n" << " THIS SOFTWARE IS PROVIDED \"AS IS\" AND ANY EXPRESS OR IMPLIED " << "WARRANTIES,\n" << " INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHA" << "NTABILITY AND\n" << " FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SH" << "ALL THE\n" << " COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIR" << "ECT,\n" << " INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU" << "DING, BUT NOT\n" << " LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS O" << "F USE, DATA,\n" << " OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY" << " THEORY OF\n" << " LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCL" << "UDING\n" << " NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF T" << "HIS SOFTWARE,\n" << " EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"; } void entities(string& line, char character, char* entity) { int begin = 0; while (begin < line.length()) { int spot = line.find(character, begin); int end = spot + 1; if (spot != string::npos) { line.replace(spot, 1, entity); } else { break; } begin = end; } } void entities(string& line, char* entity, char character) { int begin = 0; while (begin < line.length()) { int spot = line.find(entity, begin); int end = spot + 1; if (spot != string::npos) { line.replace(spot, strlen(entity), 1, character); } else { break; } begin = end; } } void normalize(string& abbynormal) { for (unsigned index = 0; index < abbynormal.length(); index++) { if (isspace(abbynormal[index])) { unsigned next = index + 1; while (isspace(abbynormal[next])) { next++; } abbynormal.replace(index, next - index, 1, abbynormal[index]); } } if (isspace(abbynormal[0])) abbynormal.erase(0, 1); if (isspace(abbynormal[abbynormal.length() - 1])) abbynormal.erase(abbynormal.length() - 1, 1); }