/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Searcher // // Douglas Thrift // // Searcher.cpp #include "Searcher.h" Searcher::Searcher(string& queryString) { setCommon(); setAnd(false); setOr(false); this->queryString = queryString; setQuery(); } void Searcher::search(vector indices) { start = clock(); const string XMLTYPE = ""); const string DOCTYPE = ""; if (query.size() > 0) { for (int index = 0; index < indices.size(); index++) { ifstream fin(indices[index].c_str()); if (!fin.is_open()) { cerr << program << ": Could not open index file: " << indices[index] << "\n"; fin.clear(); continue; } string line; getline(fin, line); if (line != XMLTYPE) { cerr << program << ": Invalid XML version declaration: " << indices[index] << "\n"; fin.close(); fin.clear(); continue; } getline(fin, line); if (line != DOCTYPE) { cerr << program << ": Invalid XML doctype: " << indices[index] << "\n"; fin.close(); fin.clear(); continue; } getline(fin, line); if (line != "") { cerr << program << ": Invalid XML root element: " << indices[index] << "\n"; fin.close(); fin.clear(); continue; } while (fin.good()) { Page page; fin >> page; if (!page.empty()) search(page); } fin.close(); fin.clear(); } } finish = clock(); } void Searcher::search(Page& page) { Ranker ranker(page); ranker.rank(query); if (ranker != 0) { ranker.setSample(); pages.insert(ranker); } } void Searcher::setQuery() { string queryStringNormal = queryString; normalize(queryStringNormal); if (debug) cerr << "queryString = " << queryString << "\n" << "queryStringNormal = " << queryStringNormal << "\n"; unsigned begin = 0; bool capitalOr = false, quotes = false; while (begin < queryStringNormal.length()) { string word; unsigned beginQuote = queryStringNormal.find('\"', begin); unsigned endQuote = beginQuote != string::npos ? queryStringNormal.find('\"', beginQuote + 1) : string::npos; unsigned space = queryStringNormal.find(' ', begin); if (beginQuote == begin) { quotes = true; word = queryStringNormal.substr(beginQuote + 1, endQuote - beginQuote - 1); } else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin, 8 ) == "intitle:") { quotes = true; word = "intitle:" + queryStringNormal.substr(beginQuote + 1, endQuote - beginQuote - 1); beginQuote = begin; } else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin, 6 ) == "inurl:") { quotes = true; word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote - beginQuote - 1); beginQuote = begin; } else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin, 7 ) == "intext:") { quotes = true; word = "intext:" + queryStringNormal.substr(beginQuote + 1, endQuote - beginQuote - 1); beginQuote = begin; } else { quotes = false; word = queryStringNormal.substr(begin, space - begin); } if (word != "") { string lowerWord(word.length(), ' '); for (unsigned index = 0; index < word.length(); index++) { lowerWord[index] = tolower(word[index]); } if (debug) cerr << "word = " << word << "\n" << "lowerWord = " << lowerWord << "\n"; if (word == "OR" && !quotes) { capitalOr = true; query[query.size() - 1] += " OR"; } else if (lowerWord == "or" && !quotes) { setOr(true); } else if (lowerWord == "and" && !quotes) { setAnd(true); } else if (common.find(lowerWord) != common.end() && !quotes) { commonUsed.push_back(word); } else { if (capitalOr) capitalOr = false; if (query.size() < 10) { query.push_back(lowerWord); } else { setIgnore(lowerWord); if (debug) cerr << "ignore = " << ignore << "\n"; break; } } } if (endQuote == string::npos && space == string::npos) { begin = string::npos; } else if (beginQuote == begin && endQuote != string::npos) { begin = endQuote + 1; } else { begin = space + 1; } } if (debug) { cerr << "query = {\n"; for (unsigned index = 0; index < query.size(); index++) { cerr << " [" << index << "] = " << query[index] << "\n"; } cerr << "}\n"; } } void Searcher::setCommon() { common.insert("&"); common.insert("a"); common.insert("about"); common.insert("an"); // "and" has its own special message // common.insert("and"); common.insert("are"); common.insert("as"); common.insert("at"); common.insert("be"); common.insert("by"); common.insert("com"); common.insert("from"); common.insert("how"); common.insert("i"); common.insert("in"); common.insert("is"); common.insert("it"); common.insert("of"); common.insert("on"); // "or" has its own special message // common.insert("or"); common.insert("that"); common.insert("the"); common.insert("this"); common.insert("to"); common.insert("was"); common.insert("what"); common.insert("when"); common.insert("where"); common.insert("which"); common.insert("who"); common.insert("why"); common.insert("will"); common.insert("with"); }