/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Indexer // // Douglas Thrift // // $Id$ #include "Indexer.hpp" #ifndef _WIN32 #include #else // _WIN32 inline int unlink(const char* filename) { return DeleteFile(filename); } #endif // _WIN32 void Indexer::index(string& begin) { size_t separator(indexFile.rfind(slash)); string dtd(separator != string::npos ? indexFile.substr(0, separator) + slash + "index.dtd" : "index.dtd"); ifstream fin(dtd.c_str()); if (!fin.is_open()) { ofstream fout(dtd.c_str()); fout << "\n" << "\n" << "\n" << "\n" << "\n" << "\n" << "\n" << "\n" << "\n" << "\n" << "\n"; fout.close(); } fin.close(); string lock(indexFile + ".lock"); ofstream fout(lock.c_str()); fout.close(); fout.open(indexFile.c_str()); fout << "" << "\n\n" << "\n"; URL first(begin); index(first, fout); while (!links.empty()) { URL next(links.front()); string referer(referers.front()); links.pop(); referers.pop(); if (debug) cerr << "next = " << next << "\n"; index(next, fout, referer); } fout << "\n"; fout.close(); unlink(lock.c_str()); } void Indexer::index(URL& url, ofstream& fout, const string& referer) { if (domains.find(url.getAddress()) != domains.end() && pages.find(url.getURL()) == pages.end()) { if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" + url.getPort() : string(""))) == checked.end()) { robots(url); } if (!restricted(url)) { if (http.handle(url, referer, true)) { if (http.contentType().find("text/plain") == 0 || http.contentType().find("text/html") == 0) { http.clear(); if (!http.handle(url, referer)) exit(1); cout << "Indexing " << url << " ... " << flush; if (processor.process(http, url)) { Page page(processor.getPage()); fout << page << "\n"; cout << "done.\n"; } else { cout << "canceled.\n"; } pages.insert(url.getURL()); Set pageLinks(processor.getLinks()); processor.reset(); for (SetIterator link(pageLinks.begin()); link != pageLinks.end(); link++) { if (pages.find(*link) == pages.end()) { links.push(*link); referers.push(url.getURL()); } } } else { // unhandled content } } else if (!http.redirect().empty()) { if (pages.find(http.redirect()) == pages.end()) { links.push(http.redirect()); referers.push(url.getURL()); } } http.clear(); } } } bool Indexer::restricted(URL& url) { bool answer(false); for (SetIterator itor(restrictions.begin()); itor != restrictions.end(); itor++) { URL checker(*itor); if (url.getAddress() == checker.getAddress() && url.getPort() == checker.getPort()) { if (url.getPath().find(checker.getPath()) == 0) { answer = true; break; } } } return answer; } void Indexer::robots(URL& url) { URL robots(url); robots.setPath("/robots.txt"); if (http.handle(robots)) { cout << "Checking " << robots << " ... " << flush; string line; bool record(false), hasVersion(false), hasName(false), hasAll(false); Robot state(none); Set restrictionsVersion, restrictionsName, restrictionsAll; while (http.good()) { http.getline(line); size_t comment(line.find('#')); if (comment != string::npos) line.erase(comment); if (line.empty() && comment == string::npos) record = false; if (line.empty()) continue; size_t colon(line.find(':')); string field(line.substr(0, colon)); string value(line.substr(colon + 1)); normalize(value); if (field == "User-agent" && value == agent(true)) { state = version; record = true; hasVersion = true; } else if (field == "User-agent" && value == agent(false)) { state = name; record = true; hasName = true; } else if (field == "User-agent" && value == "*") { state = all; record = true; hasAll = true; } else if (field == "Disallow" && record && value.empty()) { // no restrictions } else if (field == "Disallow" && record) { URL restriction(robots); restriction.setPath(value); switch (state) { case version: restrictionsVersion.insert(restriction.getURL()); break; case name: restrictionsName.insert(restriction.getURL()); break; case all: restrictionsAll.insert(restriction.getURL()); break; } } } if (hasVersion) { state = version; } else if (hasName) { state = name; } else if (hasAll) { state = all; } else { state = none; } SetIterator itor; switch (state) { case version: for (itor = restrictionsVersion.begin(); itor != restrictionsVersion.end(); itor++) { restrictions.insert(*itor); } break; case name: for (itor = restrictionsName.begin(); itor != restrictionsName.end(); itor++) { restrictions.insert(*itor); } break; case all: for (itor = restrictionsAll.begin(); itor != restrictionsAll.end(); itor++) { restrictions.insert(*itor); } break; } cout << "done.\n"; } http.clear(); checked.insert(url.getAddress() += url.getPort() != 80 ? ":" + url.getPort() : ""); }