/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Processor // // Douglas Thrift // // Processor.cpp #include "Processor.h" Processor::Processor() { page = new Page(); } Processor::~Processor() { delete page; } bool Processor::process(HttpHandler& http, URL& url) { string title, description, text; vector headings; if (http.contentType().find("text/html") == 0) { if (!process(http, url, title, description, text, headings)) return false; entities(title, " ", ' '); entities(title, "<", '<'); entities(title, ">", '>'); entities(title, """, '\"'); entities(title, "&", '&'); entities(description, " ", ' '); entities(description, "<", '<'); entities(description, ">", '>'); entities(description, """, '\"'); entities(description, "&", '&'); entities(text, " ", ' '); entities(text, "<", '<'); entities(text, ">", '>'); entities(text, """, '\"'); entities(text, "&", '&'); for (int index = 0; index < headings.size(); index++) { entities(headings[index], " ", ' '); entities(headings[index], "<", '<'); entities(headings[index], ">", '>'); entities(headings[index], """, '\"'); entities(headings[index], "&", '&'); } normalize(title); normalize(description); normalize(text); for (int index0 = 0; index0 < headings.size(); index0++) { normalize(headings[index0]); } } else { string line; while (http.good()) { http.getline(line); text += line + "\n"; } normalize(text); } page->setSize(http.contentLength()); page->setURL(url); page->setTitle(title); page->setDescription(description); page->setText(text); page->setHeadings(headings); return true; } void Processor::reset() { links.clear(); delete page; page = new Page(); } bool Processor::process(HttpHandler& http, URL& url, string& title, string& description, string& text, vector& headings) { bool inHtml = false, inHead = false, inTitle = false, inBody = false, inHeading = false, inComment = false, follow = true, answer = true; unsigned startComment = 0, finishComment = 0; string line; while (http.good()) { http.getline(line); string heading; unsigned begin = 0; while (begin < line.length()) { unsigned open = line.find('<', begin); unsigned close = line.find('>', begin); string next; while (close == string::npos && http.good()) { http.getline(next); line += '\n' + next; close = line.find('>', begin); } // strangely this is necessary sometimes if (open == string::npos) open = line.find('<', begin); string between = line.substr(begin, open - begin); string tag = getTag(line, open, close); string lowerTag(tag.length(), ' '); for (unsigned index = 0; index < tag.length(); index++) { lowerTag[index] = tolower(tag[index]); } if (inHtml && !inComment) { if (inHead && inTitle) { title += between + "\n"; } if (inBody) { text += between + "\n"; } if (inBody && inHeading) { heading += between + "\n"; } if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n") == 0) || (lowerTag.find("meta ") == 0)) && inHead) { if (lowerTag.find("name=robots") != string::npos || lowerTag.find("name=\"robots\"") != string::npos) { unsigned start = lowerTag.find("content=\"") + 9; unsigned finish = lowerTag.find('\"', start); string robots = lowerTag.substr(start, finish - start); if ((robots.find("noindex") != string::npos && robots.find("nofollow") != string::npos) || robots.find("none") != string::npos) { answer = false; follow = false; links.clear(); return answer; } else if (robots.find("noindex") != string::npos) { answer = false; } else if (robots.find("nofollow") != string::npos) { follow = false; links.clear(); } } else if (lowerTag.find("name=description") != string::npos || lowerTag.find("name=\"description\"") != string::npos) { unsigned start = lowerTag.find("content=\"") + 9; unsigned finish = lowerTag.find('\"', start); description = tag.substr(start, finish - start); } } if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0) || (lowerTag.find("a ") == 0)) && inBody && follow) { if (lowerTag.find("href=\"") != string::npos) { unsigned start = lowerTag.find("href=\"") + 6; unsigned finish = lowerTag.find('\"', start); string link = getLink(tag.substr(start, finish - start), url); if (link != "") links.insert(link); } else if (lowerTag.find("href=") != string::npos) { unsigned start = lowerTag.find("href=") + 5; unsigned finish = lowerTag.find(' ', start); if (finish < close) { string link = getLink(tag.substr(start, finish - start), url); if (link != "") links.insert(link); } else { string link = getLink(tag.substr(start, close - start), url); if (link != "") links.insert(link); } } } if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") == 0) || (lowerTag.find("img ")) && inBody) { if (lowerTag.find("alt=\"") != string::npos) { unsigned start = lowerTag.find("alt=\"") + 5; unsigned finish = lowerTag.find('\"', start); text += tag.substr(start, finish - start) + ' '; if (inHeading) heading += tag.substr(start, finish - start) + ' '; } else if (lowerTag.find("alt=") != string::npos) { unsigned start = lowerTag.find("alt=") + 4; unsigned finish = lowerTag.find(' ', start); if (finish < close) { text += tag.substr(start, finish - start) + ' '; if (inHeading) heading += tag.substr(start, finish - start) + ' '; } else { text += tag.substr(start, close - start) + ' '; if (inHeading) heading += tag.substr(start, close - start) + ' '; } } } } if (lowerTag.find("html") == 0) inHtml = true; if (lowerTag.find("/html") == 0) inHtml = false; if (lowerTag.find("head") == 0) inHead = true; if (lowerTag.find("/head") == 0) inHead = false; if (lowerTag.find("title") == 0) inTitle = true; if (lowerTag.find("/title") == 0) inTitle = false; if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0) inBody = true; if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0) inBody = false; if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 || lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 || lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0) inHeading = true; if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 || lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 || lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0) { if (heading != "") headings.push_back(heading); inHeading = false; } if (lowerTag.find("!--") == 0) { startComment = open; inComment = true; } if (line.find("-->", begin) >= startComment && line.find("-->", begin) != string::npos) { finishComment = line.find("-->", begin) + 3; inComment = false; } if (close == string::npos) { begin = close; } else { begin = close + 1; } } startComment = 0; finishComment = 0; } return answer; } string Processor::getTag(const string& line, unsigned open, unsigned close) { string tag = line.substr(open + 1, close - open - 1); return tag; }