/* ============================================================================ * Douglas Thrift's Search Engine License * * Copyright (C) 2002, Douglas Thrift. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: * * "This product includes software developed by Douglas Thrift * (http://computers.douglasthrift.net/searchengine/)." * * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not * be used to endorse or promote products derived from this software without * specific prior written permission. For written permission, please visit * http://www.douglasthrift.net/contact.cgi for contact information. * * 5. Products derived from this software may not be called "Douglas Thrift's * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their * name, without prior written permission. * * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ============================================================================ */ // Douglas Thrift's Search Engine Ranker // // Douglas Thrift // // Ranker.cpp #include "Ranker.h" Ranker::Ranker() { value = 0; requiredValue = 0; excludedValue = 0; eitherOrValue = 0; allIn = all; } Ranker::Ranker(Page& page) : Page(page) { value = 0; requiredValue = 0; excludedValue = 0; eitherOrValue = 0; allIn = all; } void Ranker::rank(vector query) { vector prep; for (unsigned index = 0; index < query.size(); index++) { if (query[index] == "allintitle:" && index == 0) { allIn = title; } else if (query[index] == "allinurl:" && index == 0) { allIn = url; } else if (query[index] == "allintext:" && index == 0) { allIn = text; } else if (query[index].find("site:") == 0 && query[index].size() > 5) { site = query[index].substr(5); } else if (query[index].find("intitle:") == 0 && query[index].size() > 8) { prep.push_back("TITLE " + query[index].substr(8)); } else if (query[index].find("inurl:") == 0 && query[index].size() > 6) { prep.push_back("URL " + query[index].substr(6)); } else if (query[index].find("intext:") == 0 && query[index].size() > 7) { prep.push_back("TEXT " + query[index].substr(7)); } else { prep.push_back(query[index]); } } if (prep.size() > 0) { bool or_ = false; for (unsigned index = 0; index < prep.size(); index++) { bool exclude = false; if (prep[index].find('+') == 0) { prep[index].erase(0, 1); } else if (prep[index].find('-') == 0) { exclude = true; prep[index].erase(0, 1); } if (or_) { if (prep[index].find(" OR") == string::npos) { or_ = false; } eitherOr[eitherOr.size() - 1] += ' ' + prep[index]; } else if (exclude) { excluded.push_back(prep[index]); } else if (prep[index].find(" OR") != string::npos) { or_ = true; eitherOr.push_back(prep[index]); } else { required.push_back(prep[index]); } } } rank(); } void Ranker::setSample() { map::iterator itor; multimap::iterator> distances; for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++) { unsigned distance; if (++itor != occurrencesText.end()) { unsigned next = itor->first; itor--; distance = next - (itor->first + itor->second); } else { distance = UINT_MAX; itor--; } distances.insert(pair::iterator>(distance, itor)); } if (distances.begin() != distances.end()) { itor = distances.begin()->second; } string portion; unsigned sampleLength = 0, begin = 0, end = string::npos; while (sampleLength < 160 && itor != occurrencesText.end()) { unsigned found = itor->first; unsigned length = itor->second; for (unsigned index = found; index > begin; index--) { if (index == begin) cerr << "Oh crap, I'm insane!\n"; if (found - index >= 160 - sampleLength - length) { for (; index < found; index++) { if (isspace(getText()[index])) break; } begin = index + 1; break; } else if ((index > begin ? (isupper(getText()[index]) && !isalnum(getText()[index - 1])) : isupper(getText()[index])) && index != found) { begin = index; break; } } if (end + 1 != begin) sample += " ... "; portion = getText().substr(begin, found - begin); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + ""; portion = getText().substr(found, length); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + ""; begin = found + length; end = begin - 1; if (++itor != occurrencesText.end()) { if (itor->first + itor->second < begin + 160 - sampleLength) { portion = getText().substr(begin, itor->first - begin); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion; begin = itor->first; end = begin - 1; } else { for (end = begin + 160 - sampleLength; end > begin; end--) { if (isspace(getText()[end])) break; } portion = getText().substr(begin, end - begin + 1); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion + " ..."; break; } } else { for (end = begin + 160 - sampleLength; end > begin && (end + 1 < getText().length()); end--) { if (isspace(getText()[end])) break; } if (end >= getText().length()) end = getText().length() - 1; portion = getText().substr(begin, end - begin + 1); sampleLength += portion.length(); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); sample += portion; if (end + 1 < getText().length()) { sample += " ..."; } break; } } } string Ranker::getTitle() { string title, portion; unsigned begin = 0; for (map::iterator itor = occurrencesTitle.begin(); itor != occurrencesTitle.end(); itor++) { unsigned found = itor->first; unsigned length = itor->second; portion = Page::getTitle().substr(begin, found - begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion + ""; portion = Page::getTitle().substr(found, length); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion + ""; begin = found + length; } portion = Page::getTitle().substr(begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); title += portion; return title; } string Ranker::getDescription() { string description, portion; unsigned begin = 0; for (map::iterator itor = occurrencesDescription.begin(); itor != occurrencesDescription.end(); itor++) { unsigned found = itor->first; unsigned length = itor->second; portion = Page::getDescription().substr(begin, found - begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion + ""; portion = Page::getDescription().substr(found, length); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion + ""; begin = found + length; } portion = Page::getDescription().substr(begin); entities(portion, '&', "&"); entities(portion, '\"', """); entities(portion, '<', "<"); entities(portion, '>', ">"); description += portion; return description; } bool Ranker::operator==(const unsigned number) const { return value == number; } bool Ranker::operator==(const Ranker& ranker) const { return value == ranker.value; } bool Ranker::operator!=(const unsigned number) const { return value != number; } bool Ranker::operator!=(const Ranker& ranker) const { return value != ranker.value; } bool Ranker::operator<(const unsigned number) const { return value < number; } bool Ranker::operator<(const Ranker& ranker) const { return value < ranker.value; } bool Ranker::operator>(const unsigned number) const { return value > number; } bool Ranker::operator >(const Ranker& ranker) const { return value > ranker.value; } void Ranker::rank() { lowerAddress = string(getAddress().length(), ' '); for (unsigned index = 0; index < lowerAddress.length(); index++) { lowerAddress[index] = tolower(getAddress()[index]); } if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() - site.length()) { bool isRequired = required.size() > 0; bool isExcluded = excluded.size() > 0; bool isEitherOr = eitherOr.size() > 0; lowerURL = string(getURL().length(), ' '); for (unsigned index = 0; index < lowerURL.length(); index++) { lowerURL[index] = tolower(getURL()[index]); } lowerTitle = string(Page::getTitle().length(), ' '); for (unsigned index0 = 0; index0 < lowerTitle.length(); index0++) { lowerTitle[index0] = tolower(Page::getTitle()[index0]); } lowerText = string(Page::getText().length(), ' '); for (unsigned index1 = 0; index1 < lowerText.length(); index1++) { lowerText[index1] = tolower(Page::getText()[index1]); } if (isRequired) checkRequired(); if (isExcluded && (isRequired || isEitherOr)) checkExcluded(); if (isEitherOr) checkEitherOr(); if (isRequired && isExcluded && isEitherOr) { value += requiredValue && !excludedValue && eitherOrValue ? requiredValue + eitherOrValue : 0; } else if (isRequired && isExcluded) { value += requiredValue && !excludedValue ? requiredValue : 0; } else if (isRequired && isEitherOr) { value += requiredValue && eitherOrValue ? requiredValue + eitherOrValue : 0; } else if (isExcluded && isEitherOr) { value += !excludedValue && eitherOrValue ? eitherOrValue : 0; } else if (isRequired) { value += requiredValue; } else if (isEitherOr) { value += eitherOrValue; } else { // do nothing this is a bad search and warrants no results } if (value > 0) { string lowerDescription = string(Page::getDescription().length(), ' '); for (unsigned index = 0; index < lowerDescription.length(); index++) { lowerDescription[index] = tolower( Page::getDescription()[index]); } for (unsigned index0 = 0; index0 < required.size(); index0++) { if (required[index0].find("URL ") == 0) { string fred = required[index0].substr(4); value += find(fred, lowerDescription, occurrencesDescription); } else if (required[index0].find("TITLE ") == 0) { string fred = required[index0].substr(6); value += find(fred, lowerDescription, occurrencesDescription); } else if (required[index0].find("TEXT ") == 0) { string fred = required[index0].substr(5); value += find(fred, lowerDescription, occurrencesDescription); } else { value += find(required[index0], lowerDescription, occurrencesDescription); } } for (unsigned index1 = 0; index1 < eitherOr.size(); index1++) { vector words; unsigned begin = 0, found; do { found = eitherOr[index1].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[index1].substr(begin, found - begin)); } else { words.push_back(eitherOr[index1].substr(begin)); } begin = found + 4; } while (begin < eitherOr[index1].length() && found != string::npos); for (unsigned number = 0; number < words.size(); number++) { if (words[index1].find("URL ") == 0) { string fred = words[index1].substr(4); value += find(fred, lowerDescription, occurrencesDescription); } else if (words[index1].find("TITLE ") == 0) { string fred = words[index1].substr(6); value += find(fred, lowerDescription, occurrencesDescription); } else if (words[index1].find("TEXT ") == 0) { string fred = words[index1].substr(5); value += find(fred, lowerDescription, occurrencesDescription); } else { value += find(words[index1], lowerDescription, occurrencesDescription); } } } for (unsigned index2 = 0; index2 < getHeadings().size(); index2++) { string lowerHeading = string(getHeadings()[index2].length(), ' '); for (unsigned number = 0; number < getHeadings()[index2].length(); number++) { lowerHeading[number] = tolower( getHeadings()[index2][number]); } for (unsigned number0 = 0; number0 < required.size(); number0++) { if (required[number0].find("URL ") == 0) { string fred = required[number0].substr(4); value += find(fred, lowerHeading); } else if (required[number0].find("TITLE ") == 0) { string fred = required[number0].substr(6); value += find(fred, lowerHeading); } else if (required[number0].find("TEXT ") == 0) { string fred = required[number0].substr(5); value += find(fred, lowerHeading); } else { value += find(required[number0], lowerHeading); } } for (unsigned number1 = 0; number1 < eitherOr.size(); number1++) { vector words; unsigned begin = 0, found; do { found = eitherOr[number1].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[number1].substr(begin, found - begin)); } else { words.push_back(eitherOr[number1].substr(begin)); } begin = found + 4; } while (begin < eitherOr[number1].length() && found != string::npos); for (unsigned number = 0; number < words.size(); number++) { if (words[number].find("URL ") == 0) { string fred = words[number].substr(4); value += find(fred, lowerHeading); } else if (words[number].find("TITLE ") == 0) { string fred = words[number].substr(6); value += find(fred, lowerHeading); } else if (words[number].find("TEXT ") == 0) { string fred = words[number].substr(5); value += find(fred, lowerHeading); } else { value += find(words[number], lowerHeading); } } } } } } } void Ranker::checkRequired() { vector inURLs, inTitles, inTexts; for (unsigned index = 0; index < required.size(); index++) { unsigned inURL = 0, inTitle = 0, inText = 0; if (required[index].find("URL ") == 0) { string fred = required[index].substr(4); string martha = lowerURL.substr(7); inURL = find(fred, martha); if (inURL) { string fred = required[index].substr(4); inTitle = find(fred, lowerTitle, occurrencesTitle); string martha = required[index].substr(4); inText = find(martha, lowerText, occurrencesText); if (!inTitle) inTitle++; if (!inText) inText++; } } else if (required[index].find("TITLE ") == 0) { string fred = required[index].substr(6); inTitle = find(fred, lowerTitle, occurrencesTitle); if (inTitle) { string fred = required[index].substr(6); string martha = lowerURL.substr(7); inURL = find(fred, martha); string george = required[index].substr(6); inText = find(george, lowerText, occurrencesText); if (!inURL) inURL++; if (!inText) inText++; } } else if (required[index].find("TEXT ") == 0) { string fred = required[index].substr(5); inText = find(fred, lowerText, occurrencesText); if (inText) { string fred = required[index].substr(5); string martha = lowerURL.substr(7); inURL = find(fred, martha); string george = required[index].substr(5); inTitle = find(george, lowerTitle, occurrencesTitle); if (!inURL) inURL++; if (!inTitle) inTitle++; } } else { string fred = lowerURL.substr(7); inURL = find(required[index], fred); inTitle = find(required[index], lowerTitle, occurrencesTitle); inText = find(required[index], lowerText, occurrencesText); } inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); } unsigned inURL = evaluate(inURLs); unsigned inTitle = evaluate(inTitles); unsigned inText = evaluate(inTexts); requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } void Ranker::checkExcluded() { vector inURLs, inTitles, inTexts; for (unsigned index = 0; index < excluded.size(); index++) { unsigned inURL = 0, inTitle = 0, inText = 0; string fred = lowerURL.substr(7); inURL = find(excluded[index], fred); inTitle = find(excluded[index], lowerTitle); inText = find(excluded[index], lowerText); inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); } unsigned inURL = evaluate(inURLs); unsigned inTitle = evaluate(inTitles); unsigned inText = evaluate(inTexts); excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } void Ranker::checkEitherOr() { vector inURLs, inTitles, inTexts; for (unsigned index = 0; index < eitherOr.size(); index++) { vector inURLz, inTitlez, inTextz; unsigned inURL = 0, inTitle = 0, inText = 0; vector words; unsigned begin = 0, found; do { found = eitherOr[index].find(" OR ", begin); if (found != string::npos) { words.push_back(eitherOr[index].substr(begin, found - begin)); } else { words.push_back(eitherOr[index].substr(begin)); } begin = found + 4; } while (begin < eitherOr[index].length() && found != string::npos); for (unsigned number = 0; number < words.size(); number++) { unsigned inURL = 0, inTitle = 0, inText = 0; if (words[number].find("URL ") == 0) { string fred = words[number].substr(4); string martha = lowerURL.substr(7); inURL = find(fred, martha); if (inURL) { string fred = words[number].substr(4); inTitle = find(fred, lowerTitle, occurrencesTitle); string martha = words[number].substr(4); inText = find(martha, lowerText, occurrencesText); if (!inTitle) inTitle++; if (!inText) inText++; } } else if (words[number].find("TITLE ") == 0) { string fred = words[number].substr(6); inTitle = find(fred, lowerTitle, occurrencesTitle); if (inTitle) { string fred = words[number].substr(6); string martha = lowerURL.substr(7); inURL = find(fred, martha); string george = words[number].substr(6); inText = find(george, lowerText, occurrencesText); if (!inURL) inURL++; if (!inText) inText++; } } else if (words[number].find("TEXT ") == 0) { string fred = words[number].substr(5); inText = find(fred, lowerText, occurrencesText); if (inText) { string fred = words[number].substr(5); string martha = lowerURL.substr(7); inURL = find(fred, martha); string george = words[number].substr(5); inTitle = find(george, lowerTitle, occurrencesTitle); if (!inURL) inURL++; if (!inTitle) inTitle++; } } else { string fred = lowerURL.substr(7); inURL = find(words[number], fred); inTitle = find(words[number], lowerTitle, occurrencesTitle); inText = find(words[number], lowerText, occurrencesText); } inURLz.push_back(inURL); inTitlez.push_back(inTitle); inTextz.push_back(inText); } for (unsigned number0 = 0; number0 < inURLz.size(); number0++) { inURL += inURLz[number0]; } for (unsigned number1 = 0; number1 < inTitlez.size(); number1++) { inTitle += inTitlez[number1]; } for (unsigned number2 = 0; number2 < inTextz.size(); number2++) { inText += inTextz[number2]; } inURLs.push_back(inURL); inTitles.push_back(inTitle); inTexts.push_back(inText); inURLz.clear(); inTitlez.clear(); inTextz.clear(); words.clear(); } unsigned inURL = evaluate(inURLs); unsigned inTitle = evaluate(inTitles); unsigned inText = evaluate(inTexts); eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + inText : 0; } unsigned Ranker::find(string& word, string& where) { unsigned value = 0; decrap(word); if (word == "") { // this can happen if a word is all crap characters value++; } else if (word.find_first_of(" \n ") == string::npos) { unsigned begin = 0, found; do { found = where.find(word, begin); if (found != string::npos) { bool isBefore, isAfter, before = false, after = false; isBefore = found - 1 > 0; isAfter = found + word.length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + word.length()]) != 0; if (!before && !after) { value++; } } begin = found + word.length(); } while (found != string::npos && begin < where.length()); } else { value = phrase(word, where); } return value; } unsigned Ranker::find(string& word, string& where, map& occurrences) { unsigned value = 0; decrap(word); if (word == "") { // this can happen if a word is all crap characters value++; } else if (word.find_first_of(" \n ") == string::npos) { unsigned begin = 0, found; do { found = where.find(word, begin); if (found != string::npos) { bool isBefore, isAfter, before = false, after = false; isBefore = found - 1 > 0; isAfter = found + word.length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + word.length()]) != 0; if (!before && !after) { value++; occurrences.insert(pair(found, word.length())); } } begin = found + word.length(); } while (found != string::npos && begin < where.length()); } else { value = phrase(word, where, occurrences); } return value; } unsigned Ranker::phrase(string& phrase, string& where) { unsigned value = 0; vector words; unsigned begin = 0, space; do { space = phrase.find(' ', begin); words.push_back(phrase.substr(begin, space - begin)); begin = space + 1; } while (space != string::npos && begin < phrase.length()); begin = 0; unsigned counter = 0; do { value += this->phrase(words, 0, begin, true, where); } while (begin < where.length()); return value; } unsigned Ranker::phrase(string& phrase, string& where, map& occurrences) { unsigned value = 0; vector words; unsigned begin = 0, space; do { space = phrase.find(' ', begin); words.push_back(phrase.substr(begin, space - begin)); begin = space + 1; } while (space != string::npos && begin < phrase.length()); begin = 0; do { value += this->phrase(words, 0, begin, true, where, occurrences); } while (begin < where.length()); return value; } unsigned Ranker::phrase(vector& words, unsigned word, unsigned& begin, bool start, string& where) { unsigned value = 0; bool end = !(word + 1 < words.size()); unsigned found = where.find(words[word], begin); unsigned newBegin = found + words[word].length(); if (found != string::npos) { bool isBefore, isAfter, before = false, after = false; isBefore = found - 1 > 0; isAfter = found + words[word].length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; if (!before && !after) { bool between = true; if (!start) { for (unsigned index = begin + 1; index < found - 1; index++) { if (isalnum(where[index])) { between = false; break; } } } if (between) { if (end) { begin = newBegin; value = 1; } else { value = phrase(words, (word + 1), newBegin, false, where); } } } } if (start) { if (found != string::npos) { begin = newBegin; } else { begin = string::npos; } } return value; } unsigned Ranker::phrase(vector& words, unsigned word, unsigned& begin, bool start, string& where, map& occurrences) { unsigned value = 0; bool end = !(word + 1 < words.size()); unsigned found = where.find(words[word], begin); unsigned newBegin = found + words[word].length(); if (found != string::npos) { bool isBefore, isAfter, before = false, after = false; isBefore = found - 1 > 0; isAfter = found + words[word].length() < where.length(); if (isBefore) before = isalnum(where[found - 1]) != 0; if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; if (!before && !after) { bool between = true; if (!start) { for (unsigned index = begin + 1; index < found - 1; index++) { if (isalnum(where[index])) { between = false; break; } } } if (between) { occurrences.insert(pair(found, words[word].length())); if (end) { begin = newBegin; value = 1; } else { value = phrase(words, (word + 1), newBegin, false, where, occurrences); } } } } if (start) { if (found != string::npos) { begin = newBegin; } else { begin = string::npos; } } return value; } unsigned Ranker::evaluate(vector& ins) { unsigned in = 0; for (unsigned index = 0; index < ins.size(); index++) { if (ins[index] > 0) { in += ins[index]; } else { in = 0; break; } } return in; } void Ranker::decrap(string& crap) { unsigned begin = 0, found; do { // &, +, and # are not considered crap found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^_`{|}~", begin); if (found != string::npos) { crap[found] = ' '; } begin = found + 1; } while (found != string::npos && begin < crap.length()); normalize(crap); }