trunk/Search/Ranker.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Ranker
//
// Douglas Thrift
//
// $Id$

#include "Ranker.hpp"

Ranker::Ranker(Page& page) : Page(page)
{
        value = 0;
        requiredValue = 0;
        excludedValue = 0;
        eitherOrValue = 0;
        allIn = all;
}

void Ranker::rank(vector<string> query)
{
        vector<string> prep;

        for (unsigned index = 0; index < query.size(); index++)
        {
                if (query[index] == "allintitle:" && index == 0)
                {
                        allIn = title;
                }
                else if (query[index] == "allinurl:" && index == 0)
                {
                        allIn = url;
                }
                else if (query[index] == "allintext:" && index == 0)
                {
                        allIn = text;
                }
                else if (query[index].find("site:") == 0 && query[index].size() > 5)
                {
                        site = query[index].substr(5);
                }
                else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
                {
                        prep.push_back("TITLE " + query[index].substr(8));
                }
                else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
                {
                        prep.push_back("URL " + query[index].substr(6));
                }
                else if (query[index].find("intext:") == 0 && query[index].size() > 7)
                {
                        prep.push_back("TEXT " + query[index].substr(7));
                }
                else
                {
                        prep.push_back(query[index]);
                }
        }

        if (prep.size() > 0)
        {
                bool or_ = false;
                for (unsigned index = 0; index < prep.size(); index++)
                {
                        bool exclude = false;
                        if (prep[index].find('+') == 0)
                        {
                                prep[index].erase(0, 1);
                        }
                        else if (prep[index].find('-') == 0)
                        {
                                exclude = true;
                                prep[index].erase(0, 1);
                        }

                        if (or_)
                        {
                                if (prep[index].find(" OR") == string::npos)
                                {
                                        or_ = false;
                                }

                                eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
                        }
                        else if (exclude)
                        {
                                excluded.push_back(prep[index]);
                        }
                        else if (prep[index].find(" OR") != string::npos)
                        {
                                or_ = true;
                                eitherOr.push_back(prep[index]);
                        }
                        else
                        {
                                required.push_back(prep[index]);
                        }
                }
        }

        rank();
}

void Ranker::setSample()
{
        map<unsigned, unsigned>::iterator itor;
        multimap<unsigned, map<unsigned, unsigned>::iterator> distances;

        for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
        {
                unsigned distance;

                if (++itor != occurrencesText.end())
                {
                        unsigned next = itor->first;

                        itor--;
                        distance = next - (itor->first + itor->second);
                }
                else
                {
                        distance = string::npos;
                        itor--;
                }

                distances.insert(pair<unsigned, map<unsigned,
                        unsigned>::iterator>(distance, itor));
        }

        if (distances.begin() != distances.end())
        {
                itor = distances.begin()->second;
        }

        string portion;
        unsigned sampleLength = 0, begin = 0, end = string::npos;

        while (sampleLength < sampleMax && itor != occurrencesText.end())
        {
                unsigned found = itor->first, length = itor->second;

                for (unsigned index = found; index > begin; index--)
                {
                        if (found - index >= sampleMax - sampleLength - length)
                        {
                                for (; index < found; index++)
                                {
                                        if (isspace(getText()[index])) break;
                                }

                                begin = index + 1;

                                break;
                        }
                        else if ((index > begin ? (isupper(getText()[index]) &&
                                !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
                                index != found)
                        {
                                begin = index;

                                break;
                        }
                }

                if (end + 1 != begin) sample += " <strong>...</strong> ";

                portion = getText().substr(begin, found - begin);
                sampleLength += portion.length();

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                sample += portion + "<strong>";
                portion = getText().substr(found, length);
                sampleLength += portion.length();

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                sample += portion + "</strong>";
                begin = found + length;
                end = begin - 1;

                if (++itor != occurrencesText.end())
                {
                        if (itor->first + itor->second < begin + sampleMax - sampleLength)
                        {
                                portion = getText().substr(begin, itor->first - begin);
                                sampleLength += portion.length();

                                entities(portion, '&', "&amp;");
                                entities(portion, '\"', "&quot;");
                                entities(portion, '<', "&lt;");
                                entities(portion, '>', "&gt;");

                                sample += portion;
                                begin = itor->first;
                                end = begin - 1;
                        }
                        else
                        {
                                for (end = begin + sampleMax - sampleLength; end > begin;
                                        end--)
                                {
                                        if (isspace(getText()[end])) break;
                                }

                                portion = getText().substr(begin, end - begin + 1);
                                sampleLength += portion.length();

                                entities(portion, '&', "&amp;");
                                entities(portion, '\"', "&quot;");
                                entities(portion, '<', "&lt;");
                                entities(portion, '>', "&gt;");

                                sample += portion + " <strong>...</strong>";

                                break;
                        }
                }
                else
                {
                        for (end = begin + sampleMax - sampleLength; end > begin && (end +
                                1 < getText().length()); end--)
                        {
                                if (isspace(getText()[end])) break;
                        }

                        if (end >= getText().length()) end = getText().length() - 1;

                        portion = getText().substr(begin, end - begin + 1);
                        sampleLength += portion.length();

                        entities(portion, '&', "&amp;");
                        entities(portion, '\"', "&quot;");
                        entities(portion, '<', "&lt;");
                        entities(portion, '>', "&gt;");

                        sample += portion;

                        if (end + 1 < getText().length())
                        {
                                sample += " <strong>...</strong>";
                        }

                        break;
                }
        }

        if (sample == "")
        {
                for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--)
                {
                        if (isspace(getText()[end])) break;
                }

                sample = getText().substr(0, end + 1);

                entities(sample, '&', "&amp;");
                entities(sample, '\"', "&quot;");
                entities(sample, '<', "&lt;");
                entities(sample, '>', "&gt;");

                if (end + 1 < getText().length())
                {
                        sample += " <strong>...</strong>";
                }
        }
}

string Ranker::getTitle()
{
        string title, portion;

        unsigned begin = 0;
        for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
                itor != occurrencesTitle.end(); itor++)
        {
                unsigned found = itor->first;
                unsigned length = itor->second;

                portion = Page::getTitle().substr(begin, found - begin);

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                title += portion + "<strong>";

                portion = Page::getTitle().substr(found, length);

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                title += portion + "</strong>";

                begin = found + length;
        }

        portion = Page::getTitle().substr(begin);

        entities(portion, '&', "&amp;");
        entities(portion, '\"', "&quot;");
        entities(portion, '<', "&lt;");
        entities(portion, '>', "&gt;");

        title += portion;

        return title;
}

string Ranker::getDescription()
{
        string description, portion;

        unsigned begin = 0;
        for (map<unsigned, unsigned>::iterator itor =
                occurrencesDescription.begin(); itor != occurrencesDescription.end();
                itor++)
        {
                unsigned found = itor->first;
                unsigned length = itor->second;

                portion = Page::getDescription().substr(begin, found - begin);

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                description += portion + "<strong>";

                portion = Page::getDescription().substr(found, length);

                entities(portion, '&', "&amp;");
                entities(portion, '\"', "&quot;");
                entities(portion, '<', "&lt;");
                entities(portion, '>', "&gt;");

                description += portion + "</strong>";

                begin = found + length;
        }

        portion = Page::getDescription().substr(begin);

        entities(portion, '&', "&amp;");
        entities(portion, '\"', "&quot;");
        entities(portion, '<', "&lt;");
        entities(portion, '>', "&gt;");

        description += portion;

        return description;
}

bool Ranker::operator==(const unsigned number) const
{
        return value == number;
}

bool Ranker::operator==(const Ranker& ranker) const
{
        return value == ranker.value;
}

bool Ranker::operator!=(const unsigned number) const
{
        return value != number;
}

bool Ranker::operator!=(const Ranker& ranker) const
{
        return value != ranker.value;
}

bool Ranker::operator<(const unsigned number) const
{
        return value < number;
}

bool Ranker::operator<(const Ranker& ranker) const
{
        return value < ranker.value;
}

bool Ranker::operator>(const unsigned number) const
{
        return value > number;
}

bool Ranker::operator >(const Ranker& ranker) const
{
        return value > ranker.value;
}

void Ranker::rank()
{
        lowerAddress = tolower(getAddress());

        if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
                site.length())
        {
                bool isRequired = required.size() > 0;
                bool isExcluded = excluded.size() > 0;
                bool isEitherOr = eitherOr.size() > 0;

                lowerURL = tolower(getURL());
                lowerTitle = tolower(Page::getTitle());
                lowerText = tolower(Page::getText());

                if (isRequired) checkRequired();
                if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
                if (isEitherOr) checkEitherOr();

                if (isRequired && isExcluded && isEitherOr)
                {
                        value += requiredValue && !excludedValue && eitherOrValue ?
                                requiredValue + eitherOrValue : 0;
                }
                else if (isRequired && isExcluded)
                {
                        value += requiredValue && !excludedValue ? requiredValue : 0;
                }
                else if (isRequired && isEitherOr)
                {
                        value += requiredValue && eitherOrValue ? requiredValue +
                                eitherOrValue : 0;
                }
                else if (isExcluded && isEitherOr)
                {
                        value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
                }
                else if (isRequired)
                {
                        value += requiredValue;
                }
                else if (isEitherOr)
                {
                        value += eitherOrValue;
                }
                else
                {
                        // do nothing this is a bad search and warrants no results
                }

                if (value > 0)
                {
                        string lowerDescription = tolower(Page::getDescription());

                        for (unsigned index = 0; index < required.size(); index++)
                        {
                                if (required[index].find("URL ") == 0)
                                {
                                        value += find(required[index].substr(4), lowerDescription,
                                                occurrencesDescription);
                                }
                                else if (required[index].find("TITLE ") == 0)
                                {
                                        value += find(required[index].substr(6), lowerDescription,
                                                occurrencesDescription);
                                }
                                else if (required[index].find("TEXT ") == 0)
                                {
                                        value += find(required[index].substr(5), lowerDescription,
                                                occurrencesDescription);
                                }
                                else
                                {
                                        value += find(required[index], lowerDescription,
                                                occurrencesDescription);
                                }
                        }

                        for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
                        {
                                vector<string> words;

                                unsigned begin = 0, found;
                                do
                                {
                                        found = eitherOr[index1].find(" OR ", begin);

                                        if (found != string::npos)
                                        {
                                                words.push_back(eitherOr[index1].substr(begin, found -
                                                        begin));
                                        }
                                        else
                                        {
                                                words.push_back(eitherOr[index1].substr(begin));
                                        }

                                        begin = found + 4;
                                }
                                while (begin < eitherOr[index1].length() && found !=
                                        string::npos);

                                for (unsigned number = 0; number < words.size(); number++)
                                {
                                        if (words[index1].find("URL ") == 0)
                                        {
                                                value += find(words[index1].substr(4),
                                                        lowerDescription, occurrencesDescription);
                                        }
                                        else if (words[index1].find("TITLE ") == 0)
                                        {
                                                value += find(words[index1].substr(6),
                                                        lowerDescription, occurrencesDescription);
                                        }
                                        else if (words[index1].find("TEXT ") == 0)
                                        {
                                                value += find(words[index1].substr(5),
                                                        lowerDescription, occurrencesDescription);
                                        }
                                        else
                                        {
                                                value += find(words[index1], lowerDescription,
                                                        occurrencesDescription);
                                        }
                                }
                        }

                        for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
                        {
                                string lowerHeading = string(getHeadings()[index2].length(),
                                        ' ');
                                for (unsigned number = 0; number <
                                        getHeadings()[index2].length(); number++)
                                {
                                        lowerHeading[number] = tolower(
                                                getHeadings()[index2][number]);
                                }

                                for (unsigned number0 = 0; number0 < required.size(); number0++)
                                {
                                        if (required[number0].find("URL ") == 0)
                                        {
                                                value += find(required[number0].substr(4),
                                                        lowerHeading);
                                        }
                                        else if (required[number0].find("TITLE ") == 0)
                                        {
                                                value += find(required[number0].substr(6),
                                                        lowerHeading);
                                        }
                                        else if (required[number0].find("TEXT ") == 0)
                                        {
                                                value += find(required[number0].substr(5),
                                                        lowerHeading);
                                        }
                                        else
                                        {
                                                value += find(required[number0], lowerHeading);
                                        }
                                }

                                for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
                                {
                                        vector<string> words;

                                        unsigned begin = 0, found;
                                        do
                                        {
                                                found = eitherOr[number1].find(" OR ", begin);

                                                if (found != string::npos)
                                                {
                                                        words.push_back(eitherOr[number1].substr(begin,
                                                                found - begin));
                                                }
                                                else
                                                {
                                                        words.push_back(eitherOr[number1].substr(begin));
                                                }

                                                begin = found + 4;
                                        }
                                        while (begin < eitherOr[number1].length() && found !=
                                                string::npos);

                                        for (unsigned number = 0; number < words.size(); number++)
                                        {
                                                if (words[number].find("URL ") == 0)
                                                {
                                                        value += find(words[number].substr(4),
                                                                lowerHeading);
                                                }
                                                else if (words[number].find("TITLE ") == 0)
                                                {
                                                        value += find(words[number].substr(6),
                                                                lowerHeading);
                                                }
                                                else if (words[number].find("TEXT ") == 0)
                                                {
                                                        value += find(words[number].substr(5),
                                                                lowerHeading);
                                                }
                                                else
                                                {
                                                        value += find(words[number], lowerHeading);
                                                }
                                        }
                                }
                        }
                }
        }
}

void Ranker::checkRequired()
{
        vector<unsigned> inURLs, inTitles, inTexts;

        for (unsigned index = 0; index < required.size(); index++)
        {
                unsigned inURL = 0, inTitle = 0, inText = 0;

                if (required[index].find("URL ") == 0)
                {
                        inURL = find(required[index].substr(4), lowerURL.substr(7));

                        if (inURL)
                        {
                                inTitle = find(required[index].substr(4), lowerTitle,
                                        occurrencesTitle);
                                inText = find(required[index].substr(4), lowerText,
                                        occurrencesText);

                                if (!inTitle) inTitle++;
                                if (!inText) inText++;
                        }
                }
                else if (required[index].find("TITLE ") == 0)
                {
                        inTitle = find(required[index].substr(6), lowerTitle,
                                occurrencesTitle);

                        if (inTitle)
                        {
                                inURL = find(required[index].substr(6), lowerURL.substr(7));
                                inText = find(required[index].substr(6), lowerText,
                                        occurrencesText);

                                if (!inURL) inURL++;
                                if (!inText) inText++;
                        }
                }
                else if (required[index].find("TEXT ") == 0)
                {
                        inText = find(required[index].substr(5), lowerText,
                                occurrencesText);

                        if (inText)
                        {
                                inURL = find(required[index].substr(5), lowerURL.substr(7));
                                inTitle = find(required[index].substr(5), lowerTitle,
                                        occurrencesTitle);

                                if (!inURL) inURL++;
                                if (!inTitle) inTitle++;
                        }
                }
                else
                {
                        inURL = find(required[index], lowerURL.substr(7));
                        inTitle = find(required[index], lowerTitle, occurrencesTitle);
                        inText = find(required[index], lowerText, occurrencesText);
                }

                inURLs.push_back(inURL);
                inTitles.push_back(inTitle);
                inTexts.push_back(inText);
        }

        unsigned inURL = evaluate(inURLs);
        unsigned inTitle = evaluate(inTitles);
        unsigned inText = evaluate(inTexts);

        requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
                || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
                inText : 0;
}

void Ranker::checkExcluded()
{
        vector<unsigned> inURLs, inTitles, inTexts;

        for (unsigned index = 0; index < excluded.size(); index++)
        {
                unsigned inURL = 0, inTitle = 0, inText = 0;

                inURL = find(excluded[index], lowerURL.substr(7));
                inTitle = find(excluded[index], lowerTitle);
                inText = find(excluded[index], lowerText);

                inURLs.push_back(inURL);
                inTitles.push_back(inTitle);
                inTexts.push_back(inText);
        }

        unsigned inURL = evaluate(inURLs);
        unsigned inTitle = evaluate(inTitles);
        unsigned inText = evaluate(inTexts);

        excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
                || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
                inText : 0;
}

void Ranker::checkEitherOr()
{
        vector<unsigned> inURLs, inTitles, inTexts;

        for (unsigned index = 0; index < eitherOr.size(); index++)
        {
                vector<unsigned> inURLz, inTitlez, inTextz;
                unsigned inURL = 0, inTitle = 0, inText = 0;
                vector<string> words;

                unsigned begin = 0, found;
                do
                {
                        found = eitherOr[index].find(" OR ", begin);

                        if (found != string::npos)
                        {
                                words.push_back(eitherOr[index].substr(begin, found - begin));
                        }
                        else
                        {
                                words.push_back(eitherOr[index].substr(begin));
                        }

                        begin = found + 4;
                }
                while (begin < eitherOr[index].length() && found != string::npos);

                for (unsigned number = 0; number < words.size(); number++)
                {
                        unsigned inURL = 0, inTitle = 0, inText = 0;

                        if (words[number].find("URL ") == 0)
                        {
                                inURL = find(words[number].substr(4), lowerURL.substr(7));

                                if (inURL)
                                {
                                        inTitle = find(words[number].substr(4), lowerTitle,
                                                occurrencesTitle);
                                        inText = find(words[number].substr(4), lowerText,
                                                occurrencesText);

                                        if (!inTitle) inTitle++;
                                        if (!inText) inText++;
                                }
                        }
                        else if (words[number].find("TITLE ") == 0)
                        {
                                inTitle = find(words[number].substr(6), lowerTitle,
                                        occurrencesTitle);

                                if (inTitle)
                                {
                                        inURL = find(words[number].substr(6), lowerURL.substr(7));
                                        inText = find(words[number].substr(6), lowerText,
                                                occurrencesText);

                                        if (!inURL) inURL++;
                                        if (!inText) inText++;
                                }
                        }
                        else if (words[number].find("TEXT ") == 0)
                        {
                                inText = find(words[number].substr(5), lowerText,
                                        occurrencesText);

                                if (inText)
                                {
                                        inURL = find(words[number].substr(5), lowerURL.substr(7));
                                        inTitle = find(words[number].substr(5), lowerTitle,
                                                occurrencesTitle);

                                        if (!inURL) inURL++;
                                        if (!inTitle) inTitle++;
                                }
                        }
                        else
                        {
                                inURL = find(words[number], lowerURL.substr(7));
                                inTitle = find(words[number], lowerTitle, occurrencesTitle);
                                inText = find(words[number], lowerText, occurrencesText);
                        }

                        inURLz.push_back(inURL);
                        inTitlez.push_back(inTitle);
                        inTextz.push_back(inText);
                }

                for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
                {
                        inURL += inURLz[number0];
                }

                for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
                {
                        inTitle += inTitlez[number1];
                }

                for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
                {
                        inText += inTextz[number2];
                }

                inURLs.push_back(inURL);
                inTitles.push_back(inTitle);
                inTexts.push_back(inText);

                inURLz.clear();
                inTitlez.clear();
                inTextz.clear();
                words.clear();
        }

        unsigned inURL = evaluate(inURLs);
        unsigned inTitle = evaluate(inTitles);
        unsigned inText = evaluate(inTexts);

        eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
                || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
                inText : 0;
}

unsigned Ranker::find(string word, const string& where)
{
        unsigned value = 0;

        decrap(word);

        if (word == "")
        {
                // this can happen if a word is all crap characters
                value++;
        }
        else if (word.find_first_of(" \n        ") == string::npos)
        {
                unsigned begin = 0, found;
                do
                {
                        found = where.find(word, begin);

                        if (found != string::npos)
                        {
                                bool isBefore, isAfter, before = false, after = false;
                                isBefore = found - 1 > 0;
                                isAfter = found + word.length() < where.length();

                                if (isBefore) before = isalnum(where[found - 1]) != 0;
                                if (isAfter) after = isalnum(where[found + word.length()]) != 0;

                                if (!before && !after)
                                {
                                        value++;
                                }
                        }

                        begin = found + word.length();
                }
                while (found != string::npos && begin < where.length());
        }
        else
        {
                value = phrase(word, where);
        }

        return value;
}

unsigned Ranker::find(string word, const string& where, map<unsigned,
        unsigned>& occurrences)
{
        unsigned value = 0;

        decrap(word);

        if (word == "")
        {
                // this can happen if a word is all crap characters
                value++;
        }
        else if (word.find_first_of(" \n        ") == string::npos)
        {
                unsigned begin = 0, found;
                do
                {
                        found = where.find(word, begin);

                        if (found != string::npos)
                        {
                                bool isBefore, isAfter, before = false, after = false;
                                isBefore = found - 1 > 0;
                                isAfter = found + word.length() < where.length();

                                if (isBefore) before = isalnum(where[found - 1]) != 0;
                                if (isAfter) after = isalnum(where[found + word.length()]) != 0;

                                if (!before && !after)
                                {
                                        value++;

                                        occurrences.insert(pair<unsigned, unsigned>(found,
                                                word.length()));
                                }
                        }

                        begin = found + word.length();
                }
                while (found != string::npos && begin < where.length());
        }
        else
        {
                value = phrase(word, where, occurrences);
        }

        return value;
}

unsigned Ranker::phrase(const string& phrase, const string& where)
{
        unsigned value = 0;
        vector<string> words;

        unsigned begin = 0, space;
        do
        {
                space = phrase.find(' ', begin);

                words.push_back(phrase.substr(begin, space - begin));

                begin = space + 1;
        }
        while (space != string::npos && begin < phrase.length());

        begin = 0;
        unsigned counter = 0;
        do
        {
                value += this->phrase(words, 0, begin, true, where);
        }
        while (begin < where.length());

        return value;
}

unsigned Ranker::phrase(const string& phrase, const string& where,
        map<unsigned, unsigned>& occurrences)
{
        unsigned value = 0;
        vector<string> words;

        unsigned begin = 0, space;
        do
        {
                space = phrase.find(' ', begin);

                words.push_back(phrase.substr(begin, space - begin));

                begin = space + 1;
        }
        while (space != string::npos && begin < phrase.length());

        begin = 0;
        do
        {
                value += this->phrase(words, 0, begin, true, where, occurrences);
        }
        while (begin < where.length());

        return value;
}

unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
        begin, bool start, const string& where)
{
        unsigned value = 0;
        bool end = !(word + 1 < words.size());
        unsigned found = where.find(words[word], begin);
        unsigned newBegin = found + words[word].length();

        if (found != string::npos)
        {
                bool isBefore, isAfter, before = false, after = false;
                isBefore = found - 1 > 0;
                isAfter = found + words[word].length() < where.length();

                if (isBefore) before = isalnum(where[found - 1]) != 0;
                if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;

                if (!before && !after)
                {
                        bool between = true;
                        if (!start)
                        {
                                for (unsigned index = begin + 1; index < found - 1; index++)
                                {
                                        if (isalnum(where[index]))
                                        {
                                                between = false;
                                                break;
                                        }
                                }
                        }

                        if (between)
                        {
                                if (end)
                                {
                                        begin = newBegin;
                                        value = 1;
                                }
                                else
                                {
                                        value = phrase(words, (word + 1), newBegin, false, where);
                                }
                        }
                }
        }

        if (start)
        {
                if (found != string::npos)
                {
                        begin = newBegin;
                }
                else
                {
                        begin = string::npos;
                }
        }

        return value;
}

unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
        begin, bool start, const string& where, map<unsigned, unsigned>&
        occurrences)
{
        unsigned value = 0;
        bool end = !(word + 1 < words.size());
        unsigned found = where.find(words[word], begin);
        unsigned newBegin = found + words[word].length();

        if (found != string::npos)
        {
                bool isBefore, isAfter, before = false, after = false;
                isBefore = found - 1 > 0;
                isAfter = found + words[word].length() < where.length();

                if (isBefore) before = isalnum(where[found - 1]) != 0;
                if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;

                if (!before && !after)
                {
                        bool between = true;
                        if (!start)
                        {
                                for (unsigned index = begin + 1; index < found - 1; index++)
                                {
                                        if (isalnum(where[index]))
                                        {
                                                between = false;
                                                break;
                                        }
                                }
                        }

                        if (between)
                        {
                                occurrences.insert(pair<unsigned, unsigned>(found,
                                        words[word].length()));

                                if (end)
                                {
                                        begin = newBegin;
                                        value = 1;
                                }
                                else
                                {
                                        value = phrase(words, (word + 1), newBegin, false, where,
                                                occurrences);
                                }
                        }
                }
        }

        if (start)
        {
                if (found != string::npos)
                {
                        begin = newBegin;
                }
                else
                {
                        begin = string::npos;
                }
        }

        return value;
}

unsigned Ranker::evaluate(vector<unsigned>& ins)
{
        unsigned in = 0;

        for (unsigned index = 0; index < ins.size(); index++)
        {
                if (ins[index] > 0)
                {
                        in += ins[index];
                }
                else
                {
                        in = 0;
                        break;
                }
        }

        return in;
}

void Ranker::decrap(string& crap)
{
        unsigned begin = 0, found;
        do
        {
                // &, _, +, and # are not considered crap
                found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);

                if (found != string::npos)
                {
                        crap[found] = ' ';
                }

                begin = found + 1;
        }
        while (found != string::npos && begin < crap.length());

        normalize(crap);
}
Revision:	341
Committed:	2004-04-20T13:20:56-07:00 (21 years, 2 months ago) by Douglas Thrift
File size:	27192 byte(s)
Log Message:	Fixed sample output so it's not empty ever.