ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 1
Committed: 2002-12-04T20:22:59-08:00 (22 years, 6 months ago) by douglas
File size: 28617 byte(s)
Log Message:
Initial revision

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Ranker
46     //
47     // Douglas Thrift
48     //
49     // Ranker.cpp
50    
51     #include "Ranker.h"
52    
53     Ranker::Ranker()
54     {
55     value = 0;
56     requiredValue = 0;
57     excludedValue = 0;
58     eitherOrValue = 0;
59     allIn = all;
60     }
61    
62     Ranker::Ranker(Page& page) : Page(page)
63     {
64     value = 0;
65     requiredValue = 0;
66     excludedValue = 0;
67     eitherOrValue = 0;
68     allIn = all;
69     }
70    
71     void Ranker::rank(vector<string> query)
72     {
73     vector<string> prep;
74    
75     for (unsigned index = 0; index < query.size(); index++)
76     {
77     if (query[index] == "allintitle:" && index == 0)
78     {
79     allIn = title;
80     }
81     else if (query[index] == "allinurl:" && index == 0)
82     {
83     allIn = url;
84     }
85     else if (query[index] == "allintext:" && index == 0)
86     {
87     allIn = text;
88     }
89     else if (query[index].find("site:") == 0 && query[index].size() > 5)
90     {
91     site = query[index].substr(5);
92     }
93     else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
94     {
95     prep.push_back("TITLE " + query[index].substr(8));
96     }
97     else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
98     {
99     prep.push_back("URL " + query[index].substr(6));
100     }
101     else if (query[index].find("intext:") == 0 && query[index].size() > 7)
102     {
103     prep.push_back("TEXT " + query[index].substr(7));
104     }
105     else
106     {
107     prep.push_back(query[index]);
108     }
109     }
110    
111     if (prep.size() > 0)
112     {
113     bool or_ = false;
114     for (unsigned index = 0; index < prep.size(); index++)
115     {
116     bool exclude = false;
117     if (prep[index].find('+') == 0)
118     {
119     prep[index].erase(0, 1);
120     }
121     else if (prep[index].find('-') == 0)
122     {
123     exclude = true;
124     prep[index].erase(0, 1);
125     }
126    
127     if (or_)
128     {
129     if (prep[index].find(" OR") == string::npos)
130     {
131     or_ = false;
132     }
133    
134     eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
135     }
136     else if (exclude)
137     {
138     excluded.push_back(prep[index]);
139     }
140     else if (prep[index].find(" OR") != string::npos)
141     {
142     or_ = true;
143     eitherOr.push_back(prep[index]);
144     }
145     else
146     {
147     required.push_back(prep[index]);
148     }
149     }
150     }
151    
152     rank();
153     }
154    
// Appends to `sample` an HTML excerpt of the page text (getText()) built
// around the matches recorded in occurrencesText (start position -> length).
// Matched text is wrapped in <strong> tags, skipped text is marked with a bold
// "...", and the amount of page text copied is budgeted against 160 characters
// (tracked in sampleLength).
155     void Ranker::setSample()
156     {
157     map<unsigned, unsigned>::iterator itor;
158    
// Keyed by the gap between an occurrence's end and the next occurrence's
// start, so begin() is the occurrence most closely followed by another one.
159     multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
160    
161     for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
162     {
163     unsigned distance;
164    
// Peek at the following occurrence, then step back to the current one.
165     if (++itor != occurrencesText.end())
166     {
167     unsigned next = itor->first;
168     itor--;
169    
170     distance = next - (itor->first + itor->second);
171     }
172     else
173     {
// The last occurrence has no successor: give it the largest distance.
174     distance = UINT_MAX;
175     itor--;
176     }
177    
178     distances.insert(pair<unsigned, map<unsigned,
179     unsigned>::iterator>(distance, itor));
180     }
181    
// Start the excerpt at the occurrence with the smallest gap. If there are no
// occurrences, itor is already end() and the loop below does not run.
182     if (distances.begin() != distances.end())
183     {
184     itor = distances.begin()->second;
185     }
186    
187     string portion;
// begin: index of the first text character not yet emitted.
// end: index of the last text character emitted so far. It starts as
// string::npos converted to unsigned, so that end + 1 wraps around to 0.
188     unsigned sampleLength = 0, begin = 0, end = string::npos;
189     while (sampleLength < 160 && itor != occurrencesText.end())
190     {
191     unsigned found = itor->first;
192     unsigned length = itor->second;
193    
// Walk backwards from the match to decide where the leading context starts.
194     for (unsigned index = found; index > begin; index--)
195     {
// NOTE(review): this can never fire - the loop condition guarantees
// index > begin.
196     if (index == begin) cerr << "Oh crap, I'm insane!\n";
// Gone back as far as the remaining budget allows: move forward again to the
// next whitespace and start right after it.
197     if (found - index >= 160 - sampleLength - length)
198     {
199     for (; index < found; index++)
200     {
201     if (isspace(getText()[index])) break;
202     }
203     begin = index + 1;
204     break;
205     }
// Otherwise start at an uppercase character that is not preceded by an
// alphanumeric one (and is not the match itself). Inside this loop
// index > begin always holds, so only the first arm of the ?: is evaluated.
206     else if ((index > begin ? (isupper(getText()[index]) &&
207     !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
208     index != found)
209     {
210     begin = index;
211     break;
212     }
213     }
214    
// Text was skipped between the previous output and this start: mark the gap.
215     if (end + 1 != begin) sample += " <strong>...</strong> ";
216    
// Leading context, up to the match.
217     portion = getText().substr(begin, found - begin);
218     sampleLength += portion.length();
219    
// entities() is defined elsewhere; presumably it replaces each occurrence of
// the given character in `portion` with the given HTML entity - confirm.
220     entities(portion, '&', "&amp;");
221     entities(portion, '\"', "&quot;");
222     entities(portion, '<', "&lt;");
223     entities(portion, '>', "&gt;");
224    
225     sample += portion + "<strong>";
226    
// The match itself, emitted inside <strong>...</strong>.
227     portion = getText().substr(found, length);
228     sampleLength += portion.length();
229    
230     entities(portion, '&', "&amp;");
231     entities(portion, '\"', "&quot;");
232     entities(portion, '<', "&lt;");
233     entities(portion, '>', "&gt;");
234    
235     sample += portion + "</strong>";
236    
237     begin = found + length;
238     end = begin - 1;
239    
// Advance to the next occurrence; what follows depends on whether it exists
// and whether it ends within the remaining budget.
240     if (++itor != occurrencesText.end())
241     {
242     if (itor->first + itor->second < begin + 160 - sampleLength)
243     {
// The next match fits: emit the text between the two matches and let the next
// loop iteration emit that match.
244     portion = getText().substr(begin, itor->first - begin);
245     sampleLength += portion.length();
246    
247     entities(portion, '&', "&amp;");
248     entities(portion, '\"', "&quot;");
249     entities(portion, '<', "&lt;");
250     entities(portion, '>', "&gt;");
251    
252     sample += portion;
253    
254     begin = itor->first;
255     end = begin - 1;
256     }
257     else
258     {
// The next match does not fit: emit trailing context, cut back to the last
// whitespace within the budget, add an ellipsis and stop.
259     for (end = begin + 160 - sampleLength; end > begin; end--)
260     {
261     if (isspace(getText()[end])) break;
262     }
263    
264     portion = getText().substr(begin, end - begin + 1);
265     sampleLength += portion.length();
266    
267     entities(portion, '&', "&amp;");
268     entities(portion, '\"', "&quot;");
269     entities(portion, '<', "&lt;");
270     entities(portion, '>', "&gt;");
271    
272     sample += portion + " <strong>...</strong>";
273    
274     break;
275     }
276     }
277     else
278     {
// No further matches: emit trailing context. While the candidate end lies
// beyond the text it is stepped back; otherwise it is cut back to whitespace.
279     for (end = begin + 160 - sampleLength; end > begin && (end + 1 <
280     getText().length()); end--)
281     {
282     if (isspace(getText()[end])) break;
283     }
284    
285     if (end >= getText().length()) end = getText().length() - 1;
286    
287     portion = getText().substr(begin, end - begin + 1);
288     sampleLength += portion.length();
289    
290     entities(portion, '&', "&amp;");
291     entities(portion, '\"', "&quot;");
292     entities(portion, '<', "&lt;");
293     entities(portion, '>', "&gt;");
294    
295     sample += portion;
296    
// Only add the closing ellipsis if page text remains after the excerpt.
297     if (end + 1 < getText().length())
298     {
299     sample += " <strong>...</strong>";
300     }
301    
302     break;
303     }
304     }
305     }
306    
307     string Ranker::getTitle()
308     {
309     string title, portion;
310    
311     unsigned begin = 0;
312     for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
313     itor != occurrencesTitle.end(); itor++)
314     {
315     unsigned found = itor->first;
316     unsigned length = itor->second;
317    
318     portion = Page::getTitle().substr(begin, found - begin);
319    
320     entities(portion, '&', "&amp;");
321     entities(portion, '\"', "&quot;");
322     entities(portion, '<', "&lt;");
323     entities(portion, '>', "&gt;");
324    
325     title += portion + "<strong>";
326    
327     portion = Page::getTitle().substr(found, length);
328    
329     entities(portion, '&', "&amp;");
330     entities(portion, '\"', "&quot;");
331     entities(portion, '<', "&lt;");
332     entities(portion, '>', "&gt;");
333    
334     title += portion + "</strong>";
335    
336     begin = found + length;
337     }
338    
339     portion = Page::getTitle().substr(begin);
340    
341     entities(portion, '&', "&amp;");
342     entities(portion, '\"', "&quot;");
343     entities(portion, '<', "&lt;");
344     entities(portion, '>', "&gt;");
345    
346     title += portion;
347    
348     return title;
349     }
350    
351     string Ranker::getDescription()
352     {
353     string description, portion;
354    
355     unsigned begin = 0;
356     for (map<unsigned, unsigned>::iterator itor =
357     occurrencesDescription.begin(); itor != occurrencesDescription.end();
358     itor++)
359     {
360     unsigned found = itor->first;
361     unsigned length = itor->second;
362    
363     portion = Page::getDescription().substr(begin, found - begin);
364    
365     entities(portion, '&', "&amp;");
366     entities(portion, '\"', "&quot;");
367     entities(portion, '<', "&lt;");
368     entities(portion, '>', "&gt;");
369    
370     description += portion + "<strong>";
371    
372     portion = Page::getDescription().substr(found, length);
373    
374     entities(portion, '&', "&amp;");
375     entities(portion, '\"', "&quot;");
376     entities(portion, '<', "&lt;");
377     entities(portion, '>', "&gt;");
378    
379     description += portion + "</strong>";
380    
381     begin = found + length;
382     }
383    
384     portion = Page::getDescription().substr(begin);
385    
386     entities(portion, '&', "&amp;");
387     entities(portion, '\"', "&quot;");
388     entities(portion, '<', "&lt;");
389     entities(portion, '>', "&gt;");
390    
391     description += portion;
392    
393     return description;
394     }
395    
396     bool Ranker::operator==(const unsigned number) const
397     {
398     return value == number;
399     }
400    
401     bool Ranker::operator==(const Ranker& ranker) const
402     {
403     return value == ranker.value;
404     }
405    
406     bool Ranker::operator!=(const unsigned number) const
407     {
408     return value != number;
409     }
410    
411     bool Ranker::operator!=(const Ranker& ranker) const
412     {
413     return value != ranker.value;
414     }
415    
416     bool Ranker::operator<(const unsigned number) const
417     {
418     return value < number;
419     }
420    
421     bool Ranker::operator<(const Ranker& ranker) const
422     {
423     return value < ranker.value;
424     }
425    
426     bool Ranker::operator>(const unsigned number) const
427     {
428     return value > number;
429     }
430    
431     bool Ranker::operator >(const Ranker& ranker) const
432     {
433     return value > ranker.value;
434     }
435    
436     void Ranker::rank()
437     {
438     lowerAddress = string(getAddress().length(), ' ');
439     for (unsigned index = 0; index < lowerAddress.length(); index++)
440     {
441     lowerAddress[index] = tolower(getAddress()[index]);
442     }
443    
444     if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
445     site.length())
446     {
447     bool isRequired = required.size() > 0;
448     bool isExcluded = excluded.size() > 0;
449     bool isEitherOr = eitherOr.size() > 0;
450    
451     lowerURL = string(getURL().length(), ' ');
452     for (unsigned index = 0; index < lowerURL.length(); index++)
453     {
454     lowerURL[index] = tolower(getURL()[index]);
455     }
456    
457     lowerTitle = string(Page::getTitle().length(), ' ');
458     for (unsigned index0 = 0; index0 < lowerTitle.length(); index0++)
459     {
460     lowerTitle[index0] = tolower(Page::getTitle()[index0]);
461     }
462    
463     lowerText = string(Page::getText().length(), ' ');
464     for (unsigned index1 = 0; index1 < lowerText.length(); index1++)
465     {
466     lowerText[index1] = tolower(Page::getText()[index1]);
467     }
468    
469     if (isRequired) checkRequired();
470     if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
471     if (isEitherOr) checkEitherOr();
472    
473     if (isRequired && isExcluded && isEitherOr)
474     {
475     value += requiredValue && !excludedValue && eitherOrValue ?
476     requiredValue + eitherOrValue : 0;
477     }
478     else if (isRequired && isExcluded)
479     {
480     value += requiredValue && !excludedValue ? requiredValue : 0;
481     }
482     else if (isRequired && isEitherOr)
483     {
484     value += requiredValue && eitherOrValue ? requiredValue +
485     eitherOrValue : 0;
486     }
487     else if (isExcluded && isEitherOr)
488     {
489     value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
490     }
491     else if (isRequired)
492     {
493     value += requiredValue;
494     }
495     else if (isEitherOr)
496     {
497     value += eitherOrValue;
498     }
499     else
500     {
501     // do nothing this is a bad search and warrants no results
502     }
503    
504     if (value > 0)
505     {
506     string lowerDescription = string(Page::getDescription().length(),
507     ' ');
508     for (unsigned index = 0; index < lowerDescription.length(); index++)
509     {
510     lowerDescription[index] = tolower(
511     Page::getDescription()[index]);
512     }
513    
514     for (unsigned index0 = 0; index0 < required.size(); index0++)
515     {
516     if (required[index0].find("URL ") == 0)
517     {
518     string fred = required[index0].substr(4);
519     value += find(fred, lowerDescription,
520     occurrencesDescription);
521     }
522     else if (required[index0].find("TITLE ") == 0)
523     {
524     string fred = required[index0].substr(6);
525     value += find(fred, lowerDescription,
526     occurrencesDescription);
527     }
528     else if (required[index0].find("TEXT ") == 0)
529     {
530     string fred = required[index0].substr(5);
531     value += find(fred, lowerDescription,
532     occurrencesDescription);
533     }
534     else
535     {
536     value += find(required[index0], lowerDescription,
537     occurrencesDescription);
538     }
539     }
540    
541     for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
542     {
543     vector<string> words;
544    
545     unsigned begin = 0, found;
546     do
547     {
548     found = eitherOr[index1].find(" OR ", begin);
549    
550     if (found != string::npos)
551     {
552     words.push_back(eitherOr[index1].substr(begin, found -
553     begin));
554     }
555     else
556     {
557     words.push_back(eitherOr[index1].substr(begin));
558     }
559    
560     begin = found + 4;
561     }
562     while (begin < eitherOr[index1].length() && found !=
563     string::npos);
564    
565     for (unsigned number = 0; number < words.size(); number++)
566     {
567     if (words[index1].find("URL ") == 0)
568     {
569     string fred = words[index1].substr(4);
570     value += find(fred, lowerDescription,
571     occurrencesDescription);
572     }
573     else if (words[index1].find("TITLE ") == 0)
574     {
575     string fred = words[index1].substr(6);
576     value += find(fred, lowerDescription,
577     occurrencesDescription);
578     }
579     else if (words[index1].find("TEXT ") == 0)
580     {
581     string fred = words[index1].substr(5);
582     value += find(fred, lowerDescription,
583     occurrencesDescription);
584     }
585     else
586     {
587     value += find(words[index1], lowerDescription,
588     occurrencesDescription);
589     }
590     }
591     }
592    
593     for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
594     {
595     string lowerHeading = string(getHeadings()[index2].length(),
596     ' ');
597     for (unsigned number = 0; number <
598     getHeadings()[index2].length(); number++)
599     {
600     lowerHeading[number] = tolower(
601     getHeadings()[index2][number]);
602     }
603    
604     for (unsigned number0 = 0; number0 < required.size(); number0++)
605     {
606     if (required[number0].find("URL ") == 0)
607     {
608     string fred = required[number0].substr(4);
609     value += find(fred,
610     lowerHeading);
611     }
612     else if (required[number0].find("TITLE ") == 0)
613     {
614     string fred = required[number0].substr(6);
615     value += find(fred,
616     lowerHeading);
617     }
618     else if (required[number0].find("TEXT ") == 0)
619     {
620     string fred = required[number0].substr(5);
621     value += find(fred,
622     lowerHeading);
623     }
624     else
625     {
626     value += find(required[number0], lowerHeading);
627     }
628     }
629    
630     for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
631     {
632     vector<string> words;
633    
634     unsigned begin = 0, found;
635     do
636     {
637     found = eitherOr[number1].find(" OR ", begin);
638    
639     if (found != string::npos)
640     {
641     words.push_back(eitherOr[number1].substr(begin,
642     found - begin));
643     }
644     else
645     {
646     words.push_back(eitherOr[number1].substr(begin));
647     }
648    
649     begin = found + 4;
650     }
651     while (begin < eitherOr[number1].length() && found !=
652     string::npos);
653    
654     for (unsigned number = 0; number < words.size(); number++)
655     {
656     if (words[number].find("URL ") == 0)
657     {
658     string fred = words[number].substr(4);
659     value += find(fred,
660     lowerHeading);
661     }
662     else if (words[number].find("TITLE ") == 0)
663     {
664     string fred = words[number].substr(6);
665     value += find(fred,
666     lowerHeading);
667     }
668     else if (words[number].find("TEXT ") == 0)
669     {
670     string fred = words[number].substr(5);
671     value += find(fred,
672     lowerHeading);
673     }
674     else
675     {
676     value += find(words[number], lowerHeading);
677     }
678     }
679     }
680     }
681     }
682     }
683     }
684    
685     void Ranker::checkRequired()
686     {
687     vector<unsigned> inURLs, inTitles, inTexts;
688    
689     for (unsigned index = 0; index < required.size(); index++)
690     {
691     unsigned inURL = 0, inTitle = 0, inText = 0;
692    
693     if (required[index].find("URL ") == 0)
694     {
695     string fred = required[index].substr(4);
696     string martha = lowerURL.substr(7);
697     inURL = find(fred, martha);
698    
699     if (inURL)
700     {
701     string fred = required[index].substr(4);
702     inTitle = find(fred, lowerTitle,
703     occurrencesTitle);
704     string martha = required[index].substr(4);
705     inText = find(martha, lowerText,
706     occurrencesText);
707    
708     if (!inTitle) inTitle++;
709     if (!inText) inText++;
710     }
711     }
712     else if (required[index].find("TITLE ") == 0)
713     {
714     string fred = required[index].substr(6);
715     inTitle = find(fred, lowerTitle,
716     occurrencesTitle);
717    
718     if (inTitle)
719     {
720     string fred = required[index].substr(6);
721     string martha = lowerURL.substr(7);
722     inURL = find(fred, martha);
723     string george = required[index].substr(6);
724     inText = find(george, lowerText,
725     occurrencesText);
726    
727     if (!inURL) inURL++;
728     if (!inText) inText++;
729     }
730     }
731     else if (required[index].find("TEXT ") == 0)
732     {
733     string fred = required[index].substr(5);
734     inText = find(fred, lowerText,
735     occurrencesText);
736    
737     if (inText)
738     {
739     string fred = required[index].substr(5);
740     string martha = lowerURL.substr(7);
741     inURL = find(fred, martha);
742     string george = required[index].substr(5);
743     inTitle = find(george, lowerTitle,
744     occurrencesTitle);
745    
746     if (!inURL) inURL++;
747     if (!inTitle) inTitle++;
748     }
749     }
750     else
751     {
752     string fred = lowerURL.substr(7);
753     inURL = find(required[index], fred);
754     inTitle = find(required[index], lowerTitle, occurrencesTitle);
755     inText = find(required[index], lowerText, occurrencesText);
756     }
757    
758     inURLs.push_back(inURL);
759     inTitles.push_back(inTitle);
760     inTexts.push_back(inText);
761     }
762    
763     unsigned inURL = evaluate(inURLs);
764     unsigned inTitle = evaluate(inTitles);
765     unsigned inText = evaluate(inTexts);
766    
767     requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
768     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
769     inText : 0;
770     }
771    
772     void Ranker::checkExcluded()
773     {
774     vector<unsigned> inURLs, inTitles, inTexts;
775    
776     for (unsigned index = 0; index < excluded.size(); index++)
777     {
778     unsigned inURL = 0, inTitle = 0, inText = 0;
779    
780     string fred = lowerURL.substr(7);
781     inURL = find(excluded[index], fred);
782     inTitle = find(excluded[index], lowerTitle);
783     inText = find(excluded[index], lowerText);
784    
785     inURLs.push_back(inURL);
786     inTitles.push_back(inTitle);
787     inTexts.push_back(inText);
788     }
789    
790     unsigned inURL = evaluate(inURLs);
791     unsigned inTitle = evaluate(inTitles);
792     unsigned inText = evaluate(inTexts);
793    
794     excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
795     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
796     inText : 0;
797     }
798    
799     void Ranker::checkEitherOr()
800     {
801     vector<unsigned> inURLs, inTitles, inTexts;
802    
803     for (unsigned index = 0; index < eitherOr.size(); index++)
804     {
805     vector<unsigned> inURLz, inTitlez, inTextz;
806     unsigned inURL = 0, inTitle = 0, inText = 0;
807     vector<string> words;
808    
809     unsigned begin = 0, found;
810     do
811     {
812     found = eitherOr[index].find(" OR ", begin);
813    
814     if (found != string::npos)
815     {
816     words.push_back(eitherOr[index].substr(begin, found - begin));
817     }
818     else
819     {
820     words.push_back(eitherOr[index].substr(begin));
821     }
822    
823     begin = found + 4;
824     }
825     while (begin < eitherOr[index].length() && found != string::npos);
826    
827     for (unsigned number = 0; number < words.size(); number++)
828     {
829     unsigned inURL = 0, inTitle = 0, inText = 0;
830    
831     if (words[number].find("URL ") == 0)
832     {
833     string fred = words[number].substr(4);
834     string martha = lowerURL.substr(7);
835     inURL = find(fred, martha);
836    
837     if (inURL)
838     {
839     string fred = words[number].substr(4);
840     inTitle = find(fred, lowerTitle,
841     occurrencesTitle);
842     string martha = words[number].substr(4);
843     inText = find(martha, lowerText,
844     occurrencesText);
845    
846     if (!inTitle) inTitle++;
847     if (!inText) inText++;
848     }
849     }
850     else if (words[number].find("TITLE ") == 0)
851     {
852     string fred = words[number].substr(6);
853     inTitle = find(fred, lowerTitle,
854     occurrencesTitle);
855    
856     if (inTitle)
857     {
858     string fred = words[number].substr(6);
859     string martha = lowerURL.substr(7);
860     inURL = find(fred, martha);
861     string george = words[number].substr(6);
862     inText = find(george, lowerText,
863     occurrencesText);
864    
865     if (!inURL) inURL++;
866     if (!inText) inText++;
867     }
868     }
869     else if (words[number].find("TEXT ") == 0)
870     {
871     string fred = words[number].substr(5);
872     inText = find(fred, lowerText,
873     occurrencesText);
874    
875     if (inText)
876     {
877     string fred = words[number].substr(5);
878     string martha = lowerURL.substr(7);
879     inURL = find(fred, martha);
880     string george = words[number].substr(5);
881     inTitle = find(george, lowerTitle,
882     occurrencesTitle);
883    
884     if (!inURL) inURL++;
885     if (!inTitle) inTitle++;
886     }
887     }
888     else
889     {
890     string fred = lowerURL.substr(7);
891     inURL = find(words[number], fred);
892     inTitle = find(words[number], lowerTitle, occurrencesTitle);
893     inText = find(words[number], lowerText, occurrencesText);
894     }
895    
896     inURLz.push_back(inURL);
897     inTitlez.push_back(inTitle);
898     inTextz.push_back(inText);
899     }
900    
901     for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
902     {
903     inURL += inURLz[number0];
904     }
905    
906     for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
907     {
908     inTitle += inTitlez[number1];
909     }
910    
911     for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
912     {
913     inText += inTextz[number2];
914     }
915    
916     inURLs.push_back(inURL);
917     inTitles.push_back(inTitle);
918     inTexts.push_back(inText);
919    
920     inURLz.clear();
921     inTitlez.clear();
922     inTextz.clear();
923     words.clear();
924     }
925    
926     unsigned inURL = evaluate(inURLs);
927     unsigned inTitle = evaluate(inTitles);
928     unsigned inText = evaluate(inTexts);
929    
930     eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
931     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
932     inText : 0;
933     }
934    
935     unsigned Ranker::find(string& word, string& where)
936     {
937     unsigned value = 0;
938    
939     decrap(word);
940    
941     if (word == "")
942     {
943     // this can happen if a word is all crap characters
944     value++;
945     }
946     else if (word.find_first_of(" \n ") == string::npos)
947     {
948     unsigned begin = 0, found;
949     do
950     {
951     found = where.find(word, begin);
952    
953     if (found != string::npos)
954     {
955     bool isBefore, isAfter, before = false, after = false;
956     isBefore = found - 1 > 0;
957     isAfter = found + word.length() < where.length();
958    
959     if (isBefore) before = isalnum(where[found - 1]) != 0;
960     if (isAfter) after = isalnum(where[found + word.length()]) != 0;
961    
962     if (!before && !after)
963     {
964     value++;
965     }
966     }
967    
968     begin = found + word.length();
969     }
970     while (found != string::npos && begin < where.length());
971     }
972     else
973     {
974     value = phrase(word, where);
975     }
976    
977     return value;
978     }
979    
980     unsigned Ranker::find(string& word, string& where, map<unsigned, unsigned>&
981     occurrences)
982     {
983     unsigned value = 0;
984    
985     decrap(word);
986    
987     if (word == "")
988     {
989     // this can happen if a word is all crap characters
990     value++;
991     }
992     else if (word.find_first_of(" \n ") == string::npos)
993     {
994     unsigned begin = 0, found;
995     do
996     {
997     found = where.find(word, begin);
998    
999     if (found != string::npos)
1000     {
1001     bool isBefore, isAfter, before = false, after = false;
1002     isBefore = found - 1 > 0;
1003     isAfter = found + word.length() < where.length();
1004    
1005     if (isBefore) before = isalnum(where[found - 1]) != 0;
1006     if (isAfter) after = isalnum(where[found + word.length()]) != 0;
1007    
1008     if (!before && !after)
1009     {
1010     value++;
1011    
1012     occurrences.insert(pair<unsigned, unsigned>(found,
1013     word.length()));
1014     }
1015     }
1016    
1017     begin = found + word.length();
1018     }
1019     while (found != string::npos && begin < where.length());
1020     }
1021     else
1022     {
1023     value = phrase(word, where, occurrences);
1024     }
1025    
1026     return value;
1027     }
1028    
1029     unsigned Ranker::phrase(string& phrase, string& where)
1030     {
1031     unsigned value = 0;
1032     vector<string> words;
1033    
1034     unsigned begin = 0, space;
1035     do
1036     {
1037     space = phrase.find(' ', begin);
1038    
1039     words.push_back(phrase.substr(begin, space - begin));
1040    
1041     begin = space + 1;
1042     }
1043     while (space != string::npos && begin < phrase.length());
1044    
1045     begin = 0;
1046     unsigned counter = 0;
1047     do
1048     {
1049     value += this->phrase(words, 0, begin, true, where);
1050     }
1051     while (begin < where.length());
1052    
1053     return value;
1054     }
1055    
1056     unsigned Ranker::phrase(string& phrase, string& where, map<unsigned, unsigned>&
1057     occurrences)
1058     {
1059     unsigned value = 0;
1060     vector<string> words;
1061    
1062     unsigned begin = 0, space;
1063     do
1064     {
1065     space = phrase.find(' ', begin);
1066    
1067     words.push_back(phrase.substr(begin, space - begin));
1068    
1069     begin = space + 1;
1070     }
1071     while (space != string::npos && begin < phrase.length());
1072    
1073     begin = 0;
1074     do
1075     {
1076     value += this->phrase(words, 0, begin, true, where, occurrences);
1077     }
1078     while (begin < where.length());
1079    
1080     return value;
1081     }
1082    
1083     unsigned Ranker::phrase(vector<string>& words, unsigned word, unsigned& begin,
1084     bool start, string& where)
1085     {
1086     unsigned value = 0;
1087     bool end = !(word + 1 < words.size());
1088     unsigned found = where.find(words[word], begin);
1089     unsigned newBegin = found + words[word].length();
1090    
1091     if (found != string::npos)
1092     {
1093     bool isBefore, isAfter, before = false, after = false;
1094     isBefore = found - 1 > 0;
1095     isAfter = found + words[word].length() < where.length();
1096    
1097     if (isBefore) before = isalnum(where[found - 1]) != 0;
1098     if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1099    
1100     if (!before && !after)
1101     {
1102     bool between = true;
1103     if (!start)
1104     {
1105     for (unsigned index = begin + 1; index < found - 1; index++)
1106     {
1107     if (isalnum(where[index]))
1108     {
1109     between = false;
1110     break;
1111     }
1112     }
1113     }
1114    
1115     if (between)
1116     {
1117     if (end)
1118     {
1119     begin = newBegin;
1120     value = 1;
1121     }
1122     else
1123     {
1124     value = phrase(words, (word + 1), newBegin, false, where);
1125     }
1126     }
1127     }
1128     }
1129    
1130     if (start)
1131     {
1132     if (found != string::npos)
1133     {
1134     begin = newBegin;
1135     }
1136     else
1137     {
1138     begin = string::npos;
1139     }
1140     }
1141    
1142     return value;
1143     }
1144    
1145     unsigned Ranker::phrase(vector<string>& words, unsigned word, unsigned& begin,
1146     bool start, string& where, map<unsigned, unsigned>& occurrences)
1147     {
1148     unsigned value = 0;
1149     bool end = !(word + 1 < words.size());
1150     unsigned found = where.find(words[word], begin);
1151     unsigned newBegin = found + words[word].length();
1152    
1153     if (found != string::npos)
1154     {
1155     bool isBefore, isAfter, before = false, after = false;
1156     isBefore = found - 1 > 0;
1157     isAfter = found + words[word].length() < where.length();
1158    
1159     if (isBefore) before = isalnum(where[found - 1]) != 0;
1160     if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1161    
1162     if (!before && !after)
1163     {
1164     bool between = true;
1165     if (!start)
1166     {
1167     for (unsigned index = begin + 1; index < found - 1; index++)
1168     {
1169     if (isalnum(where[index]))
1170     {
1171     between = false;
1172     break;
1173     }
1174     }
1175     }
1176    
1177     if (between)
1178     {
1179     occurrences.insert(pair<unsigned, unsigned>(found,
1180     words[word].length()));
1181    
1182     if (end)
1183     {
1184     begin = newBegin;
1185     value = 1;
1186     }
1187     else
1188     {
1189     value = phrase(words, (word + 1), newBegin, false, where,
1190     occurrences);
1191     }
1192     }
1193     }
1194     }
1195    
1196     if (start)
1197     {
1198     if (found != string::npos)
1199     {
1200     begin = newBegin;
1201     }
1202     else
1203     {
1204     begin = string::npos;
1205     }
1206     }
1207    
1208     return value;
1209     }
1210    
1211     unsigned Ranker::evaluate(vector<unsigned>& ins)
1212     {
1213     unsigned in = 0;
1214    
1215     for (unsigned index = 0; index < ins.size(); index++)
1216     {
1217     if (ins[index] > 0)
1218     {
1219     in += ins[index];
1220     }
1221     else
1222     {
1223     in = 0;
1224     break;
1225     }
1226     }
1227    
1228     return in;
1229     }
1230    
1231     void Ranker::decrap(string& crap)
1232     {
1233     unsigned begin = 0, found;
1234     do
1235     {
1236     // &, +, and # are not considered crap
1237     found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^_`{|}~", begin);
1238    
1239     if (found != string::npos)
1240     {
1241     crap[found] = ' ';
1242     }
1243    
1244     begin = found + 1;
1245     }
1246     while (found != string::npos && begin < crap.length());
1247    
1248     normalize(crap);
1249     }