ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 1
Committed: 2002-12-04T20:22:59-08:00 (22 years, 6 months ago) by douglas
File size: 28617 byte(s)
Log Message:
Initial revision

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // Ranker.cpp
50
51 #include "Ranker.h"
52
53 Ranker::Ranker()
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 Ranker::Ranker(Page& page) : Page(page)
63 {
64 value = 0;
65 requiredValue = 0;
66 excludedValue = 0;
67 eitherOrValue = 0;
68 allIn = all;
69 }
70
71 void Ranker::rank(vector<string> query)
72 {
73 vector<string> prep;
74
75 for (unsigned index = 0; index < query.size(); index++)
76 {
77 if (query[index] == "allintitle:" && index == 0)
78 {
79 allIn = title;
80 }
81 else if (query[index] == "allinurl:" && index == 0)
82 {
83 allIn = url;
84 }
85 else if (query[index] == "allintext:" && index == 0)
86 {
87 allIn = text;
88 }
89 else if (query[index].find("site:") == 0 && query[index].size() > 5)
90 {
91 site = query[index].substr(5);
92 }
93 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
94 {
95 prep.push_back("TITLE " + query[index].substr(8));
96 }
97 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
98 {
99 prep.push_back("URL " + query[index].substr(6));
100 }
101 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
102 {
103 prep.push_back("TEXT " + query[index].substr(7));
104 }
105 else
106 {
107 prep.push_back(query[index]);
108 }
109 }
110
111 if (prep.size() > 0)
112 {
113 bool or_ = false;
114 for (unsigned index = 0; index < prep.size(); index++)
115 {
116 bool exclude = false;
117 if (prep[index].find('+') == 0)
118 {
119 prep[index].erase(0, 1);
120 }
121 else if (prep[index].find('-') == 0)
122 {
123 exclude = true;
124 prep[index].erase(0, 1);
125 }
126
127 if (or_)
128 {
129 if (prep[index].find(" OR") == string::npos)
130 {
131 or_ = false;
132 }
133
134 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
135 }
136 else if (exclude)
137 {
138 excluded.push_back(prep[index]);
139 }
140 else if (prep[index].find(" OR") != string::npos)
141 {
142 or_ = true;
143 eitherOr.push_back(prep[index]);
144 }
145 else
146 {
147 required.push_back(prep[index]);
148 }
149 }
150 }
151
152 rank();
153 }
154
155 void Ranker::setSample()
156 {
157 map<unsigned, unsigned>::iterator itor;
158
159 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
160
161 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
162 {
163 unsigned distance;
164
165 if (++itor != occurrencesText.end())
166 {
167 unsigned next = itor->first;
168 itor--;
169
170 distance = next - (itor->first + itor->second);
171 }
172 else
173 {
174 distance = UINT_MAX;
175 itor--;
176 }
177
178 distances.insert(pair<unsigned, map<unsigned,
179 unsigned>::iterator>(distance, itor));
180 }
181
182 if (distances.begin() != distances.end())
183 {
184 itor = distances.begin()->second;
185 }
186
187 string portion;
188 unsigned sampleLength = 0, begin = 0, end = string::npos;
189 while (sampleLength < 160 && itor != occurrencesText.end())
190 {
191 unsigned found = itor->first;
192 unsigned length = itor->second;
193
194 for (unsigned index = found; index > begin; index--)
195 {
196 if (index == begin) cerr << "Oh crap, I'm insane!\n";
197 if (found - index >= 160 - sampleLength - length)
198 {
199 for (; index < found; index++)
200 {
201 if (isspace(getText()[index])) break;
202 }
203 begin = index + 1;
204 break;
205 }
206 else if ((index > begin ? (isupper(getText()[index]) &&
207 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
208 index != found)
209 {
210 begin = index;
211 break;
212 }
213 }
214
215 if (end + 1 != begin) sample += " <strong>...</strong> ";
216
217 portion = getText().substr(begin, found - begin);
218 sampleLength += portion.length();
219
220 entities(portion, '&', "&amp;");
221 entities(portion, '\"', "&quot;");
222 entities(portion, '<', "&lt;");
223 entities(portion, '>', "&gt;");
224
225 sample += portion + "<strong>";
226
227 portion = getText().substr(found, length);
228 sampleLength += portion.length();
229
230 entities(portion, '&', "&amp;");
231 entities(portion, '\"', "&quot;");
232 entities(portion, '<', "&lt;");
233 entities(portion, '>', "&gt;");
234
235 sample += portion + "</strong>";
236
237 begin = found + length;
238 end = begin - 1;
239
240 if (++itor != occurrencesText.end())
241 {
242 if (itor->first + itor->second < begin + 160 - sampleLength)
243 {
244 portion = getText().substr(begin, itor->first - begin);
245 sampleLength += portion.length();
246
247 entities(portion, '&', "&amp;");
248 entities(portion, '\"', "&quot;");
249 entities(portion, '<', "&lt;");
250 entities(portion, '>', "&gt;");
251
252 sample += portion;
253
254 begin = itor->first;
255 end = begin - 1;
256 }
257 else
258 {
259 for (end = begin + 160 - sampleLength; end > begin; end--)
260 {
261 if (isspace(getText()[end])) break;
262 }
263
264 portion = getText().substr(begin, end - begin + 1);
265 sampleLength += portion.length();
266
267 entities(portion, '&', "&amp;");
268 entities(portion, '\"', "&quot;");
269 entities(portion, '<', "&lt;");
270 entities(portion, '>', "&gt;");
271
272 sample += portion + " <strong>...</strong>";
273
274 break;
275 }
276 }
277 else
278 {
279 for (end = begin + 160 - sampleLength; end > begin && (end + 1 <
280 getText().length()); end--)
281 {
282 if (isspace(getText()[end])) break;
283 }
284
285 if (end >= getText().length()) end = getText().length() - 1;
286
287 portion = getText().substr(begin, end - begin + 1);
288 sampleLength += portion.length();
289
290 entities(portion, '&', "&amp;");
291 entities(portion, '\"', "&quot;");
292 entities(portion, '<', "&lt;");
293 entities(portion, '>', "&gt;");
294
295 sample += portion;
296
297 if (end + 1 < getText().length())
298 {
299 sample += " <strong>...</strong>";
300 }
301
302 break;
303 }
304 }
305 }
306
307 string Ranker::getTitle()
308 {
309 string title, portion;
310
311 unsigned begin = 0;
312 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
313 itor != occurrencesTitle.end(); itor++)
314 {
315 unsigned found = itor->first;
316 unsigned length = itor->second;
317
318 portion = Page::getTitle().substr(begin, found - begin);
319
320 entities(portion, '&', "&amp;");
321 entities(portion, '\"', "&quot;");
322 entities(portion, '<', "&lt;");
323 entities(portion, '>', "&gt;");
324
325 title += portion + "<strong>";
326
327 portion = Page::getTitle().substr(found, length);
328
329 entities(portion, '&', "&amp;");
330 entities(portion, '\"', "&quot;");
331 entities(portion, '<', "&lt;");
332 entities(portion, '>', "&gt;");
333
334 title += portion + "</strong>";
335
336 begin = found + length;
337 }
338
339 portion = Page::getTitle().substr(begin);
340
341 entities(portion, '&', "&amp;");
342 entities(portion, '\"', "&quot;");
343 entities(portion, '<', "&lt;");
344 entities(portion, '>', "&gt;");
345
346 title += portion;
347
348 return title;
349 }
350
351 string Ranker::getDescription()
352 {
353 string description, portion;
354
355 unsigned begin = 0;
356 for (map<unsigned, unsigned>::iterator itor =
357 occurrencesDescription.begin(); itor != occurrencesDescription.end();
358 itor++)
359 {
360 unsigned found = itor->first;
361 unsigned length = itor->second;
362
363 portion = Page::getDescription().substr(begin, found - begin);
364
365 entities(portion, '&', "&amp;");
366 entities(portion, '\"', "&quot;");
367 entities(portion, '<', "&lt;");
368 entities(portion, '>', "&gt;");
369
370 description += portion + "<strong>";
371
372 portion = Page::getDescription().substr(found, length);
373
374 entities(portion, '&', "&amp;");
375 entities(portion, '\"', "&quot;");
376 entities(portion, '<', "&lt;");
377 entities(portion, '>', "&gt;");
378
379 description += portion + "</strong>";
380
381 begin = found + length;
382 }
383
384 portion = Page::getDescription().substr(begin);
385
386 entities(portion, '&', "&amp;");
387 entities(portion, '\"', "&quot;");
388 entities(portion, '<', "&lt;");
389 entities(portion, '>', "&gt;");
390
391 description += portion;
392
393 return description;
394 }
395
396 bool Ranker::operator==(const unsigned number) const
397 {
398 return value == number;
399 }
400
401 bool Ranker::operator==(const Ranker& ranker) const
402 {
403 return value == ranker.value;
404 }
405
406 bool Ranker::operator!=(const unsigned number) const
407 {
408 return value != number;
409 }
410
411 bool Ranker::operator!=(const Ranker& ranker) const
412 {
413 return value != ranker.value;
414 }
415
416 bool Ranker::operator<(const unsigned number) const
417 {
418 return value < number;
419 }
420
421 bool Ranker::operator<(const Ranker& ranker) const
422 {
423 return value < ranker.value;
424 }
425
426 bool Ranker::operator>(const unsigned number) const
427 {
428 return value > number;
429 }
430
431 bool Ranker::operator >(const Ranker& ranker) const
432 {
433 return value > ranker.value;
434 }
435
436 void Ranker::rank()
437 {
438 lowerAddress = string(getAddress().length(), ' ');
439 for (unsigned index = 0; index < lowerAddress.length(); index++)
440 {
441 lowerAddress[index] = tolower(getAddress()[index]);
442 }
443
444 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
445 site.length())
446 {
447 bool isRequired = required.size() > 0;
448 bool isExcluded = excluded.size() > 0;
449 bool isEitherOr = eitherOr.size() > 0;
450
451 lowerURL = string(getURL().length(), ' ');
452 for (unsigned index = 0; index < lowerURL.length(); index++)
453 {
454 lowerURL[index] = tolower(getURL()[index]);
455 }
456
457 lowerTitle = string(Page::getTitle().length(), ' ');
458 for (unsigned index0 = 0; index0 < lowerTitle.length(); index0++)
459 {
460 lowerTitle[index0] = tolower(Page::getTitle()[index0]);
461 }
462
463 lowerText = string(Page::getText().length(), ' ');
464 for (unsigned index1 = 0; index1 < lowerText.length(); index1++)
465 {
466 lowerText[index1] = tolower(Page::getText()[index1]);
467 }
468
469 if (isRequired) checkRequired();
470 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
471 if (isEitherOr) checkEitherOr();
472
473 if (isRequired && isExcluded && isEitherOr)
474 {
475 value += requiredValue && !excludedValue && eitherOrValue ?
476 requiredValue + eitherOrValue : 0;
477 }
478 else if (isRequired && isExcluded)
479 {
480 value += requiredValue && !excludedValue ? requiredValue : 0;
481 }
482 else if (isRequired && isEitherOr)
483 {
484 value += requiredValue && eitherOrValue ? requiredValue +
485 eitherOrValue : 0;
486 }
487 else if (isExcluded && isEitherOr)
488 {
489 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
490 }
491 else if (isRequired)
492 {
493 value += requiredValue;
494 }
495 else if (isEitherOr)
496 {
497 value += eitherOrValue;
498 }
499 else
500 {
501 // do nothing this is a bad search and warrants no results
502 }
503
504 if (value > 0)
505 {
506 string lowerDescription = string(Page::getDescription().length(),
507 ' ');
508 for (unsigned index = 0; index < lowerDescription.length(); index++)
509 {
510 lowerDescription[index] = tolower(
511 Page::getDescription()[index]);
512 }
513
514 for (unsigned index0 = 0; index0 < required.size(); index0++)
515 {
516 if (required[index0].find("URL ") == 0)
517 {
518 string fred = required[index0].substr(4);
519 value += find(fred, lowerDescription,
520 occurrencesDescription);
521 }
522 else if (required[index0].find("TITLE ") == 0)
523 {
524 string fred = required[index0].substr(6);
525 value += find(fred, lowerDescription,
526 occurrencesDescription);
527 }
528 else if (required[index0].find("TEXT ") == 0)
529 {
530 string fred = required[index0].substr(5);
531 value += find(fred, lowerDescription,
532 occurrencesDescription);
533 }
534 else
535 {
536 value += find(required[index0], lowerDescription,
537 occurrencesDescription);
538 }
539 }
540
541 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
542 {
543 vector<string> words;
544
545 unsigned begin = 0, found;
546 do
547 {
548 found = eitherOr[index1].find(" OR ", begin);
549
550 if (found != string::npos)
551 {
552 words.push_back(eitherOr[index1].substr(begin, found -
553 begin));
554 }
555 else
556 {
557 words.push_back(eitherOr[index1].substr(begin));
558 }
559
560 begin = found + 4;
561 }
562 while (begin < eitherOr[index1].length() && found !=
563 string::npos);
564
565 for (unsigned number = 0; number < words.size(); number++)
566 {
567 if (words[index1].find("URL ") == 0)
568 {
569 string fred = words[index1].substr(4);
570 value += find(fred, lowerDescription,
571 occurrencesDescription);
572 }
573 else if (words[index1].find("TITLE ") == 0)
574 {
575 string fred = words[index1].substr(6);
576 value += find(fred, lowerDescription,
577 occurrencesDescription);
578 }
579 else if (words[index1].find("TEXT ") == 0)
580 {
581 string fred = words[index1].substr(5);
582 value += find(fred, lowerDescription,
583 occurrencesDescription);
584 }
585 else
586 {
587 value += find(words[index1], lowerDescription,
588 occurrencesDescription);
589 }
590 }
591 }
592
593 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
594 {
595 string lowerHeading = string(getHeadings()[index2].length(),
596 ' ');
597 for (unsigned number = 0; number <
598 getHeadings()[index2].length(); number++)
599 {
600 lowerHeading[number] = tolower(
601 getHeadings()[index2][number]);
602 }
603
604 for (unsigned number0 = 0; number0 < required.size(); number0++)
605 {
606 if (required[number0].find("URL ") == 0)
607 {
608 string fred = required[number0].substr(4);
609 value += find(fred,
610 lowerHeading);
611 }
612 else if (required[number0].find("TITLE ") == 0)
613 {
614 string fred = required[number0].substr(6);
615 value += find(fred,
616 lowerHeading);
617 }
618 else if (required[number0].find("TEXT ") == 0)
619 {
620 string fred = required[number0].substr(5);
621 value += find(fred,
622 lowerHeading);
623 }
624 else
625 {
626 value += find(required[number0], lowerHeading);
627 }
628 }
629
630 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
631 {
632 vector<string> words;
633
634 unsigned begin = 0, found;
635 do
636 {
637 found = eitherOr[number1].find(" OR ", begin);
638
639 if (found != string::npos)
640 {
641 words.push_back(eitherOr[number1].substr(begin,
642 found - begin));
643 }
644 else
645 {
646 words.push_back(eitherOr[number1].substr(begin));
647 }
648
649 begin = found + 4;
650 }
651 while (begin < eitherOr[number1].length() && found !=
652 string::npos);
653
654 for (unsigned number = 0; number < words.size(); number++)
655 {
656 if (words[number].find("URL ") == 0)
657 {
658 string fred = words[number].substr(4);
659 value += find(fred,
660 lowerHeading);
661 }
662 else if (words[number].find("TITLE ") == 0)
663 {
664 string fred = words[number].substr(6);
665 value += find(fred,
666 lowerHeading);
667 }
668 else if (words[number].find("TEXT ") == 0)
669 {
670 string fred = words[number].substr(5);
671 value += find(fred,
672 lowerHeading);
673 }
674 else
675 {
676 value += find(words[number], lowerHeading);
677 }
678 }
679 }
680 }
681 }
682 }
683 }
684
685 void Ranker::checkRequired()
686 {
687 vector<unsigned> inURLs, inTitles, inTexts;
688
689 for (unsigned index = 0; index < required.size(); index++)
690 {
691 unsigned inURL = 0, inTitle = 0, inText = 0;
692
693 if (required[index].find("URL ") == 0)
694 {
695 string fred = required[index].substr(4);
696 string martha = lowerURL.substr(7);
697 inURL = find(fred, martha);
698
699 if (inURL)
700 {
701 string fred = required[index].substr(4);
702 inTitle = find(fred, lowerTitle,
703 occurrencesTitle);
704 string martha = required[index].substr(4);
705 inText = find(martha, lowerText,
706 occurrencesText);
707
708 if (!inTitle) inTitle++;
709 if (!inText) inText++;
710 }
711 }
712 else if (required[index].find("TITLE ") == 0)
713 {
714 string fred = required[index].substr(6);
715 inTitle = find(fred, lowerTitle,
716 occurrencesTitle);
717
718 if (inTitle)
719 {
720 string fred = required[index].substr(6);
721 string martha = lowerURL.substr(7);
722 inURL = find(fred, martha);
723 string george = required[index].substr(6);
724 inText = find(george, lowerText,
725 occurrencesText);
726
727 if (!inURL) inURL++;
728 if (!inText) inText++;
729 }
730 }
731 else if (required[index].find("TEXT ") == 0)
732 {
733 string fred = required[index].substr(5);
734 inText = find(fred, lowerText,
735 occurrencesText);
736
737 if (inText)
738 {
739 string fred = required[index].substr(5);
740 string martha = lowerURL.substr(7);
741 inURL = find(fred, martha);
742 string george = required[index].substr(5);
743 inTitle = find(george, lowerTitle,
744 occurrencesTitle);
745
746 if (!inURL) inURL++;
747 if (!inTitle) inTitle++;
748 }
749 }
750 else
751 {
752 string fred = lowerURL.substr(7);
753 inURL = find(required[index], fred);
754 inTitle = find(required[index], lowerTitle, occurrencesTitle);
755 inText = find(required[index], lowerText, occurrencesText);
756 }
757
758 inURLs.push_back(inURL);
759 inTitles.push_back(inTitle);
760 inTexts.push_back(inText);
761 }
762
763 unsigned inURL = evaluate(inURLs);
764 unsigned inTitle = evaluate(inTitles);
765 unsigned inText = evaluate(inTexts);
766
767 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
768 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
769 inText : 0;
770 }
771
772 void Ranker::checkExcluded()
773 {
774 vector<unsigned> inURLs, inTitles, inTexts;
775
776 for (unsigned index = 0; index < excluded.size(); index++)
777 {
778 unsigned inURL = 0, inTitle = 0, inText = 0;
779
780 string fred = lowerURL.substr(7);
781 inURL = find(excluded[index], fred);
782 inTitle = find(excluded[index], lowerTitle);
783 inText = find(excluded[index], lowerText);
784
785 inURLs.push_back(inURL);
786 inTitles.push_back(inTitle);
787 inTexts.push_back(inText);
788 }
789
790 unsigned inURL = evaluate(inURLs);
791 unsigned inTitle = evaluate(inTitles);
792 unsigned inText = evaluate(inTexts);
793
794 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
795 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
796 inText : 0;
797 }
798
799 void Ranker::checkEitherOr()
800 {
801 vector<unsigned> inURLs, inTitles, inTexts;
802
803 for (unsigned index = 0; index < eitherOr.size(); index++)
804 {
805 vector<unsigned> inURLz, inTitlez, inTextz;
806 unsigned inURL = 0, inTitle = 0, inText = 0;
807 vector<string> words;
808
809 unsigned begin = 0, found;
810 do
811 {
812 found = eitherOr[index].find(" OR ", begin);
813
814 if (found != string::npos)
815 {
816 words.push_back(eitherOr[index].substr(begin, found - begin));
817 }
818 else
819 {
820 words.push_back(eitherOr[index].substr(begin));
821 }
822
823 begin = found + 4;
824 }
825 while (begin < eitherOr[index].length() && found != string::npos);
826
827 for (unsigned number = 0; number < words.size(); number++)
828 {
829 unsigned inURL = 0, inTitle = 0, inText = 0;
830
831 if (words[number].find("URL ") == 0)
832 {
833 string fred = words[number].substr(4);
834 string martha = lowerURL.substr(7);
835 inURL = find(fred, martha);
836
837 if (inURL)
838 {
839 string fred = words[number].substr(4);
840 inTitle = find(fred, lowerTitle,
841 occurrencesTitle);
842 string martha = words[number].substr(4);
843 inText = find(martha, lowerText,
844 occurrencesText);
845
846 if (!inTitle) inTitle++;
847 if (!inText) inText++;
848 }
849 }
850 else if (words[number].find("TITLE ") == 0)
851 {
852 string fred = words[number].substr(6);
853 inTitle = find(fred, lowerTitle,
854 occurrencesTitle);
855
856 if (inTitle)
857 {
858 string fred = words[number].substr(6);
859 string martha = lowerURL.substr(7);
860 inURL = find(fred, martha);
861 string george = words[number].substr(6);
862 inText = find(george, lowerText,
863 occurrencesText);
864
865 if (!inURL) inURL++;
866 if (!inText) inText++;
867 }
868 }
869 else if (words[number].find("TEXT ") == 0)
870 {
871 string fred = words[number].substr(5);
872 inText = find(fred, lowerText,
873 occurrencesText);
874
875 if (inText)
876 {
877 string fred = words[number].substr(5);
878 string martha = lowerURL.substr(7);
879 inURL = find(fred, martha);
880 string george = words[number].substr(5);
881 inTitle = find(george, lowerTitle,
882 occurrencesTitle);
883
884 if (!inURL) inURL++;
885 if (!inTitle) inTitle++;
886 }
887 }
888 else
889 {
890 string fred = lowerURL.substr(7);
891 inURL = find(words[number], fred);
892 inTitle = find(words[number], lowerTitle, occurrencesTitle);
893 inText = find(words[number], lowerText, occurrencesText);
894 }
895
896 inURLz.push_back(inURL);
897 inTitlez.push_back(inTitle);
898 inTextz.push_back(inText);
899 }
900
901 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
902 {
903 inURL += inURLz[number0];
904 }
905
906 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
907 {
908 inTitle += inTitlez[number1];
909 }
910
911 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
912 {
913 inText += inTextz[number2];
914 }
915
916 inURLs.push_back(inURL);
917 inTitles.push_back(inTitle);
918 inTexts.push_back(inText);
919
920 inURLz.clear();
921 inTitlez.clear();
922 inTextz.clear();
923 words.clear();
924 }
925
926 unsigned inURL = evaluate(inURLs);
927 unsigned inTitle = evaluate(inTitles);
928 unsigned inText = evaluate(inTexts);
929
930 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
931 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
932 inText : 0;
933 }
934
935 unsigned Ranker::find(string& word, string& where)
936 {
937 unsigned value = 0;
938
939 decrap(word);
940
941 if (word == "")
942 {
943 // this can happen if a word is all crap characters
944 value++;
945 }
946 else if (word.find_first_of(" \n ") == string::npos)
947 {
948 unsigned begin = 0, found;
949 do
950 {
951 found = where.find(word, begin);
952
953 if (found != string::npos)
954 {
955 bool isBefore, isAfter, before = false, after = false;
956 isBefore = found - 1 > 0;
957 isAfter = found + word.length() < where.length();
958
959 if (isBefore) before = isalnum(where[found - 1]) != 0;
960 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
961
962 if (!before && !after)
963 {
964 value++;
965 }
966 }
967
968 begin = found + word.length();
969 }
970 while (found != string::npos && begin < where.length());
971 }
972 else
973 {
974 value = phrase(word, where);
975 }
976
977 return value;
978 }
979
980 unsigned Ranker::find(string& word, string& where, map<unsigned, unsigned>&
981 occurrences)
982 {
983 unsigned value = 0;
984
985 decrap(word);
986
987 if (word == "")
988 {
989 // this can happen if a word is all crap characters
990 value++;
991 }
992 else if (word.find_first_of(" \n ") == string::npos)
993 {
994 unsigned begin = 0, found;
995 do
996 {
997 found = where.find(word, begin);
998
999 if (found != string::npos)
1000 {
1001 bool isBefore, isAfter, before = false, after = false;
1002 isBefore = found - 1 > 0;
1003 isAfter = found + word.length() < where.length();
1004
1005 if (isBefore) before = isalnum(where[found - 1]) != 0;
1006 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
1007
1008 if (!before && !after)
1009 {
1010 value++;
1011
1012 occurrences.insert(pair<unsigned, unsigned>(found,
1013 word.length()));
1014 }
1015 }
1016
1017 begin = found + word.length();
1018 }
1019 while (found != string::npos && begin < where.length());
1020 }
1021 else
1022 {
1023 value = phrase(word, where, occurrences);
1024 }
1025
1026 return value;
1027 }
1028
1029 unsigned Ranker::phrase(string& phrase, string& where)
1030 {
1031 unsigned value = 0;
1032 vector<string> words;
1033
1034 unsigned begin = 0, space;
1035 do
1036 {
1037 space = phrase.find(' ', begin);
1038
1039 words.push_back(phrase.substr(begin, space - begin));
1040
1041 begin = space + 1;
1042 }
1043 while (space != string::npos && begin < phrase.length());
1044
1045 begin = 0;
1046 unsigned counter = 0;
1047 do
1048 {
1049 value += this->phrase(words, 0, begin, true, where);
1050 }
1051 while (begin < where.length());
1052
1053 return value;
1054 }
1055
1056 unsigned Ranker::phrase(string& phrase, string& where, map<unsigned, unsigned>&
1057 occurrences)
1058 {
1059 unsigned value = 0;
1060 vector<string> words;
1061
1062 unsigned begin = 0, space;
1063 do
1064 {
1065 space = phrase.find(' ', begin);
1066
1067 words.push_back(phrase.substr(begin, space - begin));
1068
1069 begin = space + 1;
1070 }
1071 while (space != string::npos && begin < phrase.length());
1072
1073 begin = 0;
1074 do
1075 {
1076 value += this->phrase(words, 0, begin, true, where, occurrences);
1077 }
1078 while (begin < where.length());
1079
1080 return value;
1081 }
1082
1083 unsigned Ranker::phrase(vector<string>& words, unsigned word, unsigned& begin,
1084 bool start, string& where)
1085 {
1086 unsigned value = 0;
1087 bool end = !(word + 1 < words.size());
1088 unsigned found = where.find(words[word], begin);
1089 unsigned newBegin = found + words[word].length();
1090
1091 if (found != string::npos)
1092 {
1093 bool isBefore, isAfter, before = false, after = false;
1094 isBefore = found - 1 > 0;
1095 isAfter = found + words[word].length() < where.length();
1096
1097 if (isBefore) before = isalnum(where[found - 1]) != 0;
1098 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1099
1100 if (!before && !after)
1101 {
1102 bool between = true;
1103 if (!start)
1104 {
1105 for (unsigned index = begin + 1; index < found - 1; index++)
1106 {
1107 if (isalnum(where[index]))
1108 {
1109 between = false;
1110 break;
1111 }
1112 }
1113 }
1114
1115 if (between)
1116 {
1117 if (end)
1118 {
1119 begin = newBegin;
1120 value = 1;
1121 }
1122 else
1123 {
1124 value = phrase(words, (word + 1), newBegin, false, where);
1125 }
1126 }
1127 }
1128 }
1129
1130 if (start)
1131 {
1132 if (found != string::npos)
1133 {
1134 begin = newBegin;
1135 }
1136 else
1137 {
1138 begin = string::npos;
1139 }
1140 }
1141
1142 return value;
1143 }
1144
1145 unsigned Ranker::phrase(vector<string>& words, unsigned word, unsigned& begin,
1146 bool start, string& where, map<unsigned, unsigned>& occurrences)
1147 {
1148 unsigned value = 0;
1149 bool end = !(word + 1 < words.size());
1150 unsigned found = where.find(words[word], begin);
1151 unsigned newBegin = found + words[word].length();
1152
1153 if (found != string::npos)
1154 {
1155 bool isBefore, isAfter, before = false, after = false;
1156 isBefore = found - 1 > 0;
1157 isAfter = found + words[word].length() < where.length();
1158
1159 if (isBefore) before = isalnum(where[found - 1]) != 0;
1160 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1161
1162 if (!before && !after)
1163 {
1164 bool between = true;
1165 if (!start)
1166 {
1167 for (unsigned index = begin + 1; index < found - 1; index++)
1168 {
1169 if (isalnum(where[index]))
1170 {
1171 between = false;
1172 break;
1173 }
1174 }
1175 }
1176
1177 if (between)
1178 {
1179 occurrences.insert(pair<unsigned, unsigned>(found,
1180 words[word].length()));
1181
1182 if (end)
1183 {
1184 begin = newBegin;
1185 value = 1;
1186 }
1187 else
1188 {
1189 value = phrase(words, (word + 1), newBegin, false, where,
1190 occurrences);
1191 }
1192 }
1193 }
1194 }
1195
1196 if (start)
1197 {
1198 if (found != string::npos)
1199 {
1200 begin = newBegin;
1201 }
1202 else
1203 {
1204 begin = string::npos;
1205 }
1206 }
1207
1208 return value;
1209 }
1210
1211 unsigned Ranker::evaluate(vector<unsigned>& ins)
1212 {
1213 unsigned in = 0;
1214
1215 for (unsigned index = 0; index < ins.size(); index++)
1216 {
1217 if (ins[index] > 0)
1218 {
1219 in += ins[index];
1220 }
1221 else
1222 {
1223 in = 0;
1224 break;
1225 }
1226 }
1227
1228 return in;
1229 }
1230
1231 void Ranker::decrap(string& crap)
1232 {
1233 unsigned begin = 0, found;
1234 do
1235 {
1236 // &, +, and # are not considered crap
1237 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^_`{|}~", begin);
1238
1239 if (found != string::npos)
1240 {
1241 crap[found] = ' ';
1242 }
1243
1244 begin = found + 1;
1245 }
1246 while (found != string::npos && begin < crap.length());
1247
1248 normalize(crap);
1249 }