ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 46
Committed: 2003-01-30T18:51:01-08:00 (22 years, 4 months ago) by douglas
File size: 27674 byte(s)
Log Message:
Changed '_' handling by Ranker.decrap().

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // Ranker.cpp
50
51 #include "Ranker.h"
52
53 Ranker::Ranker()
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 Ranker::Ranker(Page& page) : Page(page)
63 {
64 value = 0;
65 requiredValue = 0;
66 excludedValue = 0;
67 eitherOrValue = 0;
68 allIn = all;
69 }
70
71 void Ranker::rank(vector<string> query)
72 {
73 vector<string> prep;
74
75 for (unsigned index = 0; index < query.size(); index++)
76 {
77 if (query[index] == "allintitle:" && index == 0)
78 {
79 allIn = title;
80 }
81 else if (query[index] == "allinurl:" && index == 0)
82 {
83 allIn = url;
84 }
85 else if (query[index] == "allintext:" && index == 0)
86 {
87 allIn = text;
88 }
89 else if (query[index].find("site:") == 0 && query[index].size() > 5)
90 {
91 site = query[index].substr(5);
92 }
93 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
94 {
95 prep.push_back("TITLE " + query[index].substr(8));
96 }
97 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
98 {
99 prep.push_back("URL " + query[index].substr(6));
100 }
101 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
102 {
103 prep.push_back("TEXT " + query[index].substr(7));
104 }
105 else
106 {
107 prep.push_back(query[index]);
108 }
109 }
110
111 if (prep.size() > 0)
112 {
113 bool or_ = false;
114 for (unsigned index = 0; index < prep.size(); index++)
115 {
116 bool exclude = false;
117 if (prep[index].find('+') == 0)
118 {
119 prep[index].erase(0, 1);
120 }
121 else if (prep[index].find('-') == 0)
122 {
123 exclude = true;
124 prep[index].erase(0, 1);
125 }
126
127 if (or_)
128 {
129 if (prep[index].find(" OR") == string::npos)
130 {
131 or_ = false;
132 }
133
134 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
135 }
136 else if (exclude)
137 {
138 excluded.push_back(prep[index]);
139 }
140 else if (prep[index].find(" OR") != string::npos)
141 {
142 or_ = true;
143 eitherOr.push_back(prep[index]);
144 }
145 else
146 {
147 required.push_back(prep[index]);
148 }
149 }
150 }
151
152 rank();
153 }
154
// Builds `sample`, an HTML excerpt of the page text in which the matches
// recorded in occurrencesText (start offset -> match length) are wrapped in
// <strong> tags. Roughly 160 characters of text are collected, counted
// before entity escaping; the markup itself is not counted.
// All offsets are `unsigned`, and several expressions below rely on unsigned
// wraparound, so statement order matters.
155 void Ranker::setSample()
156 {
157 map<unsigned, unsigned>::iterator itor;
158
// Gap to the following occurrence -> iterator of the occurrence. A multimap
// keeps its keys sorted, so begin() is an occurrence with the smallest gap.
159 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
160
161 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
162 {
163 unsigned distance;
164
// Peek at the next occurrence, then step back to the current one.
165 if (++itor != occurrencesText.end())
166 {
167 unsigned next = itor->first;
168 itor--;
169
// Characters between the end of this match and the start of the next one.
170 distance = next - (itor->first + itor->second);
171 }
172 else
173 {
// The last occurrence has no successor; give it the largest possible gap.
174 distance = UINT_MAX;
175 itor--;
176 }
177
178 distances.insert(pair<unsigned, map<unsigned,
179 unsigned>::iterator>(distance, itor));
180 }
181
// Start the excerpt at the occurrence with the smallest gap. When there are
// no occurrences, itor is still end() from the loop above and the while loop
// below does not run.
182 if (distances.begin() != distances.end())
183 {
184 itor = distances.begin()->second;
185 }
186
187 string portion;
// `end` starts as string::npos converted to unsigned, so `end + 1` wraps to
// 0: no leading ellipsis is emitted when the first piece begins at offset 0.
188 unsigned sampleLength = 0, begin = 0, end = string::npos;
189 while (sampleLength < 160 && itor != occurrencesText.end())
190 {
191 unsigned found = itor->first;
192 unsigned length = itor->second;
193
// Walk backwards from the match to decide where its leading context starts.
194 for (unsigned index = found; index > begin; index--)
195 {
// Leftover sanity check: the loop condition guarantees index > begin here.
196 if (index == begin) cerr << "Oh crap, I'm insane!\n";
// Remaining budget used up: move forward to the next whitespace and start
// the context just after it.
197 if (found - index >= 160 - sampleLength - length)
198 {
199 for (; index < found; index++)
200 {
201 if (isspace(getText()[index])) break;
202 }
203 begin = index + 1;
204 break;
205 }
// Otherwise start at an uppercase letter that follows a non-alphanumeric
// character (presumably a sentence or word start), but never at the match
// position itself.
206 else if ((index > begin ? (isupper(getText()[index]) &&
207 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
208 index != found)
209 {
210 begin = index;
211 break;
212 }
213 }
214
// Mark a gap when this piece does not directly continue the previous one.
215 if (end + 1 != begin) sample += " <strong>...</strong> ";
216
// Leading context; its length counts against the budget before escaping.
217 portion = getText().substr(begin, found - begin);
218 sampleLength += portion.length();
219
// entities() is defined elsewhere; presumably it replaces each occurrence of
// the given character with the given HTML entity - confirm.
220 entities(portion, '&', "&amp;");
221 entities(portion, '\"', "&quot;");
222 entities(portion, '<', "&lt;");
223 entities(portion, '>', "&gt;");
224
225 sample += portion + "<strong>";
226
// The match itself, emitted inside <strong>...</strong>.
227 portion = getText().substr(found, length);
228 sampleLength += portion.length();
229
230 entities(portion, '&', "&amp;");
231 entities(portion, '\"', "&quot;");
232 entities(portion, '<', "&lt;");
233 entities(portion, '>', "&gt;");
234
235 sample += portion + "</strong>";
236
237 begin = found + length;
238 end = begin - 1;
239
240 if (++itor != occurrencesText.end())
241 {
// The next match ends inside the remaining budget: emit the text between
// the two matches and let the next loop iteration handle that match.
242 if (itor->first + itor->second < begin + 160 - sampleLength)
243 {
244 portion = getText().substr(begin, itor->first - begin);
245 sampleLength += portion.length();
246
247 entities(portion, '&', "&amp;");
248 entities(portion, '\"', "&quot;");
249 entities(portion, '<', "&lt;");
250 entities(portion, '>', "&gt;");
251
252 sample += portion;
253
254 begin = itor->first;
255 end = begin - 1;
256 }
257 else
258 {
// The next match does not fit: close the excerpt with trailing text, cut
// back to the nearest whitespace inside the budget, and add an ellipsis.
259 for (end = begin + 160 - sampleLength; end > begin; end--)
260 {
261 if (isspace(getText()[end])) break;
262 }
263
264 portion = getText().substr(begin, end - begin + 1);
265 sampleLength += portion.length();
266
267 entities(portion, '&', "&amp;");
268 entities(portion, '\"', "&quot;");
269 entities(portion, '<', "&lt;");
270 entities(portion, '>', "&gt;");
271
272 sample += portion + " <strong>...</strong>";
273
274 break;
275 }
276 }
277 else
278 {
// That was the last match: append trailing text up to the budget. The scan
// for a whitespace cut point only runs while `end + 1` is inside the text.
279 for (end = begin + 160 - sampleLength; end > begin && (end + 1 <
280 getText().length()); end--)
281 {
282 if (isspace(getText()[end])) break;
283 }
284
// Clamp to the last character when the budget reaches past the text.
285 if (end >= getText().length()) end = getText().length() - 1;
286
287 portion = getText().substr(begin, end - begin + 1);
288 sampleLength += portion.length();
289
290 entities(portion, '&', "&amp;");
291 entities(portion, '\"', "&quot;");
292 entities(portion, '<', "&lt;");
293 entities(portion, '>', "&gt;");
294
295 sample += portion;
296
// Trailing ellipsis only when text remains after the excerpt.
297 if (end + 1 < getText().length())
298 {
299 sample += " <strong>...</strong>";
300 }
301
302 break;
303 }
304 }
305 }
306
307 string Ranker::getTitle()
308 {
309 string title, portion;
310
311 unsigned begin = 0;
312 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
313 itor != occurrencesTitle.end(); itor++)
314 {
315 unsigned found = itor->first;
316 unsigned length = itor->second;
317
318 portion = Page::getTitle().substr(begin, found - begin);
319
320 entities(portion, '&', "&amp;");
321 entities(portion, '\"', "&quot;");
322 entities(portion, '<', "&lt;");
323 entities(portion, '>', "&gt;");
324
325 title += portion + "<strong>";
326
327 portion = Page::getTitle().substr(found, length);
328
329 entities(portion, '&', "&amp;");
330 entities(portion, '\"', "&quot;");
331 entities(portion, '<', "&lt;");
332 entities(portion, '>', "&gt;");
333
334 title += portion + "</strong>";
335
336 begin = found + length;
337 }
338
339 portion = Page::getTitle().substr(begin);
340
341 entities(portion, '&', "&amp;");
342 entities(portion, '\"', "&quot;");
343 entities(portion, '<', "&lt;");
344 entities(portion, '>', "&gt;");
345
346 title += portion;
347
348 return title;
349 }
350
351 string Ranker::getDescription()
352 {
353 string description, portion;
354
355 unsigned begin = 0;
356 for (map<unsigned, unsigned>::iterator itor =
357 occurrencesDescription.begin(); itor != occurrencesDescription.end();
358 itor++)
359 {
360 unsigned found = itor->first;
361 unsigned length = itor->second;
362
363 portion = Page::getDescription().substr(begin, found - begin);
364
365 entities(portion, '&', "&amp;");
366 entities(portion, '\"', "&quot;");
367 entities(portion, '<', "&lt;");
368 entities(portion, '>', "&gt;");
369
370 description += portion + "<strong>";
371
372 portion = Page::getDescription().substr(found, length);
373
374 entities(portion, '&', "&amp;");
375 entities(portion, '\"', "&quot;");
376 entities(portion, '<', "&lt;");
377 entities(portion, '>', "&gt;");
378
379 description += portion + "</strong>";
380
381 begin = found + length;
382 }
383
384 portion = Page::getDescription().substr(begin);
385
386 entities(portion, '&', "&amp;");
387 entities(portion, '\"', "&quot;");
388 entities(portion, '<', "&lt;");
389 entities(portion, '>', "&gt;");
390
391 description += portion;
392
393 return description;
394 }
395
396 bool Ranker::operator==(const unsigned number) const
397 {
398 return value == number;
399 }
400
401 bool Ranker::operator==(const Ranker& ranker) const
402 {
403 return value == ranker.value;
404 }
405
406 bool Ranker::operator!=(const unsigned number) const
407 {
408 return value != number;
409 }
410
411 bool Ranker::operator!=(const Ranker& ranker) const
412 {
413 return value != ranker.value;
414 }
415
416 bool Ranker::operator<(const unsigned number) const
417 {
418 return value < number;
419 }
420
421 bool Ranker::operator<(const Ranker& ranker) const
422 {
423 return value < ranker.value;
424 }
425
426 bool Ranker::operator>(const unsigned number) const
427 {
428 return value > number;
429 }
430
431 bool Ranker::operator >(const Ranker& ranker) const
432 {
433 return value > ranker.value;
434 }
435
436 void Ranker::rank()
437 {
438 lowerAddress = string(getAddress().length(), ' ');
439 for (unsigned index = 0; index < lowerAddress.length(); index++)
440 {
441 lowerAddress[index] = tolower(getAddress()[index]);
442 }
443
444 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
445 site.length())
446 {
447 bool isRequired = required.size() > 0;
448 bool isExcluded = excluded.size() > 0;
449 bool isEitherOr = eitherOr.size() > 0;
450
451 lowerURL = string(getURL().length(), ' ');
452 for (unsigned index = 0; index < lowerURL.length(); index++)
453 {
454 lowerURL[index] = tolower(getURL()[index]);
455 }
456
457 lowerTitle = string(Page::getTitle().length(), ' ');
458 for (unsigned index0 = 0; index0 < lowerTitle.length(); index0++)
459 {
460 lowerTitle[index0] = tolower(Page::getTitle()[index0]);
461 }
462
463 lowerText = string(Page::getText().length(), ' ');
464 for (unsigned index1 = 0; index1 < lowerText.length(); index1++)
465 {
466 lowerText[index1] = tolower(Page::getText()[index1]);
467 }
468
469 if (isRequired) checkRequired();
470 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
471 if (isEitherOr) checkEitherOr();
472
473 if (isRequired && isExcluded && isEitherOr)
474 {
475 value += requiredValue && !excludedValue && eitherOrValue ?
476 requiredValue + eitherOrValue : 0;
477 }
478 else if (isRequired && isExcluded)
479 {
480 value += requiredValue && !excludedValue ? requiredValue : 0;
481 }
482 else if (isRequired && isEitherOr)
483 {
484 value += requiredValue && eitherOrValue ? requiredValue +
485 eitherOrValue : 0;
486 }
487 else if (isExcluded && isEitherOr)
488 {
489 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
490 }
491 else if (isRequired)
492 {
493 value += requiredValue;
494 }
495 else if (isEitherOr)
496 {
497 value += eitherOrValue;
498 }
499 else
500 {
501 // do nothing this is a bad search and warrants no results
502 }
503
504 if (value > 0)
505 {
506 string lowerDescription = string(Page::getDescription().length(),
507 ' ');
508 for (unsigned index = 0; index < lowerDescription.length(); index++)
509 {
510 lowerDescription[index] = tolower(
511 Page::getDescription()[index]);
512 }
513
514 for (unsigned index0 = 0; index0 < required.size(); index0++)
515 {
516 if (required[index0].find("URL ") == 0)
517 {
518 value += find(required[index0].substr(4), lowerDescription,
519 occurrencesDescription);
520 }
521 else if (required[index0].find("TITLE ") == 0)
522 {
523 value += find(required[index0].substr(6), lowerDescription,
524 occurrencesDescription);
525 }
526 else if (required[index0].find("TEXT ") == 0)
527 {
528 value += find(required[index0].substr(5), lowerDescription,
529 occurrencesDescription);
530 }
531 else
532 {
533 value += find(required[index0], lowerDescription,
534 occurrencesDescription);
535 }
536 }
537
538 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
539 {
540 vector<string> words;
541
542 unsigned begin = 0, found;
543 do
544 {
545 found = eitherOr[index1].find(" OR ", begin);
546
547 if (found != string::npos)
548 {
549 words.push_back(eitherOr[index1].substr(begin, found -
550 begin));
551 }
552 else
553 {
554 words.push_back(eitherOr[index1].substr(begin));
555 }
556
557 begin = found + 4;
558 }
559 while (begin < eitherOr[index1].length() && found !=
560 string::npos);
561
562 for (unsigned number = 0; number < words.size(); number++)
563 {
564 if (words[index1].find("URL ") == 0)
565 {
566 value += find(words[index1].substr(4),
567 lowerDescription, occurrencesDescription);
568 }
569 else if (words[index1].find("TITLE ") == 0)
570 {
571 value += find(words[index1].substr(6),
572 lowerDescription, occurrencesDescription);
573 }
574 else if (words[index1].find("TEXT ") == 0)
575 {
576 value += find(words[index1].substr(5),
577 lowerDescription, occurrencesDescription);
578 }
579 else
580 {
581 value += find(words[index1], lowerDescription,
582 occurrencesDescription);
583 }
584 }
585 }
586
587 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
588 {
589 string lowerHeading = string(getHeadings()[index2].length(),
590 ' ');
591 for (unsigned number = 0; number <
592 getHeadings()[index2].length(); number++)
593 {
594 lowerHeading[number] = tolower(
595 getHeadings()[index2][number]);
596 }
597
598 for (unsigned number0 = 0; number0 < required.size(); number0++)
599 {
600 if (required[number0].find("URL ") == 0)
601 {
602 value += find(required[number0].substr(4),
603 lowerHeading);
604 }
605 else if (required[number0].find("TITLE ") == 0)
606 {
607 value += find(required[number0].substr(6),
608 lowerHeading);
609 }
610 else if (required[number0].find("TEXT ") == 0)
611 {
612 value += find(required[number0].substr(5),
613 lowerHeading);
614 }
615 else
616 {
617 value += find(required[number0], lowerHeading);
618 }
619 }
620
621 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
622 {
623 vector<string> words;
624
625 unsigned begin = 0, found;
626 do
627 {
628 found = eitherOr[number1].find(" OR ", begin);
629
630 if (found != string::npos)
631 {
632 words.push_back(eitherOr[number1].substr(begin,
633 found - begin));
634 }
635 else
636 {
637 words.push_back(eitherOr[number1].substr(begin));
638 }
639
640 begin = found + 4;
641 }
642 while (begin < eitherOr[number1].length() && found !=
643 string::npos);
644
645 for (unsigned number = 0; number < words.size(); number++)
646 {
647 if (words[number].find("URL ") == 0)
648 {
649 value += find(words[number].substr(4),
650 lowerHeading);
651 }
652 else if (words[number].find("TITLE ") == 0)
653 {
654 value += find(words[number].substr(6),
655 lowerHeading);
656 }
657 else if (words[number].find("TEXT ") == 0)
658 {
659 value += find(words[number].substr(5),
660 lowerHeading);
661 }
662 else
663 {
664 value += find(words[number], lowerHeading);
665 }
666 }
667 }
668 }
669 }
670 }
671 }
672
673 void Ranker::checkRequired()
674 {
675 vector<unsigned> inURLs, inTitles, inTexts;
676
677 for (unsigned index = 0; index < required.size(); index++)
678 {
679 unsigned inURL = 0, inTitle = 0, inText = 0;
680
681 if (required[index].find("URL ") == 0)
682 {
683 inURL = find(required[index].substr(4), lowerURL.substr(7));
684
685 if (inURL)
686 {
687 inTitle = find(required[index].substr(4), lowerTitle,
688 occurrencesTitle);
689 inText = find(required[index].substr(4), lowerText,
690 occurrencesText);
691
692 if (!inTitle) inTitle++;
693 if (!inText) inText++;
694 }
695 }
696 else if (required[index].find("TITLE ") == 0)
697 {
698 inTitle = find(required[index].substr(6), lowerTitle,
699 occurrencesTitle);
700
701 if (inTitle)
702 {
703 inURL = find(required[index].substr(6), lowerURL.substr(7));
704 inText = find(required[index].substr(6), lowerText,
705 occurrencesText);
706
707 if (!inURL) inURL++;
708 if (!inText) inText++;
709 }
710 }
711 else if (required[index].find("TEXT ") == 0)
712 {
713 inText = find(required[index].substr(5), lowerText,
714 occurrencesText);
715
716 if (inText)
717 {
718 inURL = find(required[index].substr(5), lowerURL.substr(7));
719 inTitle = find(required[index].substr(5), lowerTitle,
720 occurrencesTitle);
721
722 if (!inURL) inURL++;
723 if (!inTitle) inTitle++;
724 }
725 }
726 else
727 {
728 inURL = find(required[index], lowerURL.substr(7));
729 inTitle = find(required[index], lowerTitle, occurrencesTitle);
730 inText = find(required[index], lowerText, occurrencesText);
731 }
732
733 inURLs.push_back(inURL);
734 inTitles.push_back(inTitle);
735 inTexts.push_back(inText);
736 }
737
738 unsigned inURL = evaluate(inURLs);
739 unsigned inTitle = evaluate(inTitles);
740 unsigned inText = evaluate(inTexts);
741
742 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
743 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
744 inText : 0;
745 }
746
747 void Ranker::checkExcluded()
748 {
749 vector<unsigned> inURLs, inTitles, inTexts;
750
751 for (unsigned index = 0; index < excluded.size(); index++)
752 {
753 unsigned inURL = 0, inTitle = 0, inText = 0;
754
755 inURL = find(excluded[index], lowerURL.substr(7));
756 inTitle = find(excluded[index], lowerTitle);
757 inText = find(excluded[index], lowerText);
758
759 inURLs.push_back(inURL);
760 inTitles.push_back(inTitle);
761 inTexts.push_back(inText);
762 }
763
764 unsigned inURL = evaluate(inURLs);
765 unsigned inTitle = evaluate(inTitles);
766 unsigned inText = evaluate(inTexts);
767
768 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
769 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
770 inText : 0;
771 }
772
773 void Ranker::checkEitherOr()
774 {
775 vector<unsigned> inURLs, inTitles, inTexts;
776
777 for (unsigned index = 0; index < eitherOr.size(); index++)
778 {
779 vector<unsigned> inURLz, inTitlez, inTextz;
780 unsigned inURL = 0, inTitle = 0, inText = 0;
781 vector<string> words;
782
783 unsigned begin = 0, found;
784 do
785 {
786 found = eitherOr[index].find(" OR ", begin);
787
788 if (found != string::npos)
789 {
790 words.push_back(eitherOr[index].substr(begin, found - begin));
791 }
792 else
793 {
794 words.push_back(eitherOr[index].substr(begin));
795 }
796
797 begin = found + 4;
798 }
799 while (begin < eitherOr[index].length() && found != string::npos);
800
801 for (unsigned number = 0; number < words.size(); number++)
802 {
803 unsigned inURL = 0, inTitle = 0, inText = 0;
804
805 if (words[number].find("URL ") == 0)
806 {
807 inURL = find(words[number].substr(4), lowerURL.substr(7));
808
809 if (inURL)
810 {
811 inTitle = find(words[number].substr(4), lowerTitle,
812 occurrencesTitle);
813 inText = find(words[number].substr(4), lowerText,
814 occurrencesText);
815
816 if (!inTitle) inTitle++;
817 if (!inText) inText++;
818 }
819 }
820 else if (words[number].find("TITLE ") == 0)
821 {
822 inTitle = find(words[number].substr(6), lowerTitle,
823 occurrencesTitle);
824
825 if (inTitle)
826 {
827 inURL = find(words[number].substr(6), lowerURL.substr(7));
828 inText = find(words[number].substr(6), lowerText,
829 occurrencesText);
830
831 if (!inURL) inURL++;
832 if (!inText) inText++;
833 }
834 }
835 else if (words[number].find("TEXT ") == 0)
836 {
837 inText = find(words[number].substr(5), lowerText,
838 occurrencesText);
839
840 if (inText)
841 {
842 inURL = find(words[number].substr(5), lowerURL.substr(7));
843 inTitle = find(words[number].substr(5), lowerTitle,
844 occurrencesTitle);
845
846 if (!inURL) inURL++;
847 if (!inTitle) inTitle++;
848 }
849 }
850 else
851 {
852 inURL = find(words[number], lowerURL.substr(7));
853 inTitle = find(words[number], lowerTitle, occurrencesTitle);
854 inText = find(words[number], lowerText, occurrencesText);
855 }
856
857 inURLz.push_back(inURL);
858 inTitlez.push_back(inTitle);
859 inTextz.push_back(inText);
860 }
861
862 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
863 {
864 inURL += inURLz[number0];
865 }
866
867 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
868 {
869 inTitle += inTitlez[number1];
870 }
871
872 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
873 {
874 inText += inTextz[number2];
875 }
876
877 inURLs.push_back(inURL);
878 inTitles.push_back(inTitle);
879 inTexts.push_back(inText);
880
881 inURLz.clear();
882 inTitlez.clear();
883 inTextz.clear();
884 words.clear();
885 }
886
887 unsigned inURL = evaluate(inURLs);
888 unsigned inTitle = evaluate(inTitles);
889 unsigned inText = evaluate(inTexts);
890
891 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
892 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
893 inText : 0;
894 }
895
896 unsigned Ranker::find(string word, const string& where)
897 {
898 unsigned value = 0;
899
900 decrap(word);
901
902 if (word == "")
903 {
904 // this can happen if a word is all crap characters
905 value++;
906 }
907 else if (word.find_first_of(" \n ") == string::npos)
908 {
909 unsigned begin = 0, found;
910 do
911 {
912 found = where.find(word, begin);
913
914 if (found != string::npos)
915 {
916 bool isBefore, isAfter, before = false, after = false;
917 isBefore = found - 1 > 0;
918 isAfter = found + word.length() < where.length();
919
920 if (isBefore) before = isalnum(where[found - 1]) != 0;
921 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
922
923 if (!before && !after)
924 {
925 value++;
926 }
927 }
928
929 begin = found + word.length();
930 }
931 while (found != string::npos && begin < where.length());
932 }
933 else
934 {
935 value = phrase(word, where);
936 }
937
938 return value;
939 }
940
941 unsigned Ranker::find(string word, const string& where, map<unsigned,
942 unsigned>& occurrences)
943 {
944 unsigned value = 0;
945
946 decrap(word);
947
948 if (word == "")
949 {
950 // this can happen if a word is all crap characters
951 value++;
952 }
953 else if (word.find_first_of(" \n ") == string::npos)
954 {
955 unsigned begin = 0, found;
956 do
957 {
958 found = where.find(word, begin);
959
960 if (found != string::npos)
961 {
962 bool isBefore, isAfter, before = false, after = false;
963 isBefore = found - 1 > 0;
964 isAfter = found + word.length() < where.length();
965
966 if (isBefore) before = isalnum(where[found - 1]) != 0;
967 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
968
969 if (!before && !after)
970 {
971 value++;
972
973 occurrences.insert(pair<unsigned, unsigned>(found,
974 word.length()));
975 }
976 }
977
978 begin = found + word.length();
979 }
980 while (found != string::npos && begin < where.length());
981 }
982 else
983 {
984 value = phrase(word, where, occurrences);
985 }
986
987 return value;
988 }
989
990 unsigned Ranker::phrase(const string& phrase, const string& where)
991 {
992 unsigned value = 0;
993 vector<string> words;
994
995 unsigned begin = 0, space;
996 do
997 {
998 space = phrase.find(' ', begin);
999
1000 words.push_back(phrase.substr(begin, space - begin));
1001
1002 begin = space + 1;
1003 }
1004 while (space != string::npos && begin < phrase.length());
1005
1006 begin = 0;
1007 unsigned counter = 0;
1008 do
1009 {
1010 value += this->phrase(words, 0, begin, true, where);
1011 }
1012 while (begin < where.length());
1013
1014 return value;
1015 }
1016
1017 unsigned Ranker::phrase(const string& phrase, const string& where,
1018 map<unsigned, unsigned>& occurrences)
1019 {
1020 unsigned value = 0;
1021 vector<string> words;
1022
1023 unsigned begin = 0, space;
1024 do
1025 {
1026 space = phrase.find(' ', begin);
1027
1028 words.push_back(phrase.substr(begin, space - begin));
1029
1030 begin = space + 1;
1031 }
1032 while (space != string::npos && begin < phrase.length());
1033
1034 begin = 0;
1035 do
1036 {
1037 value += this->phrase(words, 0, begin, true, where, occurrences);
1038 }
1039 while (begin < where.length());
1040
1041 return value;
1042 }
1043
1044 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1045 begin, bool start, const string& where)
1046 {
1047 unsigned value = 0;
1048 bool end = !(word + 1 < words.size());
1049 unsigned found = where.find(words[word], begin);
1050 unsigned newBegin = found + words[word].length();
1051
1052 if (found != string::npos)
1053 {
1054 bool isBefore, isAfter, before = false, after = false;
1055 isBefore = found - 1 > 0;
1056 isAfter = found + words[word].length() < where.length();
1057
1058 if (isBefore) before = isalnum(where[found - 1]) != 0;
1059 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1060
1061 if (!before && !after)
1062 {
1063 bool between = true;
1064 if (!start)
1065 {
1066 for (unsigned index = begin + 1; index < found - 1; index++)
1067 {
1068 if (isalnum(where[index]))
1069 {
1070 between = false;
1071 break;
1072 }
1073 }
1074 }
1075
1076 if (between)
1077 {
1078 if (end)
1079 {
1080 begin = newBegin;
1081 value = 1;
1082 }
1083 else
1084 {
1085 value = phrase(words, (word + 1), newBegin, false, where);
1086 }
1087 }
1088 }
1089 }
1090
1091 if (start)
1092 {
1093 if (found != string::npos)
1094 {
1095 begin = newBegin;
1096 }
1097 else
1098 {
1099 begin = string::npos;
1100 }
1101 }
1102
1103 return value;
1104 }
1105
1106 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1107 begin, bool start, const string& where, map<unsigned, unsigned>&
1108 occurrences)
1109 {
1110 unsigned value = 0;
1111 bool end = !(word + 1 < words.size());
1112 unsigned found = where.find(words[word], begin);
1113 unsigned newBegin = found + words[word].length();
1114
1115 if (found != string::npos)
1116 {
1117 bool isBefore, isAfter, before = false, after = false;
1118 isBefore = found - 1 > 0;
1119 isAfter = found + words[word].length() < where.length();
1120
1121 if (isBefore) before = isalnum(where[found - 1]) != 0;
1122 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1123
1124 if (!before && !after)
1125 {
1126 bool between = true;
1127 if (!start)
1128 {
1129 for (unsigned index = begin + 1; index < found - 1; index++)
1130 {
1131 if (isalnum(where[index]))
1132 {
1133 between = false;
1134 break;
1135 }
1136 }
1137 }
1138
1139 if (between)
1140 {
1141 occurrences.insert(pair<unsigned, unsigned>(found,
1142 words[word].length()));
1143
1144 if (end)
1145 {
1146 begin = newBegin;
1147 value = 1;
1148 }
1149 else
1150 {
1151 value = phrase(words, (word + 1), newBegin, false, where,
1152 occurrences);
1153 }
1154 }
1155 }
1156 }
1157
1158 if (start)
1159 {
1160 if (found != string::npos)
1161 {
1162 begin = newBegin;
1163 }
1164 else
1165 {
1166 begin = string::npos;
1167 }
1168 }
1169
1170 return value;
1171 }
1172
1173 unsigned Ranker::evaluate(vector<unsigned>& ins)
1174 {
1175 unsigned in = 0;
1176
1177 for (unsigned index = 0; index < ins.size(); index++)
1178 {
1179 if (ins[index] > 0)
1180 {
1181 in += ins[index];
1182 }
1183 else
1184 {
1185 in = 0;
1186 break;
1187 }
1188 }
1189
1190 return in;
1191 }
1192
1193 void Ranker::decrap(string& crap)
1194 {
1195 unsigned begin = 0, found;
1196 do
1197 {
1198 // &, _, +, and # are not considered crap
1199 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1200
1201 if (found != string::npos)
1202 {
1203 crap[found] = ' ';
1204 }
1205
1206 begin = found + 1;
1207 }
1208 while (found != string::npos && begin < crap.length());
1209
1210 normalize(crap);
1211 }