ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 312
Committed: 2004-01-01T15:00:34-08:00 (21 years, 5 months ago) by douglas
File size: 26969 byte(s)
Log Message:
Updated copyright years.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id: Ranker.cpp,v 1.7 2004/01/01 23:00:34 douglas Exp $
50
51 #include "Ranker.h"
52
53 Ranker::Ranker()
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 Ranker::Ranker(Page& page) : Page(page)
63 {
64 value = 0;
65 requiredValue = 0;
66 excludedValue = 0;
67 eitherOrValue = 0;
68 allIn = all;
69 }
70
71 void Ranker::rank(vector<string> query)
72 {
73 vector<string> prep;
74
75 for (unsigned index = 0; index < query.size(); index++)
76 {
77 if (query[index] == "allintitle:" && index == 0)
78 {
79 allIn = title;
80 }
81 else if (query[index] == "allinurl:" && index == 0)
82 {
83 allIn = url;
84 }
85 else if (query[index] == "allintext:" && index == 0)
86 {
87 allIn = text;
88 }
89 else if (query[index].find("site:") == 0 && query[index].size() > 5)
90 {
91 site = query[index].substr(5);
92 }
93 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
94 {
95 prep.push_back("TITLE " + query[index].substr(8));
96 }
97 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
98 {
99 prep.push_back("URL " + query[index].substr(6));
100 }
101 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
102 {
103 prep.push_back("TEXT " + query[index].substr(7));
104 }
105 else
106 {
107 prep.push_back(query[index]);
108 }
109 }
110
111 if (prep.size() > 0)
112 {
113 bool or_ = false;
114 for (unsigned index = 0; index < prep.size(); index++)
115 {
116 bool exclude = false;
117 if (prep[index].find('+') == 0)
118 {
119 prep[index].erase(0, 1);
120 }
121 else if (prep[index].find('-') == 0)
122 {
123 exclude = true;
124 prep[index].erase(0, 1);
125 }
126
127 if (or_)
128 {
129 if (prep[index].find(" OR") == string::npos)
130 {
131 or_ = false;
132 }
133
134 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
135 }
136 else if (exclude)
137 {
138 excluded.push_back(prep[index]);
139 }
140 else if (prep[index].find(" OR") != string::npos)
141 {
142 or_ = true;
143 eitherOr.push_back(prep[index]);
144 }
145 else
146 {
147 required.push_back(prep[index]);
148 }
149 }
150 }
151
152 rank();
153 }
154
155 void Ranker::setSample()
156 {
157 map<unsigned, unsigned>::iterator itor;
158
159 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
160
161 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
162 {
163 unsigned distance;
164
165 if (++itor != occurrencesText.end())
166 {
167 unsigned next = itor->first;
168 itor--;
169
170 distance = next - (itor->first + itor->second);
171 }
172 else
173 {
174 distance = UINT_MAX;
175 itor--;
176 }
177
178 distances.insert(pair<unsigned, map<unsigned,
179 unsigned>::iterator>(distance, itor));
180 }
181
182 if (distances.begin() != distances.end())
183 {
184 itor = distances.begin()->second;
185 }
186
187 string portion;
188 unsigned sampleLength = 0, begin = 0, end = string::npos;
189 while (sampleLength < 160 && itor != occurrencesText.end())
190 {
191 unsigned found = itor->first;
192 unsigned length = itor->second;
193
194 for (unsigned index = found; index > begin; index--)
195 {
196 if (index == begin) cerr << "Oh crap, I'm insane!\n";
197 if (found - index >= 160 - sampleLength - length)
198 {
199 for (; index < found; index++)
200 {
201 if (isspace(getText()[index])) break;
202 }
203 begin = index + 1;
204 break;
205 }
206 else if ((index > begin ? (isupper(getText()[index]) &&
207 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
208 index != found)
209 {
210 begin = index;
211 break;
212 }
213 }
214
215 if (end + 1 != begin) sample += " <strong>...</strong> ";
216
217 portion = getText().substr(begin, found - begin);
218 sampleLength += portion.length();
219
220 entities(portion, '&', "&amp;");
221 entities(portion, '\"', "&quot;");
222 entities(portion, '<', "&lt;");
223 entities(portion, '>', "&gt;");
224
225 sample += portion + "<strong>";
226
227 portion = getText().substr(found, length);
228 sampleLength += portion.length();
229
230 entities(portion, '&', "&amp;");
231 entities(portion, '\"', "&quot;");
232 entities(portion, '<', "&lt;");
233 entities(portion, '>', "&gt;");
234
235 sample += portion + "</strong>";
236
237 begin = found + length;
238 end = begin - 1;
239
240 if (++itor != occurrencesText.end())
241 {
242 if (itor->first + itor->second < begin + 160 - sampleLength)
243 {
244 portion = getText().substr(begin, itor->first - begin);
245 sampleLength += portion.length();
246
247 entities(portion, '&', "&amp;");
248 entities(portion, '\"', "&quot;");
249 entities(portion, '<', "&lt;");
250 entities(portion, '>', "&gt;");
251
252 sample += portion;
253
254 begin = itor->first;
255 end = begin - 1;
256 }
257 else
258 {
259 for (end = begin + 160 - sampleLength; end > begin; end--)
260 {
261 if (isspace(getText()[end])) break;
262 }
263
264 portion = getText().substr(begin, end - begin + 1);
265 sampleLength += portion.length();
266
267 entities(portion, '&', "&amp;");
268 entities(portion, '\"', "&quot;");
269 entities(portion, '<', "&lt;");
270 entities(portion, '>', "&gt;");
271
272 sample += portion + " <strong>...</strong>";
273
274 break;
275 }
276 }
277 else
278 {
279 for (end = begin + 160 - sampleLength; end > begin && (end + 1 <
280 getText().length()); end--)
281 {
282 if (isspace(getText()[end])) break;
283 }
284
285 if (end >= getText().length()) end = getText().length() - 1;
286
287 portion = getText().substr(begin, end - begin + 1);
288 sampleLength += portion.length();
289
290 entities(portion, '&', "&amp;");
291 entities(portion, '\"', "&quot;");
292 entities(portion, '<', "&lt;");
293 entities(portion, '>', "&gt;");
294
295 sample += portion;
296
297 if (end + 1 < getText().length())
298 {
299 sample += " <strong>...</strong>";
300 }
301
302 break;
303 }
304 }
305 }
306
307 string Ranker::getTitle()
308 {
309 string title, portion;
310
311 unsigned begin = 0;
312 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
313 itor != occurrencesTitle.end(); itor++)
314 {
315 unsigned found = itor->first;
316 unsigned length = itor->second;
317
318 portion = Page::getTitle().substr(begin, found - begin);
319
320 entities(portion, '&', "&amp;");
321 entities(portion, '\"', "&quot;");
322 entities(portion, '<', "&lt;");
323 entities(portion, '>', "&gt;");
324
325 title += portion + "<strong>";
326
327 portion = Page::getTitle().substr(found, length);
328
329 entities(portion, '&', "&amp;");
330 entities(portion, '\"', "&quot;");
331 entities(portion, '<', "&lt;");
332 entities(portion, '>', "&gt;");
333
334 title += portion + "</strong>";
335
336 begin = found + length;
337 }
338
339 portion = Page::getTitle().substr(begin);
340
341 entities(portion, '&', "&amp;");
342 entities(portion, '\"', "&quot;");
343 entities(portion, '<', "&lt;");
344 entities(portion, '>', "&gt;");
345
346 title += portion;
347
348 return title;
349 }
350
351 string Ranker::getDescription()
352 {
353 string description, portion;
354
355 unsigned begin = 0;
356 for (map<unsigned, unsigned>::iterator itor =
357 occurrencesDescription.begin(); itor != occurrencesDescription.end();
358 itor++)
359 {
360 unsigned found = itor->first;
361 unsigned length = itor->second;
362
363 portion = Page::getDescription().substr(begin, found - begin);
364
365 entities(portion, '&', "&amp;");
366 entities(portion, '\"', "&quot;");
367 entities(portion, '<', "&lt;");
368 entities(portion, '>', "&gt;");
369
370 description += portion + "<strong>";
371
372 portion = Page::getDescription().substr(found, length);
373
374 entities(portion, '&', "&amp;");
375 entities(portion, '\"', "&quot;");
376 entities(portion, '<', "&lt;");
377 entities(portion, '>', "&gt;");
378
379 description += portion + "</strong>";
380
381 begin = found + length;
382 }
383
384 portion = Page::getDescription().substr(begin);
385
386 entities(portion, '&', "&amp;");
387 entities(portion, '\"', "&quot;");
388 entities(portion, '<', "&lt;");
389 entities(portion, '>', "&gt;");
390
391 description += portion;
392
393 return description;
394 }
395
396 bool Ranker::operator==(const unsigned number) const
397 {
398 return value == number;
399 }
400
401 bool Ranker::operator==(const Ranker& ranker) const
402 {
403 return value == ranker.value;
404 }
405
406 bool Ranker::operator!=(const unsigned number) const
407 {
408 return value != number;
409 }
410
411 bool Ranker::operator!=(const Ranker& ranker) const
412 {
413 return value != ranker.value;
414 }
415
416 bool Ranker::operator<(const unsigned number) const
417 {
418 return value < number;
419 }
420
421 bool Ranker::operator<(const Ranker& ranker) const
422 {
423 return value < ranker.value;
424 }
425
426 bool Ranker::operator>(const unsigned number) const
427 {
428 return value > number;
429 }
430
431 bool Ranker::operator >(const Ranker& ranker) const
432 {
433 return value > ranker.value;
434 }
435
436 void Ranker::rank()
437 {
438 lowerAddress = tolower(getAddress());
439
440 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
441 site.length())
442 {
443 bool isRequired = required.size() > 0;
444 bool isExcluded = excluded.size() > 0;
445 bool isEitherOr = eitherOr.size() > 0;
446
447 lowerURL = tolower(getURL());
448 lowerTitle = tolower(Page::getTitle());
449 lowerText = tolower(Page::getText());
450
451 if (isRequired) checkRequired();
452 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
453 if (isEitherOr) checkEitherOr();
454
455 if (isRequired && isExcluded && isEitherOr)
456 {
457 value += requiredValue && !excludedValue && eitherOrValue ?
458 requiredValue + eitherOrValue : 0;
459 }
460 else if (isRequired && isExcluded)
461 {
462 value += requiredValue && !excludedValue ? requiredValue : 0;
463 }
464 else if (isRequired && isEitherOr)
465 {
466 value += requiredValue && eitherOrValue ? requiredValue +
467 eitherOrValue : 0;
468 }
469 else if (isExcluded && isEitherOr)
470 {
471 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
472 }
473 else if (isRequired)
474 {
475 value += requiredValue;
476 }
477 else if (isEitherOr)
478 {
479 value += eitherOrValue;
480 }
481 else
482 {
483 // do nothing this is a bad search and warrants no results
484 }
485
486 if (value > 0)
487 {
488 string lowerDescription = tolower(Page::getDescription());
489
490 for (unsigned index = 0; index < required.size(); index++)
491 {
492 if (required[index].find("URL ") == 0)
493 {
494 value += find(required[index].substr(4), lowerDescription,
495 occurrencesDescription);
496 }
497 else if (required[index].find("TITLE ") == 0)
498 {
499 value += find(required[index].substr(6), lowerDescription,
500 occurrencesDescription);
501 }
502 else if (required[index].find("TEXT ") == 0)
503 {
504 value += find(required[index].substr(5), lowerDescription,
505 occurrencesDescription);
506 }
507 else
508 {
509 value += find(required[index], lowerDescription,
510 occurrencesDescription);
511 }
512 }
513
514 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
515 {
516 vector<string> words;
517
518 unsigned begin = 0, found;
519 do
520 {
521 found = eitherOr[index1].find(" OR ", begin);
522
523 if (found != string::npos)
524 {
525 words.push_back(eitherOr[index1].substr(begin, found -
526 begin));
527 }
528 else
529 {
530 words.push_back(eitherOr[index1].substr(begin));
531 }
532
533 begin = found + 4;
534 }
535 while (begin < eitherOr[index1].length() && found !=
536 string::npos);
537
538 for (unsigned number = 0; number < words.size(); number++)
539 {
540 if (words[index1].find("URL ") == 0)
541 {
542 value += find(words[index1].substr(4),
543 lowerDescription, occurrencesDescription);
544 }
545 else if (words[index1].find("TITLE ") == 0)
546 {
547 value += find(words[index1].substr(6),
548 lowerDescription, occurrencesDescription);
549 }
550 else if (words[index1].find("TEXT ") == 0)
551 {
552 value += find(words[index1].substr(5),
553 lowerDescription, occurrencesDescription);
554 }
555 else
556 {
557 value += find(words[index1], lowerDescription,
558 occurrencesDescription);
559 }
560 }
561 }
562
563 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
564 {
565 string lowerHeading = string(getHeadings()[index2].length(),
566 ' ');
567 for (unsigned number = 0; number <
568 getHeadings()[index2].length(); number++)
569 {
570 lowerHeading[number] = tolower(
571 getHeadings()[index2][number]);
572 }
573
574 for (unsigned number0 = 0; number0 < required.size(); number0++)
575 {
576 if (required[number0].find("URL ") == 0)
577 {
578 value += find(required[number0].substr(4),
579 lowerHeading);
580 }
581 else if (required[number0].find("TITLE ") == 0)
582 {
583 value += find(required[number0].substr(6),
584 lowerHeading);
585 }
586 else if (required[number0].find("TEXT ") == 0)
587 {
588 value += find(required[number0].substr(5),
589 lowerHeading);
590 }
591 else
592 {
593 value += find(required[number0], lowerHeading);
594 }
595 }
596
597 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
598 {
599 vector<string> words;
600
601 unsigned begin = 0, found;
602 do
603 {
604 found = eitherOr[number1].find(" OR ", begin);
605
606 if (found != string::npos)
607 {
608 words.push_back(eitherOr[number1].substr(begin,
609 found - begin));
610 }
611 else
612 {
613 words.push_back(eitherOr[number1].substr(begin));
614 }
615
616 begin = found + 4;
617 }
618 while (begin < eitherOr[number1].length() && found !=
619 string::npos);
620
621 for (unsigned number = 0; number < words.size(); number++)
622 {
623 if (words[number].find("URL ") == 0)
624 {
625 value += find(words[number].substr(4),
626 lowerHeading);
627 }
628 else if (words[number].find("TITLE ") == 0)
629 {
630 value += find(words[number].substr(6),
631 lowerHeading);
632 }
633 else if (words[number].find("TEXT ") == 0)
634 {
635 value += find(words[number].substr(5),
636 lowerHeading);
637 }
638 else
639 {
640 value += find(words[number], lowerHeading);
641 }
642 }
643 }
644 }
645 }
646 }
647 }
648
649 void Ranker::checkRequired()
650 {
651 vector<unsigned> inURLs, inTitles, inTexts;
652
653 for (unsigned index = 0; index < required.size(); index++)
654 {
655 unsigned inURL = 0, inTitle = 0, inText = 0;
656
657 if (required[index].find("URL ") == 0)
658 {
659 inURL = find(required[index].substr(4), lowerURL.substr(7));
660
661 if (inURL)
662 {
663 inTitle = find(required[index].substr(4), lowerTitle,
664 occurrencesTitle);
665 inText = find(required[index].substr(4), lowerText,
666 occurrencesText);
667
668 if (!inTitle) inTitle++;
669 if (!inText) inText++;
670 }
671 }
672 else if (required[index].find("TITLE ") == 0)
673 {
674 inTitle = find(required[index].substr(6), lowerTitle,
675 occurrencesTitle);
676
677 if (inTitle)
678 {
679 inURL = find(required[index].substr(6), lowerURL.substr(7));
680 inText = find(required[index].substr(6), lowerText,
681 occurrencesText);
682
683 if (!inURL) inURL++;
684 if (!inText) inText++;
685 }
686 }
687 else if (required[index].find("TEXT ") == 0)
688 {
689 inText = find(required[index].substr(5), lowerText,
690 occurrencesText);
691
692 if (inText)
693 {
694 inURL = find(required[index].substr(5), lowerURL.substr(7));
695 inTitle = find(required[index].substr(5), lowerTitle,
696 occurrencesTitle);
697
698 if (!inURL) inURL++;
699 if (!inTitle) inTitle++;
700 }
701 }
702 else
703 {
704 inURL = find(required[index], lowerURL.substr(7));
705 inTitle = find(required[index], lowerTitle, occurrencesTitle);
706 inText = find(required[index], lowerText, occurrencesText);
707 }
708
709 inURLs.push_back(inURL);
710 inTitles.push_back(inTitle);
711 inTexts.push_back(inText);
712 }
713
714 unsigned inURL = evaluate(inURLs);
715 unsigned inTitle = evaluate(inTitles);
716 unsigned inText = evaluate(inTexts);
717
718 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
719 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
720 inText : 0;
721 }
722
723 void Ranker::checkExcluded()
724 {
725 vector<unsigned> inURLs, inTitles, inTexts;
726
727 for (unsigned index = 0; index < excluded.size(); index++)
728 {
729 unsigned inURL = 0, inTitle = 0, inText = 0;
730
731 inURL = find(excluded[index], lowerURL.substr(7));
732 inTitle = find(excluded[index], lowerTitle);
733 inText = find(excluded[index], lowerText);
734
735 inURLs.push_back(inURL);
736 inTitles.push_back(inTitle);
737 inTexts.push_back(inText);
738 }
739
740 unsigned inURL = evaluate(inURLs);
741 unsigned inTitle = evaluate(inTitles);
742 unsigned inText = evaluate(inTexts);
743
744 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
745 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
746 inText : 0;
747 }
748
749 void Ranker::checkEitherOr()
750 {
751 vector<unsigned> inURLs, inTitles, inTexts;
752
753 for (unsigned index = 0; index < eitherOr.size(); index++)
754 {
755 vector<unsigned> inURLz, inTitlez, inTextz;
756 unsigned inURL = 0, inTitle = 0, inText = 0;
757 vector<string> words;
758
759 unsigned begin = 0, found;
760 do
761 {
762 found = eitherOr[index].find(" OR ", begin);
763
764 if (found != string::npos)
765 {
766 words.push_back(eitherOr[index].substr(begin, found - begin));
767 }
768 else
769 {
770 words.push_back(eitherOr[index].substr(begin));
771 }
772
773 begin = found + 4;
774 }
775 while (begin < eitherOr[index].length() && found != string::npos);
776
777 for (unsigned number = 0; number < words.size(); number++)
778 {
779 unsigned inURL = 0, inTitle = 0, inText = 0;
780
781 if (words[number].find("URL ") == 0)
782 {
783 inURL = find(words[number].substr(4), lowerURL.substr(7));
784
785 if (inURL)
786 {
787 inTitle = find(words[number].substr(4), lowerTitle,
788 occurrencesTitle);
789 inText = find(words[number].substr(4), lowerText,
790 occurrencesText);
791
792 if (!inTitle) inTitle++;
793 if (!inText) inText++;
794 }
795 }
796 else if (words[number].find("TITLE ") == 0)
797 {
798 inTitle = find(words[number].substr(6), lowerTitle,
799 occurrencesTitle);
800
801 if (inTitle)
802 {
803 inURL = find(words[number].substr(6), lowerURL.substr(7));
804 inText = find(words[number].substr(6), lowerText,
805 occurrencesText);
806
807 if (!inURL) inURL++;
808 if (!inText) inText++;
809 }
810 }
811 else if (words[number].find("TEXT ") == 0)
812 {
813 inText = find(words[number].substr(5), lowerText,
814 occurrencesText);
815
816 if (inText)
817 {
818 inURL = find(words[number].substr(5), lowerURL.substr(7));
819 inTitle = find(words[number].substr(5), lowerTitle,
820 occurrencesTitle);
821
822 if (!inURL) inURL++;
823 if (!inTitle) inTitle++;
824 }
825 }
826 else
827 {
828 inURL = find(words[number], lowerURL.substr(7));
829 inTitle = find(words[number], lowerTitle, occurrencesTitle);
830 inText = find(words[number], lowerText, occurrencesText);
831 }
832
833 inURLz.push_back(inURL);
834 inTitlez.push_back(inTitle);
835 inTextz.push_back(inText);
836 }
837
838 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
839 {
840 inURL += inURLz[number0];
841 }
842
843 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
844 {
845 inTitle += inTitlez[number1];
846 }
847
848 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
849 {
850 inText += inTextz[number2];
851 }
852
853 inURLs.push_back(inURL);
854 inTitles.push_back(inTitle);
855 inTexts.push_back(inText);
856
857 inURLz.clear();
858 inTitlez.clear();
859 inTextz.clear();
860 words.clear();
861 }
862
863 unsigned inURL = evaluate(inURLs);
864 unsigned inTitle = evaluate(inTitles);
865 unsigned inText = evaluate(inTexts);
866
867 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
868 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
869 inText : 0;
870 }
871
872 unsigned Ranker::find(string word, const string& where)
873 {
874 unsigned value = 0;
875
876 decrap(word);
877
878 if (word == "")
879 {
880 // this can happen if a word is all crap characters
881 value++;
882 }
883 else if (word.find_first_of(" \n ") == string::npos)
884 {
885 unsigned begin = 0, found;
886 do
887 {
888 found = where.find(word, begin);
889
890 if (found != string::npos)
891 {
892 bool isBefore, isAfter, before = false, after = false;
893 isBefore = found - 1 > 0;
894 isAfter = found + word.length() < where.length();
895
896 if (isBefore) before = isalnum(where[found - 1]) != 0;
897 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
898
899 if (!before && !after)
900 {
901 value++;
902 }
903 }
904
905 begin = found + word.length();
906 }
907 while (found != string::npos && begin < where.length());
908 }
909 else
910 {
911 value = phrase(word, where);
912 }
913
914 return value;
915 }
916
917 unsigned Ranker::find(string word, const string& where, map<unsigned,
918 unsigned>& occurrences)
919 {
920 unsigned value = 0;
921
922 decrap(word);
923
924 if (word == "")
925 {
926 // this can happen if a word is all crap characters
927 value++;
928 }
929 else if (word.find_first_of(" \n ") == string::npos)
930 {
931 unsigned begin = 0, found;
932 do
933 {
934 found = where.find(word, begin);
935
936 if (found != string::npos)
937 {
938 bool isBefore, isAfter, before = false, after = false;
939 isBefore = found - 1 > 0;
940 isAfter = found + word.length() < where.length();
941
942 if (isBefore) before = isalnum(where[found - 1]) != 0;
943 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
944
945 if (!before && !after)
946 {
947 value++;
948
949 occurrences.insert(pair<unsigned, unsigned>(found,
950 word.length()));
951 }
952 }
953
954 begin = found + word.length();
955 }
956 while (found != string::npos && begin < where.length());
957 }
958 else
959 {
960 value = phrase(word, where, occurrences);
961 }
962
963 return value;
964 }
965
966 unsigned Ranker::phrase(const string& phrase, const string& where)
967 {
968 unsigned value = 0;
969 vector<string> words;
970
971 unsigned begin = 0, space;
972 do
973 {
974 space = phrase.find(' ', begin);
975
976 words.push_back(phrase.substr(begin, space - begin));
977
978 begin = space + 1;
979 }
980 while (space != string::npos && begin < phrase.length());
981
982 begin = 0;
983 unsigned counter = 0;
984 do
985 {
986 value += this->phrase(words, 0, begin, true, where);
987 }
988 while (begin < where.length());
989
990 return value;
991 }
992
993 unsigned Ranker::phrase(const string& phrase, const string& where,
994 map<unsigned, unsigned>& occurrences)
995 {
996 unsigned value = 0;
997 vector<string> words;
998
999 unsigned begin = 0, space;
1000 do
1001 {
1002 space = phrase.find(' ', begin);
1003
1004 words.push_back(phrase.substr(begin, space - begin));
1005
1006 begin = space + 1;
1007 }
1008 while (space != string::npos && begin < phrase.length());
1009
1010 begin = 0;
1011 do
1012 {
1013 value += this->phrase(words, 0, begin, true, where, occurrences);
1014 }
1015 while (begin < where.length());
1016
1017 return value;
1018 }
1019
1020 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1021 begin, bool start, const string& where)
1022 {
1023 unsigned value = 0;
1024 bool end = !(word + 1 < words.size());
1025 unsigned found = where.find(words[word], begin);
1026 unsigned newBegin = found + words[word].length();
1027
1028 if (found != string::npos)
1029 {
1030 bool isBefore, isAfter, before = false, after = false;
1031 isBefore = found - 1 > 0;
1032 isAfter = found + words[word].length() < where.length();
1033
1034 if (isBefore) before = isalnum(where[found - 1]) != 0;
1035 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1036
1037 if (!before && !after)
1038 {
1039 bool between = true;
1040 if (!start)
1041 {
1042 for (unsigned index = begin + 1; index < found - 1; index++)
1043 {
1044 if (isalnum(where[index]))
1045 {
1046 between = false;
1047 break;
1048 }
1049 }
1050 }
1051
1052 if (between)
1053 {
1054 if (end)
1055 {
1056 begin = newBegin;
1057 value = 1;
1058 }
1059 else
1060 {
1061 value = phrase(words, (word + 1), newBegin, false, where);
1062 }
1063 }
1064 }
1065 }
1066
1067 if (start)
1068 {
1069 if (found != string::npos)
1070 {
1071 begin = newBegin;
1072 }
1073 else
1074 {
1075 begin = string::npos;
1076 }
1077 }
1078
1079 return value;
1080 }
1081
1082 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1083 begin, bool start, const string& where, map<unsigned, unsigned>&
1084 occurrences)
1085 {
1086 unsigned value = 0;
1087 bool end = !(word + 1 < words.size());
1088 unsigned found = where.find(words[word], begin);
1089 unsigned newBegin = found + words[word].length();
1090
1091 if (found != string::npos)
1092 {
1093 bool isBefore, isAfter, before = false, after = false;
1094 isBefore = found - 1 > 0;
1095 isAfter = found + words[word].length() < where.length();
1096
1097 if (isBefore) before = isalnum(where[found - 1]) != 0;
1098 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1099
1100 if (!before && !after)
1101 {
1102 bool between = true;
1103 if (!start)
1104 {
1105 for (unsigned index = begin + 1; index < found - 1; index++)
1106 {
1107 if (isalnum(where[index]))
1108 {
1109 between = false;
1110 break;
1111 }
1112 }
1113 }
1114
1115 if (between)
1116 {
1117 occurrences.insert(pair<unsigned, unsigned>(found,
1118 words[word].length()));
1119
1120 if (end)
1121 {
1122 begin = newBegin;
1123 value = 1;
1124 }
1125 else
1126 {
1127 value = phrase(words, (word + 1), newBegin, false, where,
1128 occurrences);
1129 }
1130 }
1131 }
1132 }
1133
1134 if (start)
1135 {
1136 if (found != string::npos)
1137 {
1138 begin = newBegin;
1139 }
1140 else
1141 {
1142 begin = string::npos;
1143 }
1144 }
1145
1146 return value;
1147 }
1148
1149 unsigned Ranker::evaluate(vector<unsigned>& ins)
1150 {
1151 unsigned in = 0;
1152
1153 for (unsigned index = 0; index < ins.size(); index++)
1154 {
1155 if (ins[index] > 0)
1156 {
1157 in += ins[index];
1158 }
1159 else
1160 {
1161 in = 0;
1162 break;
1163 }
1164 }
1165
1166 return in;
1167 }
1168
1169 void Ranker::decrap(string& crap)
1170 {
1171 unsigned begin = 0, found;
1172 do
1173 {
1174 // &, _, +, and # are not considered crap
1175 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1176
1177 if (found != string::npos)
1178 {
1179 crap[found] = ' ';
1180 }
1181
1182 begin = found + 1;
1183 }
1184 while (found != string::npos && begin < crap.length());
1185
1186 normalize(crap);
1187 }