ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 339
Committed: 2004-04-16T14:45:30-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 26816 byte(s)
Log Message:
Moof!

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Ranker.hpp"
52
53 Ranker::Ranker(Page& page) : Page(page)
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 void Ranker::rank(vector<string> query)
63 {
64 vector<string> prep;
65
66 for (unsigned index = 0; index < query.size(); index++)
67 {
68 if (query[index] == "allintitle:" && index == 0)
69 {
70 allIn = title;
71 }
72 else if (query[index] == "allinurl:" && index == 0)
73 {
74 allIn = url;
75 }
76 else if (query[index] == "allintext:" && index == 0)
77 {
78 allIn = text;
79 }
80 else if (query[index].find("site:") == 0 && query[index].size() > 5)
81 {
82 site = query[index].substr(5);
83 }
84 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
85 {
86 prep.push_back("TITLE " + query[index].substr(8));
87 }
88 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
89 {
90 prep.push_back("URL " + query[index].substr(6));
91 }
92 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
93 {
94 prep.push_back("TEXT " + query[index].substr(7));
95 }
96 else
97 {
98 prep.push_back(query[index]);
99 }
100 }
101
102 if (prep.size() > 0)
103 {
104 bool or_ = false;
105 for (unsigned index = 0; index < prep.size(); index++)
106 {
107 bool exclude = false;
108 if (prep[index].find('+') == 0)
109 {
110 prep[index].erase(0, 1);
111 }
112 else if (prep[index].find('-') == 0)
113 {
114 exclude = true;
115 prep[index].erase(0, 1);
116 }
117
118 if (or_)
119 {
120 if (prep[index].find(" OR") == string::npos)
121 {
122 or_ = false;
123 }
124
125 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
126 }
127 else if (exclude)
128 {
129 excluded.push_back(prep[index]);
130 }
131 else if (prep[index].find(" OR") != string::npos)
132 {
133 or_ = true;
134 eitherOr.push_back(prep[index]);
135 }
136 else
137 {
138 required.push_back(prep[index]);
139 }
140 }
141 }
142
143 rank();
144 }
145
146 void Ranker::setSample()
147 {
148 map<unsigned, unsigned>::iterator itor;
149
150 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
151
152 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
153 {
154 unsigned distance;
155
156 if (++itor != occurrencesText.end())
157 {
158 unsigned next = itor->first;
159 itor--;
160
161 distance = next - (itor->first + itor->second);
162 }
163 else
164 {
165 distance = string::npos;
166 itor--;
167 }
168
169 distances.insert(pair<unsigned, map<unsigned,
170 unsigned>::iterator>(distance, itor));
171 }
172
173 if (distances.begin() != distances.end())
174 {
175 itor = distances.begin()->second;
176 }
177
178 string portion;
179 unsigned sampleLength = 0, begin = 0, end = string::npos;
180 while (sampleLength < 160 && itor != occurrencesText.end())
181 {
182 unsigned found = itor->first;
183 unsigned length = itor->second;
184
185 for (unsigned index = found; index > begin; index--)
186 {
187 if (index == begin) cerr << "Oh crap, I'm insane!\n";
188 if (found - index >= 160 - sampleLength - length)
189 {
190 for (; index < found; index++)
191 {
192 if (isspace(getText()[index])) break;
193 }
194 begin = index + 1;
195 break;
196 }
197 else if ((index > begin ? (isupper(getText()[index]) &&
198 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
199 index != found)
200 {
201 begin = index;
202 break;
203 }
204 }
205
206 if (end + 1 != begin) sample += " <strong>...</strong> ";
207
208 portion = getText().substr(begin, found - begin);
209 sampleLength += portion.length();
210
211 entities(portion, '&', "&amp;");
212 entities(portion, '\"', "&quot;");
213 entities(portion, '<', "&lt;");
214 entities(portion, '>', "&gt;");
215
216 sample += portion + "<strong>";
217
218 portion = getText().substr(found, length);
219 sampleLength += portion.length();
220
221 entities(portion, '&', "&amp;");
222 entities(portion, '\"', "&quot;");
223 entities(portion, '<', "&lt;");
224 entities(portion, '>', "&gt;");
225
226 sample += portion + "</strong>";
227
228 begin = found + length;
229 end = begin - 1;
230
231 if (++itor != occurrencesText.end())
232 {
233 if (itor->first + itor->second < begin + 160 - sampleLength)
234 {
235 portion = getText().substr(begin, itor->first - begin);
236 sampleLength += portion.length();
237
238 entities(portion, '&', "&amp;");
239 entities(portion, '\"', "&quot;");
240 entities(portion, '<', "&lt;");
241 entities(portion, '>', "&gt;");
242
243 sample += portion;
244
245 begin = itor->first;
246 end = begin - 1;
247 }
248 else
249 {
250 for (end = begin + 160 - sampleLength; end > begin; end--)
251 {
252 if (isspace(getText()[end])) break;
253 }
254
255 portion = getText().substr(begin, end - begin + 1);
256 sampleLength += portion.length();
257
258 entities(portion, '&', "&amp;");
259 entities(portion, '\"', "&quot;");
260 entities(portion, '<', "&lt;");
261 entities(portion, '>', "&gt;");
262
263 sample += portion + " <strong>...</strong>";
264
265 break;
266 }
267 }
268 else
269 {
270 for (end = begin + 160 - sampleLength; end > begin && (end + 1 <
271 getText().length()); end--)
272 {
273 if (isspace(getText()[end])) break;
274 }
275
276 if (end >= getText().length()) end = getText().length() - 1;
277
278 portion = getText().substr(begin, end - begin + 1);
279 sampleLength += portion.length();
280
281 entities(portion, '&', "&amp;");
282 entities(portion, '\"', "&quot;");
283 entities(portion, '<', "&lt;");
284 entities(portion, '>', "&gt;");
285
286 sample += portion;
287
288 if (end + 1 < getText().length())
289 {
290 sample += " <strong>...</strong>";
291 }
292
293 break;
294 }
295 }
296 }
297
298 string Ranker::getTitle()
299 {
300 string title, portion;
301
302 unsigned begin = 0;
303 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
304 itor != occurrencesTitle.end(); itor++)
305 {
306 unsigned found = itor->first;
307 unsigned length = itor->second;
308
309 portion = Page::getTitle().substr(begin, found - begin);
310
311 entities(portion, '&', "&amp;");
312 entities(portion, '\"', "&quot;");
313 entities(portion, '<', "&lt;");
314 entities(portion, '>', "&gt;");
315
316 title += portion + "<strong>";
317
318 portion = Page::getTitle().substr(found, length);
319
320 entities(portion, '&', "&amp;");
321 entities(portion, '\"', "&quot;");
322 entities(portion, '<', "&lt;");
323 entities(portion, '>', "&gt;");
324
325 title += portion + "</strong>";
326
327 begin = found + length;
328 }
329
330 portion = Page::getTitle().substr(begin);
331
332 entities(portion, '&', "&amp;");
333 entities(portion, '\"', "&quot;");
334 entities(portion, '<', "&lt;");
335 entities(portion, '>', "&gt;");
336
337 title += portion;
338
339 return title;
340 }
341
342 string Ranker::getDescription()
343 {
344 string description, portion;
345
346 unsigned begin = 0;
347 for (map<unsigned, unsigned>::iterator itor =
348 occurrencesDescription.begin(); itor != occurrencesDescription.end();
349 itor++)
350 {
351 unsigned found = itor->first;
352 unsigned length = itor->second;
353
354 portion = Page::getDescription().substr(begin, found - begin);
355
356 entities(portion, '&', "&amp;");
357 entities(portion, '\"', "&quot;");
358 entities(portion, '<', "&lt;");
359 entities(portion, '>', "&gt;");
360
361 description += portion + "<strong>";
362
363 portion = Page::getDescription().substr(found, length);
364
365 entities(portion, '&', "&amp;");
366 entities(portion, '\"', "&quot;");
367 entities(portion, '<', "&lt;");
368 entities(portion, '>', "&gt;");
369
370 description += portion + "</strong>";
371
372 begin = found + length;
373 }
374
375 portion = Page::getDescription().substr(begin);
376
377 entities(portion, '&', "&amp;");
378 entities(portion, '\"', "&quot;");
379 entities(portion, '<', "&lt;");
380 entities(portion, '>', "&gt;");
381
382 description += portion;
383
384 return description;
385 }
386
387 bool Ranker::operator==(const unsigned number) const
388 {
389 return value == number;
390 }
391
392 bool Ranker::operator==(const Ranker& ranker) const
393 {
394 return value == ranker.value;
395 }
396
397 bool Ranker::operator!=(const unsigned number) const
398 {
399 return value != number;
400 }
401
402 bool Ranker::operator!=(const Ranker& ranker) const
403 {
404 return value != ranker.value;
405 }
406
407 bool Ranker::operator<(const unsigned number) const
408 {
409 return value < number;
410 }
411
412 bool Ranker::operator<(const Ranker& ranker) const
413 {
414 return value < ranker.value;
415 }
416
417 bool Ranker::operator>(const unsigned number) const
418 {
419 return value > number;
420 }
421
422 bool Ranker::operator >(const Ranker& ranker) const
423 {
424 return value > ranker.value;
425 }
426
427 void Ranker::rank()
428 {
429 lowerAddress = tolower(getAddress());
430
431 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
432 site.length())
433 {
434 bool isRequired = required.size() > 0;
435 bool isExcluded = excluded.size() > 0;
436 bool isEitherOr = eitherOr.size() > 0;
437
438 lowerURL = tolower(getURL());
439 lowerTitle = tolower(Page::getTitle());
440 lowerText = tolower(Page::getText());
441
442 if (isRequired) checkRequired();
443 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
444 if (isEitherOr) checkEitherOr();
445
446 if (isRequired && isExcluded && isEitherOr)
447 {
448 value += requiredValue && !excludedValue && eitherOrValue ?
449 requiredValue + eitherOrValue : 0;
450 }
451 else if (isRequired && isExcluded)
452 {
453 value += requiredValue && !excludedValue ? requiredValue : 0;
454 }
455 else if (isRequired && isEitherOr)
456 {
457 value += requiredValue && eitherOrValue ? requiredValue +
458 eitherOrValue : 0;
459 }
460 else if (isExcluded && isEitherOr)
461 {
462 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
463 }
464 else if (isRequired)
465 {
466 value += requiredValue;
467 }
468 else if (isEitherOr)
469 {
470 value += eitherOrValue;
471 }
472 else
473 {
474 // do nothing this is a bad search and warrants no results
475 }
476
477 if (value > 0)
478 {
479 string lowerDescription = tolower(Page::getDescription());
480
481 for (unsigned index = 0; index < required.size(); index++)
482 {
483 if (required[index].find("URL ") == 0)
484 {
485 value += find(required[index].substr(4), lowerDescription,
486 occurrencesDescription);
487 }
488 else if (required[index].find("TITLE ") == 0)
489 {
490 value += find(required[index].substr(6), lowerDescription,
491 occurrencesDescription);
492 }
493 else if (required[index].find("TEXT ") == 0)
494 {
495 value += find(required[index].substr(5), lowerDescription,
496 occurrencesDescription);
497 }
498 else
499 {
500 value += find(required[index], lowerDescription,
501 occurrencesDescription);
502 }
503 }
504
505 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
506 {
507 vector<string> words;
508
509 unsigned begin = 0, found;
510 do
511 {
512 found = eitherOr[index1].find(" OR ", begin);
513
514 if (found != string::npos)
515 {
516 words.push_back(eitherOr[index1].substr(begin, found -
517 begin));
518 }
519 else
520 {
521 words.push_back(eitherOr[index1].substr(begin));
522 }
523
524 begin = found + 4;
525 }
526 while (begin < eitherOr[index1].length() && found !=
527 string::npos);
528
529 for (unsigned number = 0; number < words.size(); number++)
530 {
531 if (words[index1].find("URL ") == 0)
532 {
533 value += find(words[index1].substr(4),
534 lowerDescription, occurrencesDescription);
535 }
536 else if (words[index1].find("TITLE ") == 0)
537 {
538 value += find(words[index1].substr(6),
539 lowerDescription, occurrencesDescription);
540 }
541 else if (words[index1].find("TEXT ") == 0)
542 {
543 value += find(words[index1].substr(5),
544 lowerDescription, occurrencesDescription);
545 }
546 else
547 {
548 value += find(words[index1], lowerDescription,
549 occurrencesDescription);
550 }
551 }
552 }
553
554 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
555 {
556 string lowerHeading = string(getHeadings()[index2].length(),
557 ' ');
558 for (unsigned number = 0; number <
559 getHeadings()[index2].length(); number++)
560 {
561 lowerHeading[number] = tolower(
562 getHeadings()[index2][number]);
563 }
564
565 for (unsigned number0 = 0; number0 < required.size(); number0++)
566 {
567 if (required[number0].find("URL ") == 0)
568 {
569 value += find(required[number0].substr(4),
570 lowerHeading);
571 }
572 else if (required[number0].find("TITLE ") == 0)
573 {
574 value += find(required[number0].substr(6),
575 lowerHeading);
576 }
577 else if (required[number0].find("TEXT ") == 0)
578 {
579 value += find(required[number0].substr(5),
580 lowerHeading);
581 }
582 else
583 {
584 value += find(required[number0], lowerHeading);
585 }
586 }
587
588 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
589 {
590 vector<string> words;
591
592 unsigned begin = 0, found;
593 do
594 {
595 found = eitherOr[number1].find(" OR ", begin);
596
597 if (found != string::npos)
598 {
599 words.push_back(eitherOr[number1].substr(begin,
600 found - begin));
601 }
602 else
603 {
604 words.push_back(eitherOr[number1].substr(begin));
605 }
606
607 begin = found + 4;
608 }
609 while (begin < eitherOr[number1].length() && found !=
610 string::npos);
611
612 for (unsigned number = 0; number < words.size(); number++)
613 {
614 if (words[number].find("URL ") == 0)
615 {
616 value += find(words[number].substr(4),
617 lowerHeading);
618 }
619 else if (words[number].find("TITLE ") == 0)
620 {
621 value += find(words[number].substr(6),
622 lowerHeading);
623 }
624 else if (words[number].find("TEXT ") == 0)
625 {
626 value += find(words[number].substr(5),
627 lowerHeading);
628 }
629 else
630 {
631 value += find(words[number], lowerHeading);
632 }
633 }
634 }
635 }
636 }
637 }
638 }
639
640 void Ranker::checkRequired()
641 {
642 vector<unsigned> inURLs, inTitles, inTexts;
643
644 for (unsigned index = 0; index < required.size(); index++)
645 {
646 unsigned inURL = 0, inTitle = 0, inText = 0;
647
648 if (required[index].find("URL ") == 0)
649 {
650 inURL = find(required[index].substr(4), lowerURL.substr(7));
651
652 if (inURL)
653 {
654 inTitle = find(required[index].substr(4), lowerTitle,
655 occurrencesTitle);
656 inText = find(required[index].substr(4), lowerText,
657 occurrencesText);
658
659 if (!inTitle) inTitle++;
660 if (!inText) inText++;
661 }
662 }
663 else if (required[index].find("TITLE ") == 0)
664 {
665 inTitle = find(required[index].substr(6), lowerTitle,
666 occurrencesTitle);
667
668 if (inTitle)
669 {
670 inURL = find(required[index].substr(6), lowerURL.substr(7));
671 inText = find(required[index].substr(6), lowerText,
672 occurrencesText);
673
674 if (!inURL) inURL++;
675 if (!inText) inText++;
676 }
677 }
678 else if (required[index].find("TEXT ") == 0)
679 {
680 inText = find(required[index].substr(5), lowerText,
681 occurrencesText);
682
683 if (inText)
684 {
685 inURL = find(required[index].substr(5), lowerURL.substr(7));
686 inTitle = find(required[index].substr(5), lowerTitle,
687 occurrencesTitle);
688
689 if (!inURL) inURL++;
690 if (!inTitle) inTitle++;
691 }
692 }
693 else
694 {
695 inURL = find(required[index], lowerURL.substr(7));
696 inTitle = find(required[index], lowerTitle, occurrencesTitle);
697 inText = find(required[index], lowerText, occurrencesText);
698 }
699
700 inURLs.push_back(inURL);
701 inTitles.push_back(inTitle);
702 inTexts.push_back(inText);
703 }
704
705 unsigned inURL = evaluate(inURLs);
706 unsigned inTitle = evaluate(inTitles);
707 unsigned inText = evaluate(inTexts);
708
709 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
710 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
711 inText : 0;
712 }
713
714 void Ranker::checkExcluded()
715 {
716 vector<unsigned> inURLs, inTitles, inTexts;
717
718 for (unsigned index = 0; index < excluded.size(); index++)
719 {
720 unsigned inURL = 0, inTitle = 0, inText = 0;
721
722 inURL = find(excluded[index], lowerURL.substr(7));
723 inTitle = find(excluded[index], lowerTitle);
724 inText = find(excluded[index], lowerText);
725
726 inURLs.push_back(inURL);
727 inTitles.push_back(inTitle);
728 inTexts.push_back(inText);
729 }
730
731 unsigned inURL = evaluate(inURLs);
732 unsigned inTitle = evaluate(inTitles);
733 unsigned inText = evaluate(inTexts);
734
735 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
736 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
737 inText : 0;
738 }
739
740 void Ranker::checkEitherOr()
741 {
742 vector<unsigned> inURLs, inTitles, inTexts;
743
744 for (unsigned index = 0; index < eitherOr.size(); index++)
745 {
746 vector<unsigned> inURLz, inTitlez, inTextz;
747 unsigned inURL = 0, inTitle = 0, inText = 0;
748 vector<string> words;
749
750 unsigned begin = 0, found;
751 do
752 {
753 found = eitherOr[index].find(" OR ", begin);
754
755 if (found != string::npos)
756 {
757 words.push_back(eitherOr[index].substr(begin, found - begin));
758 }
759 else
760 {
761 words.push_back(eitherOr[index].substr(begin));
762 }
763
764 begin = found + 4;
765 }
766 while (begin < eitherOr[index].length() && found != string::npos);
767
768 for (unsigned number = 0; number < words.size(); number++)
769 {
770 unsigned inURL = 0, inTitle = 0, inText = 0;
771
772 if (words[number].find("URL ") == 0)
773 {
774 inURL = find(words[number].substr(4), lowerURL.substr(7));
775
776 if (inURL)
777 {
778 inTitle = find(words[number].substr(4), lowerTitle,
779 occurrencesTitle);
780 inText = find(words[number].substr(4), lowerText,
781 occurrencesText);
782
783 if (!inTitle) inTitle++;
784 if (!inText) inText++;
785 }
786 }
787 else if (words[number].find("TITLE ") == 0)
788 {
789 inTitle = find(words[number].substr(6), lowerTitle,
790 occurrencesTitle);
791
792 if (inTitle)
793 {
794 inURL = find(words[number].substr(6), lowerURL.substr(7));
795 inText = find(words[number].substr(6), lowerText,
796 occurrencesText);
797
798 if (!inURL) inURL++;
799 if (!inText) inText++;
800 }
801 }
802 else if (words[number].find("TEXT ") == 0)
803 {
804 inText = find(words[number].substr(5), lowerText,
805 occurrencesText);
806
807 if (inText)
808 {
809 inURL = find(words[number].substr(5), lowerURL.substr(7));
810 inTitle = find(words[number].substr(5), lowerTitle,
811 occurrencesTitle);
812
813 if (!inURL) inURL++;
814 if (!inTitle) inTitle++;
815 }
816 }
817 else
818 {
819 inURL = find(words[number], lowerURL.substr(7));
820 inTitle = find(words[number], lowerTitle, occurrencesTitle);
821 inText = find(words[number], lowerText, occurrencesText);
822 }
823
824 inURLz.push_back(inURL);
825 inTitlez.push_back(inTitle);
826 inTextz.push_back(inText);
827 }
828
829 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
830 {
831 inURL += inURLz[number0];
832 }
833
834 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
835 {
836 inTitle += inTitlez[number1];
837 }
838
839 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
840 {
841 inText += inTextz[number2];
842 }
843
844 inURLs.push_back(inURL);
845 inTitles.push_back(inTitle);
846 inTexts.push_back(inText);
847
848 inURLz.clear();
849 inTitlez.clear();
850 inTextz.clear();
851 words.clear();
852 }
853
854 unsigned inURL = evaluate(inURLs);
855 unsigned inTitle = evaluate(inTitles);
856 unsigned inText = evaluate(inTexts);
857
858 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
859 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
860 inText : 0;
861 }
862
863 unsigned Ranker::find(string word, const string& where)
864 {
865 unsigned value = 0;
866
867 decrap(word);
868
869 if (word == "")
870 {
871 // this can happen if a word is all crap characters
872 value++;
873 }
874 else if (word.find_first_of(" \n ") == string::npos)
875 {
876 unsigned begin = 0, found;
877 do
878 {
879 found = where.find(word, begin);
880
881 if (found != string::npos)
882 {
883 bool isBefore, isAfter, before = false, after = false;
884 isBefore = found - 1 > 0;
885 isAfter = found + word.length() < where.length();
886
887 if (isBefore) before = isalnum(where[found - 1]) != 0;
888 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
889
890 if (!before && !after)
891 {
892 value++;
893 }
894 }
895
896 begin = found + word.length();
897 }
898 while (found != string::npos && begin < where.length());
899 }
900 else
901 {
902 value = phrase(word, where);
903 }
904
905 return value;
906 }
907
908 unsigned Ranker::find(string word, const string& where, map<unsigned,
909 unsigned>& occurrences)
910 {
911 unsigned value = 0;
912
913 decrap(word);
914
915 if (word == "")
916 {
917 // this can happen if a word is all crap characters
918 value++;
919 }
920 else if (word.find_first_of(" \n ") == string::npos)
921 {
922 unsigned begin = 0, found;
923 do
924 {
925 found = where.find(word, begin);
926
927 if (found != string::npos)
928 {
929 bool isBefore, isAfter, before = false, after = false;
930 isBefore = found - 1 > 0;
931 isAfter = found + word.length() < where.length();
932
933 if (isBefore) before = isalnum(where[found - 1]) != 0;
934 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
935
936 if (!before && !after)
937 {
938 value++;
939
940 occurrences.insert(pair<unsigned, unsigned>(found,
941 word.length()));
942 }
943 }
944
945 begin = found + word.length();
946 }
947 while (found != string::npos && begin < where.length());
948 }
949 else
950 {
951 value = phrase(word, where, occurrences);
952 }
953
954 return value;
955 }
956
957 unsigned Ranker::phrase(const string& phrase, const string& where)
958 {
959 unsigned value = 0;
960 vector<string> words;
961
962 unsigned begin = 0, space;
963 do
964 {
965 space = phrase.find(' ', begin);
966
967 words.push_back(phrase.substr(begin, space - begin));
968
969 begin = space + 1;
970 }
971 while (space != string::npos && begin < phrase.length());
972
973 begin = 0;
974 unsigned counter = 0;
975 do
976 {
977 value += this->phrase(words, 0, begin, true, where);
978 }
979 while (begin < where.length());
980
981 return value;
982 }
983
984 unsigned Ranker::phrase(const string& phrase, const string& where,
985 map<unsigned, unsigned>& occurrences)
986 {
987 unsigned value = 0;
988 vector<string> words;
989
990 unsigned begin = 0, space;
991 do
992 {
993 space = phrase.find(' ', begin);
994
995 words.push_back(phrase.substr(begin, space - begin));
996
997 begin = space + 1;
998 }
999 while (space != string::npos && begin < phrase.length());
1000
1001 begin = 0;
1002 do
1003 {
1004 value += this->phrase(words, 0, begin, true, where, occurrences);
1005 }
1006 while (begin < where.length());
1007
1008 return value;
1009 }
1010
1011 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1012 begin, bool start, const string& where)
1013 {
1014 unsigned value = 0;
1015 bool end = !(word + 1 < words.size());
1016 unsigned found = where.find(words[word], begin);
1017 unsigned newBegin = found + words[word].length();
1018
1019 if (found != string::npos)
1020 {
1021 bool isBefore, isAfter, before = false, after = false;
1022 isBefore = found - 1 > 0;
1023 isAfter = found + words[word].length() < where.length();
1024
1025 if (isBefore) before = isalnum(where[found - 1]) != 0;
1026 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1027
1028 if (!before && !after)
1029 {
1030 bool between = true;
1031 if (!start)
1032 {
1033 for (unsigned index = begin + 1; index < found - 1; index++)
1034 {
1035 if (isalnum(where[index]))
1036 {
1037 between = false;
1038 break;
1039 }
1040 }
1041 }
1042
1043 if (between)
1044 {
1045 if (end)
1046 {
1047 begin = newBegin;
1048 value = 1;
1049 }
1050 else
1051 {
1052 value = phrase(words, (word + 1), newBegin, false, where);
1053 }
1054 }
1055 }
1056 }
1057
1058 if (start)
1059 {
1060 if (found != string::npos)
1061 {
1062 begin = newBegin;
1063 }
1064 else
1065 {
1066 begin = string::npos;
1067 }
1068 }
1069
1070 return value;
1071 }
1072
1073 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1074 begin, bool start, const string& where, map<unsigned, unsigned>&
1075 occurrences)
1076 {
1077 unsigned value = 0;
1078 bool end = !(word + 1 < words.size());
1079 unsigned found = where.find(words[word], begin);
1080 unsigned newBegin = found + words[word].length();
1081
1082 if (found != string::npos)
1083 {
1084 bool isBefore, isAfter, before = false, after = false;
1085 isBefore = found - 1 > 0;
1086 isAfter = found + words[word].length() < where.length();
1087
1088 if (isBefore) before = isalnum(where[found - 1]) != 0;
1089 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1090
1091 if (!before && !after)
1092 {
1093 bool between = true;
1094 if (!start)
1095 {
1096 for (unsigned index = begin + 1; index < found - 1; index++)
1097 {
1098 if (isalnum(where[index]))
1099 {
1100 between = false;
1101 break;
1102 }
1103 }
1104 }
1105
1106 if (between)
1107 {
1108 occurrences.insert(pair<unsigned, unsigned>(found,
1109 words[word].length()));
1110
1111 if (end)
1112 {
1113 begin = newBegin;
1114 value = 1;
1115 }
1116 else
1117 {
1118 value = phrase(words, (word + 1), newBegin, false, where,
1119 occurrences);
1120 }
1121 }
1122 }
1123 }
1124
1125 if (start)
1126 {
1127 if (found != string::npos)
1128 {
1129 begin = newBegin;
1130 }
1131 else
1132 {
1133 begin = string::npos;
1134 }
1135 }
1136
1137 return value;
1138 }
1139
1140 unsigned Ranker::evaluate(vector<unsigned>& ins)
1141 {
1142 unsigned in = 0;
1143
1144 for (unsigned index = 0; index < ins.size(); index++)
1145 {
1146 if (ins[index] > 0)
1147 {
1148 in += ins[index];
1149 }
1150 else
1151 {
1152 in = 0;
1153 break;
1154 }
1155 }
1156
1157 return in;
1158 }
1159
1160 void Ranker::decrap(string& crap)
1161 {
1162 unsigned begin = 0, found;
1163 do
1164 {
1165 // &, _, +, and # are not considered crap
1166 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1167
1168 if (found != string::npos)
1169 {
1170 crap[found] = ' ';
1171 }
1172
1173 begin = found + 1;
1174 }
1175 while (found != string::npos && begin < crap.length());
1176
1177 normalize(crap);
1178 }

Properties

Name Value
svn:eol-style native
svn:keywords Id