ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 340
Committed: 2004-04-16T14:57:46-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 26851 byte(s)
Log Message:
Enum!

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Ranker.hpp"
52
53 Ranker::Ranker(Page& page) : Page(page)
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 void Ranker::rank(vector<string> query)
63 {
64 vector<string> prep;
65
66 for (unsigned index = 0; index < query.size(); index++)
67 {
68 if (query[index] == "allintitle:" && index == 0)
69 {
70 allIn = title;
71 }
72 else if (query[index] == "allinurl:" && index == 0)
73 {
74 allIn = url;
75 }
76 else if (query[index] == "allintext:" && index == 0)
77 {
78 allIn = text;
79 }
80 else if (query[index].find("site:") == 0 && query[index].size() > 5)
81 {
82 site = query[index].substr(5);
83 }
84 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
85 {
86 prep.push_back("TITLE " + query[index].substr(8));
87 }
88 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
89 {
90 prep.push_back("URL " + query[index].substr(6));
91 }
92 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
93 {
94 prep.push_back("TEXT " + query[index].substr(7));
95 }
96 else
97 {
98 prep.push_back(query[index]);
99 }
100 }
101
102 if (prep.size() > 0)
103 {
104 bool or_ = false;
105 for (unsigned index = 0; index < prep.size(); index++)
106 {
107 bool exclude = false;
108 if (prep[index].find('+') == 0)
109 {
110 prep[index].erase(0, 1);
111 }
112 else if (prep[index].find('-') == 0)
113 {
114 exclude = true;
115 prep[index].erase(0, 1);
116 }
117
118 if (or_)
119 {
120 if (prep[index].find(" OR") == string::npos)
121 {
122 or_ = false;
123 }
124
125 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
126 }
127 else if (exclude)
128 {
129 excluded.push_back(prep[index]);
130 }
131 else if (prep[index].find(" OR") != string::npos)
132 {
133 or_ = true;
134 eitherOr.push_back(prep[index]);
135 }
136 else
137 {
138 required.push_back(prep[index]);
139 }
140 }
141 }
142
143 rank();
144 }
145
146 void Ranker::setSample()
147 {
148 map<unsigned, unsigned>::iterator itor;
149
150 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
151
152 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
153 {
154 unsigned distance;
155
156 if (++itor != occurrencesText.end())
157 {
158 unsigned next = itor->first;
159 itor--;
160
161 distance = next - (itor->first + itor->second);
162 }
163 else
164 {
165 distance = string::npos;
166 itor--;
167 }
168
169 distances.insert(pair<unsigned, map<unsigned,
170 unsigned>::iterator>(distance, itor));
171 }
172
173 if (distances.begin() != distances.end())
174 {
175 itor = distances.begin()->second;
176 }
177
178 string portion;
179 unsigned sampleLength = 0, begin = 0, end = string::npos;
180 while (sampleLength < sampleMax && itor != occurrencesText.end())
181 {
182 unsigned found = itor->first;
183 unsigned length = itor->second;
184
185 for (unsigned index = found; index > begin; index--)
186 {
187 if (index == begin) cerr << "Oh crap, I'm insane!\n";
188 if (found - index >= sampleMax - sampleLength - length)
189 {
190 for (; index < found; index++)
191 {
192 if (isspace(getText()[index])) break;
193 }
194 begin = index + 1;
195 break;
196 }
197 else if ((index > begin ? (isupper(getText()[index]) &&
198 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
199 index != found)
200 {
201 begin = index;
202 break;
203 }
204 }
205
206 if (end + 1 != begin) sample += " <strong>...</strong> ";
207
208 portion = getText().substr(begin, found - begin);
209 sampleLength += portion.length();
210
211 entities(portion, '&', "&amp;");
212 entities(portion, '\"', "&quot;");
213 entities(portion, '<', "&lt;");
214 entities(portion, '>', "&gt;");
215
216 sample += portion + "<strong>";
217
218 portion = getText().substr(found, length);
219 sampleLength += portion.length();
220
221 entities(portion, '&', "&amp;");
222 entities(portion, '\"', "&quot;");
223 entities(portion, '<', "&lt;");
224 entities(portion, '>', "&gt;");
225
226 sample += portion + "</strong>";
227
228 begin = found + length;
229 end = begin - 1;
230
231 if (++itor != occurrencesText.end())
232 {
233 if (itor->first + itor->second < begin + sampleMax - sampleLength)
234 {
235 portion = getText().substr(begin, itor->first - begin);
236 sampleLength += portion.length();
237
238 entities(portion, '&', "&amp;");
239 entities(portion, '\"', "&quot;");
240 entities(portion, '<', "&lt;");
241 entities(portion, '>', "&gt;");
242
243 sample += portion;
244
245 begin = itor->first;
246 end = begin - 1;
247 }
248 else
249 {
250 for (end = begin + sampleMax - sampleLength; end > begin;
251 end--)
252 {
253 if (isspace(getText()[end])) break;
254 }
255
256 portion = getText().substr(begin, end - begin + 1);
257 sampleLength += portion.length();
258
259 entities(portion, '&', "&amp;");
260 entities(portion, '\"', "&quot;");
261 entities(portion, '<', "&lt;");
262 entities(portion, '>', "&gt;");
263
264 sample += portion + " <strong>...</strong>";
265
266 break;
267 }
268 }
269 else
270 {
271 for (end = begin + sampleMax - sampleLength; end > begin && (end +
272 1 < getText().length()); end--)
273 {
274 if (isspace(getText()[end])) break;
275 }
276
277 if (end >= getText().length()) end = getText().length() - 1;
278
279 portion = getText().substr(begin, end - begin + 1);
280 sampleLength += portion.length();
281
282 entities(portion, '&', "&amp;");
283 entities(portion, '\"', "&quot;");
284 entities(portion, '<', "&lt;");
285 entities(portion, '>', "&gt;");
286
287 sample += portion;
288
289 if (end + 1 < getText().length())
290 {
291 sample += " <strong>...</strong>";
292 }
293
294 break;
295 }
296 }
297 }
298
299 string Ranker::getTitle()
300 {
301 string title, portion;
302
303 unsigned begin = 0;
304 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
305 itor != occurrencesTitle.end(); itor++)
306 {
307 unsigned found = itor->first;
308 unsigned length = itor->second;
309
310 portion = Page::getTitle().substr(begin, found - begin);
311
312 entities(portion, '&', "&amp;");
313 entities(portion, '\"', "&quot;");
314 entities(portion, '<', "&lt;");
315 entities(portion, '>', "&gt;");
316
317 title += portion + "<strong>";
318
319 portion = Page::getTitle().substr(found, length);
320
321 entities(portion, '&', "&amp;");
322 entities(portion, '\"', "&quot;");
323 entities(portion, '<', "&lt;");
324 entities(portion, '>', "&gt;");
325
326 title += portion + "</strong>";
327
328 begin = found + length;
329 }
330
331 portion = Page::getTitle().substr(begin);
332
333 entities(portion, '&', "&amp;");
334 entities(portion, '\"', "&quot;");
335 entities(portion, '<', "&lt;");
336 entities(portion, '>', "&gt;");
337
338 title += portion;
339
340 return title;
341 }
342
343 string Ranker::getDescription()
344 {
345 string description, portion;
346
347 unsigned begin = 0;
348 for (map<unsigned, unsigned>::iterator itor =
349 occurrencesDescription.begin(); itor != occurrencesDescription.end();
350 itor++)
351 {
352 unsigned found = itor->first;
353 unsigned length = itor->second;
354
355 portion = Page::getDescription().substr(begin, found - begin);
356
357 entities(portion, '&', "&amp;");
358 entities(portion, '\"', "&quot;");
359 entities(portion, '<', "&lt;");
360 entities(portion, '>', "&gt;");
361
362 description += portion + "<strong>";
363
364 portion = Page::getDescription().substr(found, length);
365
366 entities(portion, '&', "&amp;");
367 entities(portion, '\"', "&quot;");
368 entities(portion, '<', "&lt;");
369 entities(portion, '>', "&gt;");
370
371 description += portion + "</strong>";
372
373 begin = found + length;
374 }
375
376 portion = Page::getDescription().substr(begin);
377
378 entities(portion, '&', "&amp;");
379 entities(portion, '\"', "&quot;");
380 entities(portion, '<', "&lt;");
381 entities(portion, '>', "&gt;");
382
383 description += portion;
384
385 return description;
386 }
387
388 bool Ranker::operator==(const unsigned number) const
389 {
390 return value == number;
391 }
392
393 bool Ranker::operator==(const Ranker& ranker) const
394 {
395 return value == ranker.value;
396 }
397
398 bool Ranker::operator!=(const unsigned number) const
399 {
400 return value != number;
401 }
402
403 bool Ranker::operator!=(const Ranker& ranker) const
404 {
405 return value != ranker.value;
406 }
407
408 bool Ranker::operator<(const unsigned number) const
409 {
410 return value < number;
411 }
412
413 bool Ranker::operator<(const Ranker& ranker) const
414 {
415 return value < ranker.value;
416 }
417
418 bool Ranker::operator>(const unsigned number) const
419 {
420 return value > number;
421 }
422
423 bool Ranker::operator >(const Ranker& ranker) const
424 {
425 return value > ranker.value;
426 }
427
428 void Ranker::rank()
429 {
430 lowerAddress = tolower(getAddress());
431
432 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
433 site.length())
434 {
435 bool isRequired = required.size() > 0;
436 bool isExcluded = excluded.size() > 0;
437 bool isEitherOr = eitherOr.size() > 0;
438
439 lowerURL = tolower(getURL());
440 lowerTitle = tolower(Page::getTitle());
441 lowerText = tolower(Page::getText());
442
443 if (isRequired) checkRequired();
444 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
445 if (isEitherOr) checkEitherOr();
446
447 if (isRequired && isExcluded && isEitherOr)
448 {
449 value += requiredValue && !excludedValue && eitherOrValue ?
450 requiredValue + eitherOrValue : 0;
451 }
452 else if (isRequired && isExcluded)
453 {
454 value += requiredValue && !excludedValue ? requiredValue : 0;
455 }
456 else if (isRequired && isEitherOr)
457 {
458 value += requiredValue && eitherOrValue ? requiredValue +
459 eitherOrValue : 0;
460 }
461 else if (isExcluded && isEitherOr)
462 {
463 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
464 }
465 else if (isRequired)
466 {
467 value += requiredValue;
468 }
469 else if (isEitherOr)
470 {
471 value += eitherOrValue;
472 }
473 else
474 {
475 // do nothing this is a bad search and warrants no results
476 }
477
478 if (value > 0)
479 {
480 string lowerDescription = tolower(Page::getDescription());
481
482 for (unsigned index = 0; index < required.size(); index++)
483 {
484 if (required[index].find("URL ") == 0)
485 {
486 value += find(required[index].substr(4), lowerDescription,
487 occurrencesDescription);
488 }
489 else if (required[index].find("TITLE ") == 0)
490 {
491 value += find(required[index].substr(6), lowerDescription,
492 occurrencesDescription);
493 }
494 else if (required[index].find("TEXT ") == 0)
495 {
496 value += find(required[index].substr(5), lowerDescription,
497 occurrencesDescription);
498 }
499 else
500 {
501 value += find(required[index], lowerDescription,
502 occurrencesDescription);
503 }
504 }
505
506 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
507 {
508 vector<string> words;
509
510 unsigned begin = 0, found;
511 do
512 {
513 found = eitherOr[index1].find(" OR ", begin);
514
515 if (found != string::npos)
516 {
517 words.push_back(eitherOr[index1].substr(begin, found -
518 begin));
519 }
520 else
521 {
522 words.push_back(eitherOr[index1].substr(begin));
523 }
524
525 begin = found + 4;
526 }
527 while (begin < eitherOr[index1].length() && found !=
528 string::npos);
529
530 for (unsigned number = 0; number < words.size(); number++)
531 {
532 if (words[index1].find("URL ") == 0)
533 {
534 value += find(words[index1].substr(4),
535 lowerDescription, occurrencesDescription);
536 }
537 else if (words[index1].find("TITLE ") == 0)
538 {
539 value += find(words[index1].substr(6),
540 lowerDescription, occurrencesDescription);
541 }
542 else if (words[index1].find("TEXT ") == 0)
543 {
544 value += find(words[index1].substr(5),
545 lowerDescription, occurrencesDescription);
546 }
547 else
548 {
549 value += find(words[index1], lowerDescription,
550 occurrencesDescription);
551 }
552 }
553 }
554
555 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
556 {
557 string lowerHeading = string(getHeadings()[index2].length(),
558 ' ');
559 for (unsigned number = 0; number <
560 getHeadings()[index2].length(); number++)
561 {
562 lowerHeading[number] = tolower(
563 getHeadings()[index2][number]);
564 }
565
566 for (unsigned number0 = 0; number0 < required.size(); number0++)
567 {
568 if (required[number0].find("URL ") == 0)
569 {
570 value += find(required[number0].substr(4),
571 lowerHeading);
572 }
573 else if (required[number0].find("TITLE ") == 0)
574 {
575 value += find(required[number0].substr(6),
576 lowerHeading);
577 }
578 else if (required[number0].find("TEXT ") == 0)
579 {
580 value += find(required[number0].substr(5),
581 lowerHeading);
582 }
583 else
584 {
585 value += find(required[number0], lowerHeading);
586 }
587 }
588
589 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
590 {
591 vector<string> words;
592
593 unsigned begin = 0, found;
594 do
595 {
596 found = eitherOr[number1].find(" OR ", begin);
597
598 if (found != string::npos)
599 {
600 words.push_back(eitherOr[number1].substr(begin,
601 found - begin));
602 }
603 else
604 {
605 words.push_back(eitherOr[number1].substr(begin));
606 }
607
608 begin = found + 4;
609 }
610 while (begin < eitherOr[number1].length() && found !=
611 string::npos);
612
613 for (unsigned number = 0; number < words.size(); number++)
614 {
615 if (words[number].find("URL ") == 0)
616 {
617 value += find(words[number].substr(4),
618 lowerHeading);
619 }
620 else if (words[number].find("TITLE ") == 0)
621 {
622 value += find(words[number].substr(6),
623 lowerHeading);
624 }
625 else if (words[number].find("TEXT ") == 0)
626 {
627 value += find(words[number].substr(5),
628 lowerHeading);
629 }
630 else
631 {
632 value += find(words[number], lowerHeading);
633 }
634 }
635 }
636 }
637 }
638 }
639 }
640
641 void Ranker::checkRequired()
642 {
643 vector<unsigned> inURLs, inTitles, inTexts;
644
645 for (unsigned index = 0; index < required.size(); index++)
646 {
647 unsigned inURL = 0, inTitle = 0, inText = 0;
648
649 if (required[index].find("URL ") == 0)
650 {
651 inURL = find(required[index].substr(4), lowerURL.substr(7));
652
653 if (inURL)
654 {
655 inTitle = find(required[index].substr(4), lowerTitle,
656 occurrencesTitle);
657 inText = find(required[index].substr(4), lowerText,
658 occurrencesText);
659
660 if (!inTitle) inTitle++;
661 if (!inText) inText++;
662 }
663 }
664 else if (required[index].find("TITLE ") == 0)
665 {
666 inTitle = find(required[index].substr(6), lowerTitle,
667 occurrencesTitle);
668
669 if (inTitle)
670 {
671 inURL = find(required[index].substr(6), lowerURL.substr(7));
672 inText = find(required[index].substr(6), lowerText,
673 occurrencesText);
674
675 if (!inURL) inURL++;
676 if (!inText) inText++;
677 }
678 }
679 else if (required[index].find("TEXT ") == 0)
680 {
681 inText = find(required[index].substr(5), lowerText,
682 occurrencesText);
683
684 if (inText)
685 {
686 inURL = find(required[index].substr(5), lowerURL.substr(7));
687 inTitle = find(required[index].substr(5), lowerTitle,
688 occurrencesTitle);
689
690 if (!inURL) inURL++;
691 if (!inTitle) inTitle++;
692 }
693 }
694 else
695 {
696 inURL = find(required[index], lowerURL.substr(7));
697 inTitle = find(required[index], lowerTitle, occurrencesTitle);
698 inText = find(required[index], lowerText, occurrencesText);
699 }
700
701 inURLs.push_back(inURL);
702 inTitles.push_back(inTitle);
703 inTexts.push_back(inText);
704 }
705
706 unsigned inURL = evaluate(inURLs);
707 unsigned inTitle = evaluate(inTitles);
708 unsigned inText = evaluate(inTexts);
709
710 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
711 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
712 inText : 0;
713 }
714
715 void Ranker::checkExcluded()
716 {
717 vector<unsigned> inURLs, inTitles, inTexts;
718
719 for (unsigned index = 0; index < excluded.size(); index++)
720 {
721 unsigned inURL = 0, inTitle = 0, inText = 0;
722
723 inURL = find(excluded[index], lowerURL.substr(7));
724 inTitle = find(excluded[index], lowerTitle);
725 inText = find(excluded[index], lowerText);
726
727 inURLs.push_back(inURL);
728 inTitles.push_back(inTitle);
729 inTexts.push_back(inText);
730 }
731
732 unsigned inURL = evaluate(inURLs);
733 unsigned inTitle = evaluate(inTitles);
734 unsigned inText = evaluate(inTexts);
735
736 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
737 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
738 inText : 0;
739 }
740
741 void Ranker::checkEitherOr()
742 {
743 vector<unsigned> inURLs, inTitles, inTexts;
744
745 for (unsigned index = 0; index < eitherOr.size(); index++)
746 {
747 vector<unsigned> inURLz, inTitlez, inTextz;
748 unsigned inURL = 0, inTitle = 0, inText = 0;
749 vector<string> words;
750
751 unsigned begin = 0, found;
752 do
753 {
754 found = eitherOr[index].find(" OR ", begin);
755
756 if (found != string::npos)
757 {
758 words.push_back(eitherOr[index].substr(begin, found - begin));
759 }
760 else
761 {
762 words.push_back(eitherOr[index].substr(begin));
763 }
764
765 begin = found + 4;
766 }
767 while (begin < eitherOr[index].length() && found != string::npos);
768
769 for (unsigned number = 0; number < words.size(); number++)
770 {
771 unsigned inURL = 0, inTitle = 0, inText = 0;
772
773 if (words[number].find("URL ") == 0)
774 {
775 inURL = find(words[number].substr(4), lowerURL.substr(7));
776
777 if (inURL)
778 {
779 inTitle = find(words[number].substr(4), lowerTitle,
780 occurrencesTitle);
781 inText = find(words[number].substr(4), lowerText,
782 occurrencesText);
783
784 if (!inTitle) inTitle++;
785 if (!inText) inText++;
786 }
787 }
788 else if (words[number].find("TITLE ") == 0)
789 {
790 inTitle = find(words[number].substr(6), lowerTitle,
791 occurrencesTitle);
792
793 if (inTitle)
794 {
795 inURL = find(words[number].substr(6), lowerURL.substr(7));
796 inText = find(words[number].substr(6), lowerText,
797 occurrencesText);
798
799 if (!inURL) inURL++;
800 if (!inText) inText++;
801 }
802 }
803 else if (words[number].find("TEXT ") == 0)
804 {
805 inText = find(words[number].substr(5), lowerText,
806 occurrencesText);
807
808 if (inText)
809 {
810 inURL = find(words[number].substr(5), lowerURL.substr(7));
811 inTitle = find(words[number].substr(5), lowerTitle,
812 occurrencesTitle);
813
814 if (!inURL) inURL++;
815 if (!inTitle) inTitle++;
816 }
817 }
818 else
819 {
820 inURL = find(words[number], lowerURL.substr(7));
821 inTitle = find(words[number], lowerTitle, occurrencesTitle);
822 inText = find(words[number], lowerText, occurrencesText);
823 }
824
825 inURLz.push_back(inURL);
826 inTitlez.push_back(inTitle);
827 inTextz.push_back(inText);
828 }
829
830 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
831 {
832 inURL += inURLz[number0];
833 }
834
835 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
836 {
837 inTitle += inTitlez[number1];
838 }
839
840 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
841 {
842 inText += inTextz[number2];
843 }
844
845 inURLs.push_back(inURL);
846 inTitles.push_back(inTitle);
847 inTexts.push_back(inText);
848
849 inURLz.clear();
850 inTitlez.clear();
851 inTextz.clear();
852 words.clear();
853 }
854
855 unsigned inURL = evaluate(inURLs);
856 unsigned inTitle = evaluate(inTitles);
857 unsigned inText = evaluate(inTexts);
858
859 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
860 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
861 inText : 0;
862 }
863
864 unsigned Ranker::find(string word, const string& where)
865 {
866 unsigned value = 0;
867
868 decrap(word);
869
870 if (word == "")
871 {
872 // this can happen if a word is all crap characters
873 value++;
874 }
875 else if (word.find_first_of(" \n ") == string::npos)
876 {
877 unsigned begin = 0, found;
878 do
879 {
880 found = where.find(word, begin);
881
882 if (found != string::npos)
883 {
884 bool isBefore, isAfter, before = false, after = false;
885 isBefore = found - 1 > 0;
886 isAfter = found + word.length() < where.length();
887
888 if (isBefore) before = isalnum(where[found - 1]) != 0;
889 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
890
891 if (!before && !after)
892 {
893 value++;
894 }
895 }
896
897 begin = found + word.length();
898 }
899 while (found != string::npos && begin < where.length());
900 }
901 else
902 {
903 value = phrase(word, where);
904 }
905
906 return value;
907 }
908
909 unsigned Ranker::find(string word, const string& where, map<unsigned,
910 unsigned>& occurrences)
911 {
912 unsigned value = 0;
913
914 decrap(word);
915
916 if (word == "")
917 {
918 // this can happen if a word is all crap characters
919 value++;
920 }
921 else if (word.find_first_of(" \n ") == string::npos)
922 {
923 unsigned begin = 0, found;
924 do
925 {
926 found = where.find(word, begin);
927
928 if (found != string::npos)
929 {
930 bool isBefore, isAfter, before = false, after = false;
931 isBefore = found - 1 > 0;
932 isAfter = found + word.length() < where.length();
933
934 if (isBefore) before = isalnum(where[found - 1]) != 0;
935 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
936
937 if (!before && !after)
938 {
939 value++;
940
941 occurrences.insert(pair<unsigned, unsigned>(found,
942 word.length()));
943 }
944 }
945
946 begin = found + word.length();
947 }
948 while (found != string::npos && begin < where.length());
949 }
950 else
951 {
952 value = phrase(word, where, occurrences);
953 }
954
955 return value;
956 }
957
958 unsigned Ranker::phrase(const string& phrase, const string& where)
959 {
960 unsigned value = 0;
961 vector<string> words;
962
963 unsigned begin = 0, space;
964 do
965 {
966 space = phrase.find(' ', begin);
967
968 words.push_back(phrase.substr(begin, space - begin));
969
970 begin = space + 1;
971 }
972 while (space != string::npos && begin < phrase.length());
973
974 begin = 0;
975 unsigned counter = 0;
976 do
977 {
978 value += this->phrase(words, 0, begin, true, where);
979 }
980 while (begin < where.length());
981
982 return value;
983 }
984
985 unsigned Ranker::phrase(const string& phrase, const string& where,
986 map<unsigned, unsigned>& occurrences)
987 {
988 unsigned value = 0;
989 vector<string> words;
990
991 unsigned begin = 0, space;
992 do
993 {
994 space = phrase.find(' ', begin);
995
996 words.push_back(phrase.substr(begin, space - begin));
997
998 begin = space + 1;
999 }
1000 while (space != string::npos && begin < phrase.length());
1001
1002 begin = 0;
1003 do
1004 {
1005 value += this->phrase(words, 0, begin, true, where, occurrences);
1006 }
1007 while (begin < where.length());
1008
1009 return value;
1010 }
1011
1012 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1013 begin, bool start, const string& where)
1014 {
1015 unsigned value = 0;
1016 bool end = !(word + 1 < words.size());
1017 unsigned found = where.find(words[word], begin);
1018 unsigned newBegin = found + words[word].length();
1019
1020 if (found != string::npos)
1021 {
1022 bool isBefore, isAfter, before = false, after = false;
1023 isBefore = found - 1 > 0;
1024 isAfter = found + words[word].length() < where.length();
1025
1026 if (isBefore) before = isalnum(where[found - 1]) != 0;
1027 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1028
1029 if (!before && !after)
1030 {
1031 bool between = true;
1032 if (!start)
1033 {
1034 for (unsigned index = begin + 1; index < found - 1; index++)
1035 {
1036 if (isalnum(where[index]))
1037 {
1038 between = false;
1039 break;
1040 }
1041 }
1042 }
1043
1044 if (between)
1045 {
1046 if (end)
1047 {
1048 begin = newBegin;
1049 value = 1;
1050 }
1051 else
1052 {
1053 value = phrase(words, (word + 1), newBegin, false, where);
1054 }
1055 }
1056 }
1057 }
1058
1059 if (start)
1060 {
1061 if (found != string::npos)
1062 {
1063 begin = newBegin;
1064 }
1065 else
1066 {
1067 begin = string::npos;
1068 }
1069 }
1070
1071 return value;
1072 }
1073
1074 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1075 begin, bool start, const string& where, map<unsigned, unsigned>&
1076 occurrences)
1077 {
1078 unsigned value = 0;
1079 bool end = !(word + 1 < words.size());
1080 unsigned found = where.find(words[word], begin);
1081 unsigned newBegin = found + words[word].length();
1082
1083 if (found != string::npos)
1084 {
1085 bool isBefore, isAfter, before = false, after = false;
1086 isBefore = found - 1 > 0;
1087 isAfter = found + words[word].length() < where.length();
1088
1089 if (isBefore) before = isalnum(where[found - 1]) != 0;
1090 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1091
1092 if (!before && !after)
1093 {
1094 bool between = true;
1095 if (!start)
1096 {
1097 for (unsigned index = begin + 1; index < found - 1; index++)
1098 {
1099 if (isalnum(where[index]))
1100 {
1101 between = false;
1102 break;
1103 }
1104 }
1105 }
1106
1107 if (between)
1108 {
1109 occurrences.insert(pair<unsigned, unsigned>(found,
1110 words[word].length()));
1111
1112 if (end)
1113 {
1114 begin = newBegin;
1115 value = 1;
1116 }
1117 else
1118 {
1119 value = phrase(words, (word + 1), newBegin, false, where,
1120 occurrences);
1121 }
1122 }
1123 }
1124 }
1125
1126 if (start)
1127 {
1128 if (found != string::npos)
1129 {
1130 begin = newBegin;
1131 }
1132 else
1133 {
1134 begin = string::npos;
1135 }
1136 }
1137
1138 return value;
1139 }
1140
1141 unsigned Ranker::evaluate(vector<unsigned>& ins)
1142 {
1143 unsigned in = 0;
1144
1145 for (unsigned index = 0; index < ins.size(); index++)
1146 {
1147 if (ins[index] > 0)
1148 {
1149 in += ins[index];
1150 }
1151 else
1152 {
1153 in = 0;
1154 break;
1155 }
1156 }
1157
1158 return in;
1159 }
1160
1161 void Ranker::decrap(string& crap)
1162 {
1163 unsigned begin = 0, found;
1164 do
1165 {
1166 // &, _, +, and # are not considered crap
1167 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1168
1169 if (found != string::npos)
1170 {
1171 crap[found] = ' ';
1172 }
1173
1174 begin = found + 1;
1175 }
1176 while (found != string::npos && begin < crap.length());
1177
1178 normalize(crap);
1179 }

Properties

Name Value
svn:eol-style native
svn:keywords Id