ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 342
Committed: 2004-04-20T13:35:05-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 27261 byte(s)
Log Message:
Well, almost never; now it puts an ellipsis if it's empty.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Ranker.hpp"
52
53 Ranker::Ranker(Page& page) : Page(page)
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 void Ranker::rank(vector<string> query)
63 {
64 vector<string> prep;
65
66 for (unsigned index = 0; index < query.size(); index++)
67 {
68 if (query[index] == "allintitle:" && index == 0)
69 {
70 allIn = title;
71 }
72 else if (query[index] == "allinurl:" && index == 0)
73 {
74 allIn = url;
75 }
76 else if (query[index] == "allintext:" && index == 0)
77 {
78 allIn = text;
79 }
80 else if (query[index].find("site:") == 0 && query[index].size() > 5)
81 {
82 site = query[index].substr(5);
83 }
84 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
85 {
86 prep.push_back("TITLE " + query[index].substr(8));
87 }
88 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
89 {
90 prep.push_back("URL " + query[index].substr(6));
91 }
92 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
93 {
94 prep.push_back("TEXT " + query[index].substr(7));
95 }
96 else
97 {
98 prep.push_back(query[index]);
99 }
100 }
101
102 if (prep.size() > 0)
103 {
104 bool or_ = false;
105 for (unsigned index = 0; index < prep.size(); index++)
106 {
107 bool exclude = false;
108 if (prep[index].find('+') == 0)
109 {
110 prep[index].erase(0, 1);
111 }
112 else if (prep[index].find('-') == 0)
113 {
114 exclude = true;
115 prep[index].erase(0, 1);
116 }
117
118 if (or_)
119 {
120 if (prep[index].find(" OR") == string::npos)
121 {
122 or_ = false;
123 }
124
125 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
126 }
127 else if (exclude)
128 {
129 excluded.push_back(prep[index]);
130 }
131 else if (prep[index].find(" OR") != string::npos)
132 {
133 or_ = true;
134 eitherOr.push_back(prep[index]);
135 }
136 else
137 {
138 required.push_back(prep[index]);
139 }
140 }
141 }
142
143 rank();
144 }
145
146 void Ranker::setSample()
147 {
148 map<unsigned, unsigned>::iterator itor;
149 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
150
151 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
152 {
153 unsigned distance;
154
155 if (++itor != occurrencesText.end())
156 {
157 unsigned next = itor->first;
158
159 itor--;
160 distance = next - (itor->first + itor->second);
161 }
162 else
163 {
164 distance = string::npos;
165 itor--;
166 }
167
168 distances.insert(pair<unsigned, map<unsigned,
169 unsigned>::iterator>(distance, itor));
170 }
171
172 if (distances.begin() != distances.end())
173 {
174 itor = distances.begin()->second;
175 }
176
177 string portion;
178 unsigned sampleLength = 0, begin = 0, end = string::npos;
179
180 while (sampleLength < sampleMax && itor != occurrencesText.end())
181 {
182 unsigned found = itor->first, length = itor->second;
183
184 for (unsigned index = found; index > begin; index--)
185 {
186 if (found - index >= sampleMax - sampleLength - length)
187 {
188 for (; index < found; index++)
189 {
190 if (isspace(getText()[index])) break;
191 }
192
193 begin = index + 1;
194
195 break;
196 }
197 else if ((index > begin ? (isupper(getText()[index]) &&
198 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
199 index != found)
200 {
201 begin = index;
202
203 break;
204 }
205 }
206
207 if (end + 1 != begin) sample += " <strong>...</strong> ";
208
209 portion = getText().substr(begin, found - begin);
210 sampleLength += portion.length();
211
212 entities(portion, '&', "&amp;");
213 entities(portion, '\"', "&quot;");
214 entities(portion, '<', "&lt;");
215 entities(portion, '>', "&gt;");
216
217 sample += portion + "<strong>";
218 portion = getText().substr(found, length);
219 sampleLength += portion.length();
220
221 entities(portion, '&', "&amp;");
222 entities(portion, '\"', "&quot;");
223 entities(portion, '<', "&lt;");
224 entities(portion, '>', "&gt;");
225
226 sample += portion + "</strong>";
227 begin = found + length;
228 end = begin - 1;
229
230 if (++itor != occurrencesText.end())
231 {
232 if (itor->first + itor->second < begin + sampleMax - sampleLength)
233 {
234 portion = getText().substr(begin, itor->first - begin);
235 sampleLength += portion.length();
236
237 entities(portion, '&', "&amp;");
238 entities(portion, '\"', "&quot;");
239 entities(portion, '<', "&lt;");
240 entities(portion, '>', "&gt;");
241
242 sample += portion;
243 begin = itor->first;
244 end = begin - 1;
245 }
246 else
247 {
248 for (end = begin + sampleMax - sampleLength; end > begin;
249 end--)
250 {
251 if (isspace(getText()[end])) break;
252 }
253
254 portion = getText().substr(begin, end - begin + 1);
255 sampleLength += portion.length();
256
257 entities(portion, '&', "&amp;");
258 entities(portion, '\"', "&quot;");
259 entities(portion, '<', "&lt;");
260 entities(portion, '>', "&gt;");
261
262 sample += portion + " <strong>...</strong>";
263
264 break;
265 }
266 }
267 else
268 {
269 for (end = begin + sampleMax - sampleLength; end > begin && (end +
270 1 < getText().length()); end--)
271 {
272 if (isspace(getText()[end])) break;
273 }
274
275 if (end >= getText().length()) end = getText().length() - 1;
276
277 portion = getText().substr(begin, end - begin + 1);
278 sampleLength += portion.length();
279
280 entities(portion, '&', "&amp;");
281 entities(portion, '\"', "&quot;");
282 entities(portion, '<', "&lt;");
283 entities(portion, '>', "&gt;");
284
285 sample += portion;
286
287 if (end + 1 < getText().length())
288 {
289 sample += " <strong>...</strong>";
290 }
291
292 break;
293 }
294 }
295
296 if (sample == "")
297 {
298 for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--)
299 {
300 if (isspace(getText()[end])) break;
301 }
302
303 sample = getText().substr(0, end + 1);
304
305 entities(sample, '&', "&amp;");
306 entities(sample, '\"', "&quot;");
307 entities(sample, '<', "&lt;");
308 entities(sample, '>', "&gt;");
309
310 if (end + 1 < getText().length())
311 {
312 sample += " <strong>...</strong>";
313 }
314 else if (sample == "")
315 {
316 sample = "<strong>...</strong>";
317 }
318 }
319 }
320
321 string Ranker::getTitle()
322 {
323 string title, portion;
324
325 unsigned begin = 0;
326 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
327 itor != occurrencesTitle.end(); itor++)
328 {
329 unsigned found = itor->first;
330 unsigned length = itor->second;
331
332 portion = Page::getTitle().substr(begin, found - begin);
333
334 entities(portion, '&', "&amp;");
335 entities(portion, '\"', "&quot;");
336 entities(portion, '<', "&lt;");
337 entities(portion, '>', "&gt;");
338
339 title += portion + "<strong>";
340
341 portion = Page::getTitle().substr(found, length);
342
343 entities(portion, '&', "&amp;");
344 entities(portion, '\"', "&quot;");
345 entities(portion, '<', "&lt;");
346 entities(portion, '>', "&gt;");
347
348 title += portion + "</strong>";
349
350 begin = found + length;
351 }
352
353 portion = Page::getTitle().substr(begin);
354
355 entities(portion, '&', "&amp;");
356 entities(portion, '\"', "&quot;");
357 entities(portion, '<', "&lt;");
358 entities(portion, '>', "&gt;");
359
360 title += portion;
361
362 return title;
363 }
364
365 string Ranker::getDescription()
366 {
367 string description, portion;
368
369 unsigned begin = 0;
370 for (map<unsigned, unsigned>::iterator itor =
371 occurrencesDescription.begin(); itor != occurrencesDescription.end();
372 itor++)
373 {
374 unsigned found = itor->first;
375 unsigned length = itor->second;
376
377 portion = Page::getDescription().substr(begin, found - begin);
378
379 entities(portion, '&', "&amp;");
380 entities(portion, '\"', "&quot;");
381 entities(portion, '<', "&lt;");
382 entities(portion, '>', "&gt;");
383
384 description += portion + "<strong>";
385
386 portion = Page::getDescription().substr(found, length);
387
388 entities(portion, '&', "&amp;");
389 entities(portion, '\"', "&quot;");
390 entities(portion, '<', "&lt;");
391 entities(portion, '>', "&gt;");
392
393 description += portion + "</strong>";
394
395 begin = found + length;
396 }
397
398 portion = Page::getDescription().substr(begin);
399
400 entities(portion, '&', "&amp;");
401 entities(portion, '\"', "&quot;");
402 entities(portion, '<', "&lt;");
403 entities(portion, '>', "&gt;");
404
405 description += portion;
406
407 return description;
408 }
409
410 bool Ranker::operator==(const unsigned number) const
411 {
412 return value == number;
413 }
414
415 bool Ranker::operator==(const Ranker& ranker) const
416 {
417 return value == ranker.value;
418 }
419
420 bool Ranker::operator!=(const unsigned number) const
421 {
422 return value != number;
423 }
424
425 bool Ranker::operator!=(const Ranker& ranker) const
426 {
427 return value != ranker.value;
428 }
429
430 bool Ranker::operator<(const unsigned number) const
431 {
432 return value < number;
433 }
434
435 bool Ranker::operator<(const Ranker& ranker) const
436 {
437 return value < ranker.value;
438 }
439
440 bool Ranker::operator>(const unsigned number) const
441 {
442 return value > number;
443 }
444
445 bool Ranker::operator >(const Ranker& ranker) const
446 {
447 return value > ranker.value;
448 }
449
450 void Ranker::rank()
451 {
452 lowerAddress = tolower(getAddress());
453
454 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
455 site.length())
456 {
457 bool isRequired = required.size() > 0;
458 bool isExcluded = excluded.size() > 0;
459 bool isEitherOr = eitherOr.size() > 0;
460
461 lowerURL = tolower(getURL());
462 lowerTitle = tolower(Page::getTitle());
463 lowerText = tolower(Page::getText());
464
465 if (isRequired) checkRequired();
466 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
467 if (isEitherOr) checkEitherOr();
468
469 if (isRequired && isExcluded && isEitherOr)
470 {
471 value += requiredValue && !excludedValue && eitherOrValue ?
472 requiredValue + eitherOrValue : 0;
473 }
474 else if (isRequired && isExcluded)
475 {
476 value += requiredValue && !excludedValue ? requiredValue : 0;
477 }
478 else if (isRequired && isEitherOr)
479 {
480 value += requiredValue && eitherOrValue ? requiredValue +
481 eitherOrValue : 0;
482 }
483 else if (isExcluded && isEitherOr)
484 {
485 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
486 }
487 else if (isRequired)
488 {
489 value += requiredValue;
490 }
491 else if (isEitherOr)
492 {
493 value += eitherOrValue;
494 }
495 else
496 {
497 // do nothing this is a bad search and warrants no results
498 }
499
500 if (value > 0)
501 {
502 string lowerDescription = tolower(Page::getDescription());
503
504 for (unsigned index = 0; index < required.size(); index++)
505 {
506 if (required[index].find("URL ") == 0)
507 {
508 value += find(required[index].substr(4), lowerDescription,
509 occurrencesDescription);
510 }
511 else if (required[index].find("TITLE ") == 0)
512 {
513 value += find(required[index].substr(6), lowerDescription,
514 occurrencesDescription);
515 }
516 else if (required[index].find("TEXT ") == 0)
517 {
518 value += find(required[index].substr(5), lowerDescription,
519 occurrencesDescription);
520 }
521 else
522 {
523 value += find(required[index], lowerDescription,
524 occurrencesDescription);
525 }
526 }
527
528 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
529 {
530 vector<string> words;
531
532 unsigned begin = 0, found;
533 do
534 {
535 found = eitherOr[index1].find(" OR ", begin);
536
537 if (found != string::npos)
538 {
539 words.push_back(eitherOr[index1].substr(begin, found -
540 begin));
541 }
542 else
543 {
544 words.push_back(eitherOr[index1].substr(begin));
545 }
546
547 begin = found + 4;
548 }
549 while (begin < eitherOr[index1].length() && found !=
550 string::npos);
551
552 for (unsigned number = 0; number < words.size(); number++)
553 {
554 if (words[index1].find("URL ") == 0)
555 {
556 value += find(words[index1].substr(4),
557 lowerDescription, occurrencesDescription);
558 }
559 else if (words[index1].find("TITLE ") == 0)
560 {
561 value += find(words[index1].substr(6),
562 lowerDescription, occurrencesDescription);
563 }
564 else if (words[index1].find("TEXT ") == 0)
565 {
566 value += find(words[index1].substr(5),
567 lowerDescription, occurrencesDescription);
568 }
569 else
570 {
571 value += find(words[index1], lowerDescription,
572 occurrencesDescription);
573 }
574 }
575 }
576
577 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
578 {
579 string lowerHeading = string(getHeadings()[index2].length(),
580 ' ');
581 for (unsigned number = 0; number <
582 getHeadings()[index2].length(); number++)
583 {
584 lowerHeading[number] = tolower(
585 getHeadings()[index2][number]);
586 }
587
588 for (unsigned number0 = 0; number0 < required.size(); number0++)
589 {
590 if (required[number0].find("URL ") == 0)
591 {
592 value += find(required[number0].substr(4),
593 lowerHeading);
594 }
595 else if (required[number0].find("TITLE ") == 0)
596 {
597 value += find(required[number0].substr(6),
598 lowerHeading);
599 }
600 else if (required[number0].find("TEXT ") == 0)
601 {
602 value += find(required[number0].substr(5),
603 lowerHeading);
604 }
605 else
606 {
607 value += find(required[number0], lowerHeading);
608 }
609 }
610
611 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
612 {
613 vector<string> words;
614
615 unsigned begin = 0, found;
616 do
617 {
618 found = eitherOr[number1].find(" OR ", begin);
619
620 if (found != string::npos)
621 {
622 words.push_back(eitherOr[number1].substr(begin,
623 found - begin));
624 }
625 else
626 {
627 words.push_back(eitherOr[number1].substr(begin));
628 }
629
630 begin = found + 4;
631 }
632 while (begin < eitherOr[number1].length() && found !=
633 string::npos);
634
635 for (unsigned number = 0; number < words.size(); number++)
636 {
637 if (words[number].find("URL ") == 0)
638 {
639 value += find(words[number].substr(4),
640 lowerHeading);
641 }
642 else if (words[number].find("TITLE ") == 0)
643 {
644 value += find(words[number].substr(6),
645 lowerHeading);
646 }
647 else if (words[number].find("TEXT ") == 0)
648 {
649 value += find(words[number].substr(5),
650 lowerHeading);
651 }
652 else
653 {
654 value += find(words[number], lowerHeading);
655 }
656 }
657 }
658 }
659 }
660 }
661 }
662
663 void Ranker::checkRequired()
664 {
665 vector<unsigned> inURLs, inTitles, inTexts;
666
667 for (unsigned index = 0; index < required.size(); index++)
668 {
669 unsigned inURL = 0, inTitle = 0, inText = 0;
670
671 if (required[index].find("URL ") == 0)
672 {
673 inURL = find(required[index].substr(4), lowerURL.substr(7));
674
675 if (inURL)
676 {
677 inTitle = find(required[index].substr(4), lowerTitle,
678 occurrencesTitle);
679 inText = find(required[index].substr(4), lowerText,
680 occurrencesText);
681
682 if (!inTitle) inTitle++;
683 if (!inText) inText++;
684 }
685 }
686 else if (required[index].find("TITLE ") == 0)
687 {
688 inTitle = find(required[index].substr(6), lowerTitle,
689 occurrencesTitle);
690
691 if (inTitle)
692 {
693 inURL = find(required[index].substr(6), lowerURL.substr(7));
694 inText = find(required[index].substr(6), lowerText,
695 occurrencesText);
696
697 if (!inURL) inURL++;
698 if (!inText) inText++;
699 }
700 }
701 else if (required[index].find("TEXT ") == 0)
702 {
703 inText = find(required[index].substr(5), lowerText,
704 occurrencesText);
705
706 if (inText)
707 {
708 inURL = find(required[index].substr(5), lowerURL.substr(7));
709 inTitle = find(required[index].substr(5), lowerTitle,
710 occurrencesTitle);
711
712 if (!inURL) inURL++;
713 if (!inTitle) inTitle++;
714 }
715 }
716 else
717 {
718 inURL = find(required[index], lowerURL.substr(7));
719 inTitle = find(required[index], lowerTitle, occurrencesTitle);
720 inText = find(required[index], lowerText, occurrencesText);
721 }
722
723 inURLs.push_back(inURL);
724 inTitles.push_back(inTitle);
725 inTexts.push_back(inText);
726 }
727
728 unsigned inURL = evaluate(inURLs);
729 unsigned inTitle = evaluate(inTitles);
730 unsigned inText = evaluate(inTexts);
731
732 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
733 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
734 inText : 0;
735 }
736
737 void Ranker::checkExcluded()
738 {
739 vector<unsigned> inURLs, inTitles, inTexts;
740
741 for (unsigned index = 0; index < excluded.size(); index++)
742 {
743 unsigned inURL = 0, inTitle = 0, inText = 0;
744
745 inURL = find(excluded[index], lowerURL.substr(7));
746 inTitle = find(excluded[index], lowerTitle);
747 inText = find(excluded[index], lowerText);
748
749 inURLs.push_back(inURL);
750 inTitles.push_back(inTitle);
751 inTexts.push_back(inText);
752 }
753
754 unsigned inURL = evaluate(inURLs);
755 unsigned inTitle = evaluate(inTitles);
756 unsigned inText = evaluate(inTexts);
757
758 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
759 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
760 inText : 0;
761 }
762
763 void Ranker::checkEitherOr()
764 {
765 vector<unsigned> inURLs, inTitles, inTexts;
766
767 for (unsigned index = 0; index < eitherOr.size(); index++)
768 {
769 vector<unsigned> inURLz, inTitlez, inTextz;
770 unsigned inURL = 0, inTitle = 0, inText = 0;
771 vector<string> words;
772
773 unsigned begin = 0, found;
774 do
775 {
776 found = eitherOr[index].find(" OR ", begin);
777
778 if (found != string::npos)
779 {
780 words.push_back(eitherOr[index].substr(begin, found - begin));
781 }
782 else
783 {
784 words.push_back(eitherOr[index].substr(begin));
785 }
786
787 begin = found + 4;
788 }
789 while (begin < eitherOr[index].length() && found != string::npos);
790
791 for (unsigned number = 0; number < words.size(); number++)
792 {
793 unsigned inURL = 0, inTitle = 0, inText = 0;
794
795 if (words[number].find("URL ") == 0)
796 {
797 inURL = find(words[number].substr(4), lowerURL.substr(7));
798
799 if (inURL)
800 {
801 inTitle = find(words[number].substr(4), lowerTitle,
802 occurrencesTitle);
803 inText = find(words[number].substr(4), lowerText,
804 occurrencesText);
805
806 if (!inTitle) inTitle++;
807 if (!inText) inText++;
808 }
809 }
810 else if (words[number].find("TITLE ") == 0)
811 {
812 inTitle = find(words[number].substr(6), lowerTitle,
813 occurrencesTitle);
814
815 if (inTitle)
816 {
817 inURL = find(words[number].substr(6), lowerURL.substr(7));
818 inText = find(words[number].substr(6), lowerText,
819 occurrencesText);
820
821 if (!inURL) inURL++;
822 if (!inText) inText++;
823 }
824 }
825 else if (words[number].find("TEXT ") == 0)
826 {
827 inText = find(words[number].substr(5), lowerText,
828 occurrencesText);
829
830 if (inText)
831 {
832 inURL = find(words[number].substr(5), lowerURL.substr(7));
833 inTitle = find(words[number].substr(5), lowerTitle,
834 occurrencesTitle);
835
836 if (!inURL) inURL++;
837 if (!inTitle) inTitle++;
838 }
839 }
840 else
841 {
842 inURL = find(words[number], lowerURL.substr(7));
843 inTitle = find(words[number], lowerTitle, occurrencesTitle);
844 inText = find(words[number], lowerText, occurrencesText);
845 }
846
847 inURLz.push_back(inURL);
848 inTitlez.push_back(inTitle);
849 inTextz.push_back(inText);
850 }
851
852 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
853 {
854 inURL += inURLz[number0];
855 }
856
857 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
858 {
859 inTitle += inTitlez[number1];
860 }
861
862 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
863 {
864 inText += inTextz[number2];
865 }
866
867 inURLs.push_back(inURL);
868 inTitles.push_back(inTitle);
869 inTexts.push_back(inText);
870
871 inURLz.clear();
872 inTitlez.clear();
873 inTextz.clear();
874 words.clear();
875 }
876
877 unsigned inURL = evaluate(inURLs);
878 unsigned inTitle = evaluate(inTitles);
879 unsigned inText = evaluate(inTexts);
880
881 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
882 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
883 inText : 0;
884 }
885
886 unsigned Ranker::find(string word, const string& where)
887 {
888 unsigned value = 0;
889
890 decrap(word);
891
892 if (word == "")
893 {
894 // this can happen if a word is all crap characters
895 value++;
896 }
897 else if (word.find_first_of(" \n ") == string::npos)
898 {
899 unsigned begin = 0, found;
900 do
901 {
902 found = where.find(word, begin);
903
904 if (found != string::npos)
905 {
906 bool isBefore, isAfter, before = false, after = false;
907 isBefore = found - 1 > 0;
908 isAfter = found + word.length() < where.length();
909
910 if (isBefore) before = isalnum(where[found - 1]) != 0;
911 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
912
913 if (!before && !after)
914 {
915 value++;
916 }
917 }
918
919 begin = found + word.length();
920 }
921 while (found != string::npos && begin < where.length());
922 }
923 else
924 {
925 value = phrase(word, where);
926 }
927
928 return value;
929 }
930
931 unsigned Ranker::find(string word, const string& where, map<unsigned,
932 unsigned>& occurrences)
933 {
934 unsigned value = 0;
935
936 decrap(word);
937
938 if (word == "")
939 {
940 // this can happen if a word is all crap characters
941 value++;
942 }
943 else if (word.find_first_of(" \n ") == string::npos)
944 {
945 unsigned begin = 0, found;
946 do
947 {
948 found = where.find(word, begin);
949
950 if (found != string::npos)
951 {
952 bool isBefore, isAfter, before = false, after = false;
953 isBefore = found - 1 > 0;
954 isAfter = found + word.length() < where.length();
955
956 if (isBefore) before = isalnum(where[found - 1]) != 0;
957 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
958
959 if (!before && !after)
960 {
961 value++;
962
963 occurrences.insert(pair<unsigned, unsigned>(found,
964 word.length()));
965 }
966 }
967
968 begin = found + word.length();
969 }
970 while (found != string::npos && begin < where.length());
971 }
972 else
973 {
974 value = phrase(word, where, occurrences);
975 }
976
977 return value;
978 }
979
980 unsigned Ranker::phrase(const string& phrase, const string& where)
981 {
982 unsigned value = 0;
983 vector<string> words;
984
985 unsigned begin = 0, space;
986 do
987 {
988 space = phrase.find(' ', begin);
989
990 words.push_back(phrase.substr(begin, space - begin));
991
992 begin = space + 1;
993 }
994 while (space != string::npos && begin < phrase.length());
995
996 begin = 0;
997 unsigned counter = 0;
998 do
999 {
1000 value += this->phrase(words, 0, begin, true, where);
1001 }
1002 while (begin < where.length());
1003
1004 return value;
1005 }
1006
1007 unsigned Ranker::phrase(const string& phrase, const string& where,
1008 map<unsigned, unsigned>& occurrences)
1009 {
1010 unsigned value = 0;
1011 vector<string> words;
1012
1013 unsigned begin = 0, space;
1014 do
1015 {
1016 space = phrase.find(' ', begin);
1017
1018 words.push_back(phrase.substr(begin, space - begin));
1019
1020 begin = space + 1;
1021 }
1022 while (space != string::npos && begin < phrase.length());
1023
1024 begin = 0;
1025 do
1026 {
1027 value += this->phrase(words, 0, begin, true, where, occurrences);
1028 }
1029 while (begin < where.length());
1030
1031 return value;
1032 }
1033
1034 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1035 begin, bool start, const string& where)
1036 {
1037 unsigned value = 0;
1038 bool end = !(word + 1 < words.size());
1039 unsigned found = where.find(words[word], begin);
1040 unsigned newBegin = found + words[word].length();
1041
1042 if (found != string::npos)
1043 {
1044 bool isBefore, isAfter, before = false, after = false;
1045 isBefore = found - 1 > 0;
1046 isAfter = found + words[word].length() < where.length();
1047
1048 if (isBefore) before = isalnum(where[found - 1]) != 0;
1049 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1050
1051 if (!before && !after)
1052 {
1053 bool between = true;
1054 if (!start)
1055 {
1056 for (unsigned index = begin + 1; index < found - 1; index++)
1057 {
1058 if (isalnum(where[index]))
1059 {
1060 between = false;
1061 break;
1062 }
1063 }
1064 }
1065
1066 if (between)
1067 {
1068 if (end)
1069 {
1070 begin = newBegin;
1071 value = 1;
1072 }
1073 else
1074 {
1075 value = phrase(words, (word + 1), newBegin, false, where);
1076 }
1077 }
1078 }
1079 }
1080
1081 if (start)
1082 {
1083 if (found != string::npos)
1084 {
1085 begin = newBegin;
1086 }
1087 else
1088 {
1089 begin = string::npos;
1090 }
1091 }
1092
1093 return value;
1094 }
1095
1096 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1097 begin, bool start, const string& where, map<unsigned, unsigned>&
1098 occurrences)
1099 {
1100 unsigned value = 0;
1101 bool end = !(word + 1 < words.size());
1102 unsigned found = where.find(words[word], begin);
1103 unsigned newBegin = found + words[word].length();
1104
1105 if (found != string::npos)
1106 {
1107 bool isBefore, isAfter, before = false, after = false;
1108 isBefore = found - 1 > 0;
1109 isAfter = found + words[word].length() < where.length();
1110
1111 if (isBefore) before = isalnum(where[found - 1]) != 0;
1112 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1113
1114 if (!before && !after)
1115 {
1116 bool between = true;
1117 if (!start)
1118 {
1119 for (unsigned index = begin + 1; index < found - 1; index++)
1120 {
1121 if (isalnum(where[index]))
1122 {
1123 between = false;
1124 break;
1125 }
1126 }
1127 }
1128
1129 if (between)
1130 {
1131 occurrences.insert(pair<unsigned, unsigned>(found,
1132 words[word].length()));
1133
1134 if (end)
1135 {
1136 begin = newBegin;
1137 value = 1;
1138 }
1139 else
1140 {
1141 value = phrase(words, (word + 1), newBegin, false, where,
1142 occurrences);
1143 }
1144 }
1145 }
1146 }
1147
1148 if (start)
1149 {
1150 if (found != string::npos)
1151 {
1152 begin = newBegin;
1153 }
1154 else
1155 {
1156 begin = string::npos;
1157 }
1158 }
1159
1160 return value;
1161 }
1162
1163 unsigned Ranker::evaluate(vector<unsigned>& ins)
1164 {
1165 unsigned in = 0;
1166
1167 for (unsigned index = 0; index < ins.size(); index++)
1168 {
1169 if (ins[index] > 0)
1170 {
1171 in += ins[index];
1172 }
1173 else
1174 {
1175 in = 0;
1176 break;
1177 }
1178 }
1179
1180 return in;
1181 }
1182
1183 void Ranker::decrap(string& crap)
1184 {
1185 unsigned begin = 0, found;
1186 do
1187 {
1188 // &, _, +, and # are not considered crap
1189 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1190
1191 if (found != string::npos)
1192 {
1193 crap[found] = ' ';
1194 }
1195
1196 begin = found + 1;
1197 }
1198 while (found != string::npos && begin < crap.length());
1199
1200 normalize(crap);
1201 }

Properties

Name Value
svn:eol-style native
svn:keywords Id