ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 341
Committed: 2004-04-20T13:20:56-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 27192 byte(s)
Log Message:
Fixed sample output so it's not empty ever.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Ranker.hpp"
52
53 Ranker::Ranker(Page& page) : Page(page)
54 {
55 value = 0;
56 requiredValue = 0;
57 excludedValue = 0;
58 eitherOrValue = 0;
59 allIn = all;
60 }
61
62 void Ranker::rank(vector<string> query)
63 {
64 vector<string> prep;
65
66 for (unsigned index = 0; index < query.size(); index++)
67 {
68 if (query[index] == "allintitle:" && index == 0)
69 {
70 allIn = title;
71 }
72 else if (query[index] == "allinurl:" && index == 0)
73 {
74 allIn = url;
75 }
76 else if (query[index] == "allintext:" && index == 0)
77 {
78 allIn = text;
79 }
80 else if (query[index].find("site:") == 0 && query[index].size() > 5)
81 {
82 site = query[index].substr(5);
83 }
84 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
85 {
86 prep.push_back("TITLE " + query[index].substr(8));
87 }
88 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
89 {
90 prep.push_back("URL " + query[index].substr(6));
91 }
92 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
93 {
94 prep.push_back("TEXT " + query[index].substr(7));
95 }
96 else
97 {
98 prep.push_back(query[index]);
99 }
100 }
101
102 if (prep.size() > 0)
103 {
104 bool or_ = false;
105 for (unsigned index = 0; index < prep.size(); index++)
106 {
107 bool exclude = false;
108 if (prep[index].find('+') == 0)
109 {
110 prep[index].erase(0, 1);
111 }
112 else if (prep[index].find('-') == 0)
113 {
114 exclude = true;
115 prep[index].erase(0, 1);
116 }
117
118 if (or_)
119 {
120 if (prep[index].find(" OR") == string::npos)
121 {
122 or_ = false;
123 }
124
125 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
126 }
127 else if (exclude)
128 {
129 excluded.push_back(prep[index]);
130 }
131 else if (prep[index].find(" OR") != string::npos)
132 {
133 or_ = true;
134 eitherOr.push_back(prep[index]);
135 }
136 else
137 {
138 required.push_back(prep[index]);
139 }
140 }
141 }
142
143 rank();
144 }
145
146 void Ranker::setSample()
147 {
148 map<unsigned, unsigned>::iterator itor;
149 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
150
151 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
152 {
153 unsigned distance;
154
155 if (++itor != occurrencesText.end())
156 {
157 unsigned next = itor->first;
158
159 itor--;
160 distance = next - (itor->first + itor->second);
161 }
162 else
163 {
164 distance = string::npos;
165 itor--;
166 }
167
168 distances.insert(pair<unsigned, map<unsigned,
169 unsigned>::iterator>(distance, itor));
170 }
171
172 if (distances.begin() != distances.end())
173 {
174 itor = distances.begin()->second;
175 }
176
177 string portion;
178 unsigned sampleLength = 0, begin = 0, end = string::npos;
179
180 while (sampleLength < sampleMax && itor != occurrencesText.end())
181 {
182 unsigned found = itor->first, length = itor->second;
183
184 for (unsigned index = found; index > begin; index--)
185 {
186 if (found - index >= sampleMax - sampleLength - length)
187 {
188 for (; index < found; index++)
189 {
190 if (isspace(getText()[index])) break;
191 }
192
193 begin = index + 1;
194
195 break;
196 }
197 else if ((index > begin ? (isupper(getText()[index]) &&
198 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
199 index != found)
200 {
201 begin = index;
202
203 break;
204 }
205 }
206
207 if (end + 1 != begin) sample += " <strong>...</strong> ";
208
209 portion = getText().substr(begin, found - begin);
210 sampleLength += portion.length();
211
212 entities(portion, '&', "&amp;");
213 entities(portion, '\"', "&quot;");
214 entities(portion, '<', "&lt;");
215 entities(portion, '>', "&gt;");
216
217 sample += portion + "<strong>";
218 portion = getText().substr(found, length);
219 sampleLength += portion.length();
220
221 entities(portion, '&', "&amp;");
222 entities(portion, '\"', "&quot;");
223 entities(portion, '<', "&lt;");
224 entities(portion, '>', "&gt;");
225
226 sample += portion + "</strong>";
227 begin = found + length;
228 end = begin - 1;
229
230 if (++itor != occurrencesText.end())
231 {
232 if (itor->first + itor->second < begin + sampleMax - sampleLength)
233 {
234 portion = getText().substr(begin, itor->first - begin);
235 sampleLength += portion.length();
236
237 entities(portion, '&', "&amp;");
238 entities(portion, '\"', "&quot;");
239 entities(portion, '<', "&lt;");
240 entities(portion, '>', "&gt;");
241
242 sample += portion;
243 begin = itor->first;
244 end = begin - 1;
245 }
246 else
247 {
248 for (end = begin + sampleMax - sampleLength; end > begin;
249 end--)
250 {
251 if (isspace(getText()[end])) break;
252 }
253
254 portion = getText().substr(begin, end - begin + 1);
255 sampleLength += portion.length();
256
257 entities(portion, '&', "&amp;");
258 entities(portion, '\"', "&quot;");
259 entities(portion, '<', "&lt;");
260 entities(portion, '>', "&gt;");
261
262 sample += portion + " <strong>...</strong>";
263
264 break;
265 }
266 }
267 else
268 {
269 for (end = begin + sampleMax - sampleLength; end > begin && (end +
270 1 < getText().length()); end--)
271 {
272 if (isspace(getText()[end])) break;
273 }
274
275 if (end >= getText().length()) end = getText().length() - 1;
276
277 portion = getText().substr(begin, end - begin + 1);
278 sampleLength += portion.length();
279
280 entities(portion, '&', "&amp;");
281 entities(portion, '\"', "&quot;");
282 entities(portion, '<', "&lt;");
283 entities(portion, '>', "&gt;");
284
285 sample += portion;
286
287 if (end + 1 < getText().length())
288 {
289 sample += " <strong>...</strong>";
290 }
291
292 break;
293 }
294 }
295
296 if (sample == "")
297 {
298 for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--)
299 {
300 if (isspace(getText()[end])) break;
301 }
302
303 sample = getText().substr(0, end + 1);
304
305 entities(sample, '&', "&amp;");
306 entities(sample, '\"', "&quot;");
307 entities(sample, '<', "&lt;");
308 entities(sample, '>', "&gt;");
309
310 if (end + 1 < getText().length())
311 {
312 sample += " <strong>...</strong>";
313 }
314 }
315 }
316
317 string Ranker::getTitle()
318 {
319 string title, portion;
320
321 unsigned begin = 0;
322 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
323 itor != occurrencesTitle.end(); itor++)
324 {
325 unsigned found = itor->first;
326 unsigned length = itor->second;
327
328 portion = Page::getTitle().substr(begin, found - begin);
329
330 entities(portion, '&', "&amp;");
331 entities(portion, '\"', "&quot;");
332 entities(portion, '<', "&lt;");
333 entities(portion, '>', "&gt;");
334
335 title += portion + "<strong>";
336
337 portion = Page::getTitle().substr(found, length);
338
339 entities(portion, '&', "&amp;");
340 entities(portion, '\"', "&quot;");
341 entities(portion, '<', "&lt;");
342 entities(portion, '>', "&gt;");
343
344 title += portion + "</strong>";
345
346 begin = found + length;
347 }
348
349 portion = Page::getTitle().substr(begin);
350
351 entities(portion, '&', "&amp;");
352 entities(portion, '\"', "&quot;");
353 entities(portion, '<', "&lt;");
354 entities(portion, '>', "&gt;");
355
356 title += portion;
357
358 return title;
359 }
360
361 string Ranker::getDescription()
362 {
363 string description, portion;
364
365 unsigned begin = 0;
366 for (map<unsigned, unsigned>::iterator itor =
367 occurrencesDescription.begin(); itor != occurrencesDescription.end();
368 itor++)
369 {
370 unsigned found = itor->first;
371 unsigned length = itor->second;
372
373 portion = Page::getDescription().substr(begin, found - begin);
374
375 entities(portion, '&', "&amp;");
376 entities(portion, '\"', "&quot;");
377 entities(portion, '<', "&lt;");
378 entities(portion, '>', "&gt;");
379
380 description += portion + "<strong>";
381
382 portion = Page::getDescription().substr(found, length);
383
384 entities(portion, '&', "&amp;");
385 entities(portion, '\"', "&quot;");
386 entities(portion, '<', "&lt;");
387 entities(portion, '>', "&gt;");
388
389 description += portion + "</strong>";
390
391 begin = found + length;
392 }
393
394 portion = Page::getDescription().substr(begin);
395
396 entities(portion, '&', "&amp;");
397 entities(portion, '\"', "&quot;");
398 entities(portion, '<', "&lt;");
399 entities(portion, '>', "&gt;");
400
401 description += portion;
402
403 return description;
404 }
405
406 bool Ranker::operator==(const unsigned number) const
407 {
408 return value == number;
409 }
410
411 bool Ranker::operator==(const Ranker& ranker) const
412 {
413 return value == ranker.value;
414 }
415
416 bool Ranker::operator!=(const unsigned number) const
417 {
418 return value != number;
419 }
420
421 bool Ranker::operator!=(const Ranker& ranker) const
422 {
423 return value != ranker.value;
424 }
425
426 bool Ranker::operator<(const unsigned number) const
427 {
428 return value < number;
429 }
430
431 bool Ranker::operator<(const Ranker& ranker) const
432 {
433 return value < ranker.value;
434 }
435
436 bool Ranker::operator>(const unsigned number) const
437 {
438 return value > number;
439 }
440
441 bool Ranker::operator >(const Ranker& ranker) const
442 {
443 return value > ranker.value;
444 }
445
446 void Ranker::rank()
447 {
448 lowerAddress = tolower(getAddress());
449
450 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
451 site.length())
452 {
453 bool isRequired = required.size() > 0;
454 bool isExcluded = excluded.size() > 0;
455 bool isEitherOr = eitherOr.size() > 0;
456
457 lowerURL = tolower(getURL());
458 lowerTitle = tolower(Page::getTitle());
459 lowerText = tolower(Page::getText());
460
461 if (isRequired) checkRequired();
462 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
463 if (isEitherOr) checkEitherOr();
464
465 if (isRequired && isExcluded && isEitherOr)
466 {
467 value += requiredValue && !excludedValue && eitherOrValue ?
468 requiredValue + eitherOrValue : 0;
469 }
470 else if (isRequired && isExcluded)
471 {
472 value += requiredValue && !excludedValue ? requiredValue : 0;
473 }
474 else if (isRequired && isEitherOr)
475 {
476 value += requiredValue && eitherOrValue ? requiredValue +
477 eitherOrValue : 0;
478 }
479 else if (isExcluded && isEitherOr)
480 {
481 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
482 }
483 else if (isRequired)
484 {
485 value += requiredValue;
486 }
487 else if (isEitherOr)
488 {
489 value += eitherOrValue;
490 }
491 else
492 {
493 // do nothing this is a bad search and warrants no results
494 }
495
496 if (value > 0)
497 {
498 string lowerDescription = tolower(Page::getDescription());
499
500 for (unsigned index = 0; index < required.size(); index++)
501 {
502 if (required[index].find("URL ") == 0)
503 {
504 value += find(required[index].substr(4), lowerDescription,
505 occurrencesDescription);
506 }
507 else if (required[index].find("TITLE ") == 0)
508 {
509 value += find(required[index].substr(6), lowerDescription,
510 occurrencesDescription);
511 }
512 else if (required[index].find("TEXT ") == 0)
513 {
514 value += find(required[index].substr(5), lowerDescription,
515 occurrencesDescription);
516 }
517 else
518 {
519 value += find(required[index], lowerDescription,
520 occurrencesDescription);
521 }
522 }
523
524 for (unsigned index1 = 0; index1 < eitherOr.size(); index1++)
525 {
526 vector<string> words;
527
528 unsigned begin = 0, found;
529 do
530 {
531 found = eitherOr[index1].find(" OR ", begin);
532
533 if (found != string::npos)
534 {
535 words.push_back(eitherOr[index1].substr(begin, found -
536 begin));
537 }
538 else
539 {
540 words.push_back(eitherOr[index1].substr(begin));
541 }
542
543 begin = found + 4;
544 }
545 while (begin < eitherOr[index1].length() && found !=
546 string::npos);
547
548 for (unsigned number = 0; number < words.size(); number++)
549 {
550 if (words[index1].find("URL ") == 0)
551 {
552 value += find(words[index1].substr(4),
553 lowerDescription, occurrencesDescription);
554 }
555 else if (words[index1].find("TITLE ") == 0)
556 {
557 value += find(words[index1].substr(6),
558 lowerDescription, occurrencesDescription);
559 }
560 else if (words[index1].find("TEXT ") == 0)
561 {
562 value += find(words[index1].substr(5),
563 lowerDescription, occurrencesDescription);
564 }
565 else
566 {
567 value += find(words[index1], lowerDescription,
568 occurrencesDescription);
569 }
570 }
571 }
572
573 for (unsigned index2 = 0; index2 < getHeadings().size(); index2++)
574 {
575 string lowerHeading = string(getHeadings()[index2].length(),
576 ' ');
577 for (unsigned number = 0; number <
578 getHeadings()[index2].length(); number++)
579 {
580 lowerHeading[number] = tolower(
581 getHeadings()[index2][number]);
582 }
583
584 for (unsigned number0 = 0; number0 < required.size(); number0++)
585 {
586 if (required[number0].find("URL ") == 0)
587 {
588 value += find(required[number0].substr(4),
589 lowerHeading);
590 }
591 else if (required[number0].find("TITLE ") == 0)
592 {
593 value += find(required[number0].substr(6),
594 lowerHeading);
595 }
596 else if (required[number0].find("TEXT ") == 0)
597 {
598 value += find(required[number0].substr(5),
599 lowerHeading);
600 }
601 else
602 {
603 value += find(required[number0], lowerHeading);
604 }
605 }
606
607 for (unsigned number1 = 0; number1 < eitherOr.size(); number1++)
608 {
609 vector<string> words;
610
611 unsigned begin = 0, found;
612 do
613 {
614 found = eitherOr[number1].find(" OR ", begin);
615
616 if (found != string::npos)
617 {
618 words.push_back(eitherOr[number1].substr(begin,
619 found - begin));
620 }
621 else
622 {
623 words.push_back(eitherOr[number1].substr(begin));
624 }
625
626 begin = found + 4;
627 }
628 while (begin < eitherOr[number1].length() && found !=
629 string::npos);
630
631 for (unsigned number = 0; number < words.size(); number++)
632 {
633 if (words[number].find("URL ") == 0)
634 {
635 value += find(words[number].substr(4),
636 lowerHeading);
637 }
638 else if (words[number].find("TITLE ") == 0)
639 {
640 value += find(words[number].substr(6),
641 lowerHeading);
642 }
643 else if (words[number].find("TEXT ") == 0)
644 {
645 value += find(words[number].substr(5),
646 lowerHeading);
647 }
648 else
649 {
650 value += find(words[number], lowerHeading);
651 }
652 }
653 }
654 }
655 }
656 }
657 }
658
659 void Ranker::checkRequired()
660 {
661 vector<unsigned> inURLs, inTitles, inTexts;
662
663 for (unsigned index = 0; index < required.size(); index++)
664 {
665 unsigned inURL = 0, inTitle = 0, inText = 0;
666
667 if (required[index].find("URL ") == 0)
668 {
669 inURL = find(required[index].substr(4), lowerURL.substr(7));
670
671 if (inURL)
672 {
673 inTitle = find(required[index].substr(4), lowerTitle,
674 occurrencesTitle);
675 inText = find(required[index].substr(4), lowerText,
676 occurrencesText);
677
678 if (!inTitle) inTitle++;
679 if (!inText) inText++;
680 }
681 }
682 else if (required[index].find("TITLE ") == 0)
683 {
684 inTitle = find(required[index].substr(6), lowerTitle,
685 occurrencesTitle);
686
687 if (inTitle)
688 {
689 inURL = find(required[index].substr(6), lowerURL.substr(7));
690 inText = find(required[index].substr(6), lowerText,
691 occurrencesText);
692
693 if (!inURL) inURL++;
694 if (!inText) inText++;
695 }
696 }
697 else if (required[index].find("TEXT ") == 0)
698 {
699 inText = find(required[index].substr(5), lowerText,
700 occurrencesText);
701
702 if (inText)
703 {
704 inURL = find(required[index].substr(5), lowerURL.substr(7));
705 inTitle = find(required[index].substr(5), lowerTitle,
706 occurrencesTitle);
707
708 if (!inURL) inURL++;
709 if (!inTitle) inTitle++;
710 }
711 }
712 else
713 {
714 inURL = find(required[index], lowerURL.substr(7));
715 inTitle = find(required[index], lowerTitle, occurrencesTitle);
716 inText = find(required[index], lowerText, occurrencesText);
717 }
718
719 inURLs.push_back(inURL);
720 inTitles.push_back(inTitle);
721 inTexts.push_back(inText);
722 }
723
724 unsigned inURL = evaluate(inURLs);
725 unsigned inTitle = evaluate(inTitles);
726 unsigned inText = evaluate(inTexts);
727
728 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
729 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
730 inText : 0;
731 }
732
733 void Ranker::checkExcluded()
734 {
735 vector<unsigned> inURLs, inTitles, inTexts;
736
737 for (unsigned index = 0; index < excluded.size(); index++)
738 {
739 unsigned inURL = 0, inTitle = 0, inText = 0;
740
741 inURL = find(excluded[index], lowerURL.substr(7));
742 inTitle = find(excluded[index], lowerTitle);
743 inText = find(excluded[index], lowerText);
744
745 inURLs.push_back(inURL);
746 inTitles.push_back(inTitle);
747 inTexts.push_back(inText);
748 }
749
750 unsigned inURL = evaluate(inURLs);
751 unsigned inTitle = evaluate(inTitles);
752 unsigned inText = evaluate(inTexts);
753
754 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
755 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
756 inText : 0;
757 }
758
759 void Ranker::checkEitherOr()
760 {
761 vector<unsigned> inURLs, inTitles, inTexts;
762
763 for (unsigned index = 0; index < eitherOr.size(); index++)
764 {
765 vector<unsigned> inURLz, inTitlez, inTextz;
766 unsigned inURL = 0, inTitle = 0, inText = 0;
767 vector<string> words;
768
769 unsigned begin = 0, found;
770 do
771 {
772 found = eitherOr[index].find(" OR ", begin);
773
774 if (found != string::npos)
775 {
776 words.push_back(eitherOr[index].substr(begin, found - begin));
777 }
778 else
779 {
780 words.push_back(eitherOr[index].substr(begin));
781 }
782
783 begin = found + 4;
784 }
785 while (begin < eitherOr[index].length() && found != string::npos);
786
787 for (unsigned number = 0; number < words.size(); number++)
788 {
789 unsigned inURL = 0, inTitle = 0, inText = 0;
790
791 if (words[number].find("URL ") == 0)
792 {
793 inURL = find(words[number].substr(4), lowerURL.substr(7));
794
795 if (inURL)
796 {
797 inTitle = find(words[number].substr(4), lowerTitle,
798 occurrencesTitle);
799 inText = find(words[number].substr(4), lowerText,
800 occurrencesText);
801
802 if (!inTitle) inTitle++;
803 if (!inText) inText++;
804 }
805 }
806 else if (words[number].find("TITLE ") == 0)
807 {
808 inTitle = find(words[number].substr(6), lowerTitle,
809 occurrencesTitle);
810
811 if (inTitle)
812 {
813 inURL = find(words[number].substr(6), lowerURL.substr(7));
814 inText = find(words[number].substr(6), lowerText,
815 occurrencesText);
816
817 if (!inURL) inURL++;
818 if (!inText) inText++;
819 }
820 }
821 else if (words[number].find("TEXT ") == 0)
822 {
823 inText = find(words[number].substr(5), lowerText,
824 occurrencesText);
825
826 if (inText)
827 {
828 inURL = find(words[number].substr(5), lowerURL.substr(7));
829 inTitle = find(words[number].substr(5), lowerTitle,
830 occurrencesTitle);
831
832 if (!inURL) inURL++;
833 if (!inTitle) inTitle++;
834 }
835 }
836 else
837 {
838 inURL = find(words[number], lowerURL.substr(7));
839 inTitle = find(words[number], lowerTitle, occurrencesTitle);
840 inText = find(words[number], lowerText, occurrencesText);
841 }
842
843 inURLz.push_back(inURL);
844 inTitlez.push_back(inTitle);
845 inTextz.push_back(inText);
846 }
847
848 for (unsigned number0 = 0; number0 < inURLz.size(); number0++)
849 {
850 inURL += inURLz[number0];
851 }
852
853 for (unsigned number1 = 0; number1 < inTitlez.size(); number1++)
854 {
855 inTitle += inTitlez[number1];
856 }
857
858 for (unsigned number2 = 0; number2 < inTextz.size(); number2++)
859 {
860 inText += inTextz[number2];
861 }
862
863 inURLs.push_back(inURL);
864 inTitles.push_back(inTitle);
865 inTexts.push_back(inText);
866
867 inURLz.clear();
868 inTitlez.clear();
869 inTextz.clear();
870 words.clear();
871 }
872
873 unsigned inURL = evaluate(inURLs);
874 unsigned inTitle = evaluate(inTitles);
875 unsigned inText = evaluate(inTexts);
876
877 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
878 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
879 inText : 0;
880 }
881
882 unsigned Ranker::find(string word, const string& where)
883 {
884 unsigned value = 0;
885
886 decrap(word);
887
888 if (word == "")
889 {
890 // this can happen if a word is all crap characters
891 value++;
892 }
893 else if (word.find_first_of(" \n ") == string::npos)
894 {
895 unsigned begin = 0, found;
896 do
897 {
898 found = where.find(word, begin);
899
900 if (found != string::npos)
901 {
902 bool isBefore, isAfter, before = false, after = false;
903 isBefore = found - 1 > 0;
904 isAfter = found + word.length() < where.length();
905
906 if (isBefore) before = isalnum(where[found - 1]) != 0;
907 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
908
909 if (!before && !after)
910 {
911 value++;
912 }
913 }
914
915 begin = found + word.length();
916 }
917 while (found != string::npos && begin < where.length());
918 }
919 else
920 {
921 value = phrase(word, where);
922 }
923
924 return value;
925 }
926
927 unsigned Ranker::find(string word, const string& where, map<unsigned,
928 unsigned>& occurrences)
929 {
930 unsigned value = 0;
931
932 decrap(word);
933
934 if (word == "")
935 {
936 // this can happen if a word is all crap characters
937 value++;
938 }
939 else if (word.find_first_of(" \n ") == string::npos)
940 {
941 unsigned begin = 0, found;
942 do
943 {
944 found = where.find(word, begin);
945
946 if (found != string::npos)
947 {
948 bool isBefore, isAfter, before = false, after = false;
949 isBefore = found - 1 > 0;
950 isAfter = found + word.length() < where.length();
951
952 if (isBefore) before = isalnum(where[found - 1]) != 0;
953 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
954
955 if (!before && !after)
956 {
957 value++;
958
959 occurrences.insert(pair<unsigned, unsigned>(found,
960 word.length()));
961 }
962 }
963
964 begin = found + word.length();
965 }
966 while (found != string::npos && begin < where.length());
967 }
968 else
969 {
970 value = phrase(word, where, occurrences);
971 }
972
973 return value;
974 }
975
976 unsigned Ranker::phrase(const string& phrase, const string& where)
977 {
978 unsigned value = 0;
979 vector<string> words;
980
981 unsigned begin = 0, space;
982 do
983 {
984 space = phrase.find(' ', begin);
985
986 words.push_back(phrase.substr(begin, space - begin));
987
988 begin = space + 1;
989 }
990 while (space != string::npos && begin < phrase.length());
991
992 begin = 0;
993 unsigned counter = 0;
994 do
995 {
996 value += this->phrase(words, 0, begin, true, where);
997 }
998 while (begin < where.length());
999
1000 return value;
1001 }
1002
1003 unsigned Ranker::phrase(const string& phrase, const string& where,
1004 map<unsigned, unsigned>& occurrences)
1005 {
1006 unsigned value = 0;
1007 vector<string> words;
1008
1009 unsigned begin = 0, space;
1010 do
1011 {
1012 space = phrase.find(' ', begin);
1013
1014 words.push_back(phrase.substr(begin, space - begin));
1015
1016 begin = space + 1;
1017 }
1018 while (space != string::npos && begin < phrase.length());
1019
1020 begin = 0;
1021 do
1022 {
1023 value += this->phrase(words, 0, begin, true, where, occurrences);
1024 }
1025 while (begin < where.length());
1026
1027 return value;
1028 }
1029
1030 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1031 begin, bool start, const string& where)
1032 {
1033 unsigned value = 0;
1034 bool end = !(word + 1 < words.size());
1035 unsigned found = where.find(words[word], begin);
1036 unsigned newBegin = found + words[word].length();
1037
1038 if (found != string::npos)
1039 {
1040 bool isBefore, isAfter, before = false, after = false;
1041 isBefore = found - 1 > 0;
1042 isAfter = found + words[word].length() < where.length();
1043
1044 if (isBefore) before = isalnum(where[found - 1]) != 0;
1045 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1046
1047 if (!before && !after)
1048 {
1049 bool between = true;
1050 if (!start)
1051 {
1052 for (unsigned index = begin + 1; index < found - 1; index++)
1053 {
1054 if (isalnum(where[index]))
1055 {
1056 between = false;
1057 break;
1058 }
1059 }
1060 }
1061
1062 if (between)
1063 {
1064 if (end)
1065 {
1066 begin = newBegin;
1067 value = 1;
1068 }
1069 else
1070 {
1071 value = phrase(words, (word + 1), newBegin, false, where);
1072 }
1073 }
1074 }
1075 }
1076
1077 if (start)
1078 {
1079 if (found != string::npos)
1080 {
1081 begin = newBegin;
1082 }
1083 else
1084 {
1085 begin = string::npos;
1086 }
1087 }
1088
1089 return value;
1090 }
1091
1092 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1093 begin, bool start, const string& where, map<unsigned, unsigned>&
1094 occurrences)
1095 {
1096 unsigned value = 0;
1097 bool end = !(word + 1 < words.size());
1098 unsigned found = where.find(words[word], begin);
1099 unsigned newBegin = found + words[word].length();
1100
1101 if (found != string::npos)
1102 {
1103 bool isBefore, isAfter, before = false, after = false;
1104 isBefore = found - 1 > 0;
1105 isAfter = found + words[word].length() < where.length();
1106
1107 if (isBefore) before = isalnum(where[found - 1]) != 0;
1108 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1109
1110 if (!before && !after)
1111 {
1112 bool between = true;
1113 if (!start)
1114 {
1115 for (unsigned index = begin + 1; index < found - 1; index++)
1116 {
1117 if (isalnum(where[index]))
1118 {
1119 between = false;
1120 break;
1121 }
1122 }
1123 }
1124
1125 if (between)
1126 {
1127 occurrences.insert(pair<unsigned, unsigned>(found,
1128 words[word].length()));
1129
1130 if (end)
1131 {
1132 begin = newBegin;
1133 value = 1;
1134 }
1135 else
1136 {
1137 value = phrase(words, (word + 1), newBegin, false, where,
1138 occurrences);
1139 }
1140 }
1141 }
1142 }
1143
1144 if (start)
1145 {
1146 if (found != string::npos)
1147 {
1148 begin = newBegin;
1149 }
1150 else
1151 {
1152 begin = string::npos;
1153 }
1154 }
1155
1156 return value;
1157 }
1158
1159 unsigned Ranker::evaluate(vector<unsigned>& ins)
1160 {
1161 unsigned in = 0;
1162
1163 for (unsigned index = 0; index < ins.size(); index++)
1164 {
1165 if (ins[index] > 0)
1166 {
1167 in += ins[index];
1168 }
1169 else
1170 {
1171 in = 0;
1172 break;
1173 }
1174 }
1175
1176 return in;
1177 }
1178
1179 void Ranker::decrap(string& crap)
1180 {
1181 unsigned begin = 0, found;
1182 do
1183 {
1184 // &, _, +, and # are not considered crap
1185 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1186
1187 if (found != string::npos)
1188 {
1189 crap[found] = ' ';
1190 }
1191
1192 begin = found + 1;
1193 }
1194 while (found != string::npos && begin < crap.length());
1195
1196 normalize(crap);
1197 }

Properties

Name Value
svn:eol-style native
svn:keywords Id