ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 349
Committed: 2004-05-27T00:18:04-07:00 (21 years ago) by Douglas Thrift
File size: 26950 byte(s)
Log Message:
Even more C++ifying!

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Ranker
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Ranker.hpp"
52
53 void Ranker::rank(vector<string> query)
54 {
55 vector<string> prep;
56
57 for (size_t index(0); index < query.size(); index++)
58 {
59 if (query[index] == "allintitle:" && index == 0)
60 {
61 allIn = title;
62 }
63 else if (query[index] == "allinurl:" && index == 0)
64 {
65 allIn = url;
66 }
67 else if (query[index] == "allintext:" && index == 0)
68 {
69 allIn = text;
70 }
71 else if (query[index].find("site:") == 0 && query[index].size() > 5)
72 {
73 site = query[index].substr(5);
74 }
75 else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
76 {
77 prep.push_back("TITLE " + query[index].substr(8));
78 }
79 else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
80 {
81 prep.push_back("URL " + query[index].substr(6));
82 }
83 else if (query[index].find("intext:") == 0 && query[index].size() > 7)
84 {
85 prep.push_back("TEXT " + query[index].substr(7));
86 }
87 else
88 {
89 prep.push_back(query[index]);
90 }
91 }
92
93 if (prep.size() > 0)
94 {
95 bool or_(false);
96
97 for (size_t index(0); index < prep.size(); index++)
98 {
99 bool exclude(false);
100
101 if (prep[index].find('+') == 0)
102 {
103 prep[index].erase(0, 1);
104 }
105 else if (prep[index].find('-') == 0)
106 {
107 exclude = true;
108
109 prep[index].erase(0, 1);
110 }
111
112 if (or_)
113 {
114 if (prep[index].find(" OR") == string::npos)
115 {
116 or_ = false;
117 }
118
119 eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
120 }
121 else if (exclude)
122 {
123 excluded.push_back(prep[index]);
124 }
125 else if (prep[index].find(" OR") != string::npos)
126 {
127 or_ = true;
128
129 eitherOr.push_back(prep[index]);
130 }
131 else
132 {
133 required.push_back(prep[index]);
134 }
135 }
136 }
137
138 rank();
139 }
140
141 void Ranker::setSample()
142 {
143 map<unsigned, unsigned>::iterator itor;
144 multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
145
146 for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
147 {
148 unsigned distance;
149
150 if (++itor != occurrencesText.end())
151 {
152 unsigned next(itor->first);
153
154 itor--;
155
156 distance = next - (itor->first + itor->second);
157 }
158 else
159 {
160 distance = string::npos;
161
162 itor--;
163 }
164
165 distances.insert(pair<unsigned, map<unsigned,
166 unsigned>::iterator>(distance, itor));
167 }
168
169 if (distances.begin() != distances.end())
170 {
171 itor = distances.begin()->second;
172 }
173
174 string portion;
175 size_t sampleLength(0), begin(0), end(string::npos);
176
177 while (sampleLength < sampleMax && itor != occurrencesText.end())
178 {
179 unsigned found(itor->first), length(itor->second);
180
181 for (unsigned index(found); index > begin; index--)
182 {
183 if (found - index >= sampleMax - sampleLength - length)
184 {
185 while (index < found)
186 {
187 if (isspace(getText()[index++])) break;
188 }
189
190 begin = index;
191
192 break;
193 }
194 else if ((index > begin ? (isupper(getText()[index]) &&
195 !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
196 index != found)
197 {
198 begin = index;
199
200 break;
201 }
202 }
203
204 if (end + 1 != begin) sample += " <strong>...</strong> ";
205
206 portion = getText().substr(begin, found - begin);
207
208 sampleLength += portion.length();
209
210 entities(portion, '&', "&amp;");
211 entities(portion, '\"', "&quot;");
212 entities(portion, '<', "&lt;");
213 entities(portion, '>', "&gt;");
214
215 sample += portion + "<strong>";
216
217 portion = getText().substr(found, length);
218
219 sampleLength += portion.length();
220
221 entities(portion, '&', "&amp;");
222 entities(portion, '\"', "&quot;");
223 entities(portion, '<', "&lt;");
224 entities(portion, '>', "&gt;");
225
226 sample += portion + "</strong>";
227
228 begin = found + length;
229 end = begin - 1;
230
231 if (++itor != occurrencesText.end())
232 {
233 if (itor->first + itor->second < begin + sampleMax - sampleLength)
234 {
235 portion = getText().substr(begin, itor->first - begin);
236
237 sampleLength += portion.length();
238
239 entities(portion, '&', "&amp;");
240 entities(portion, '\"', "&quot;");
241 entities(portion, '<', "&lt;");
242 entities(portion, '>', "&gt;");
243
244 sample += portion;
245
246 begin = itor->first;
247 end = begin - 1;
248 }
249 else
250 {
251 for (end = begin + sampleMax - sampleLength; end > begin;
252 end--)
253 {
254 if (isspace(getText()[end])) break;
255 }
256
257 portion = getText().substr(begin, end - begin + 1);
258
259 sampleLength += portion.length();
260
261 entities(portion, '&', "&amp;");
262 entities(portion, '\"', "&quot;");
263 entities(portion, '<', "&lt;");
264 entities(portion, '>', "&gt;");
265
266 sample += portion + " <strong>...</strong>";
267
268 break;
269 }
270 }
271 else
272 {
273 for (end = begin + sampleMax - sampleLength; end > begin && (end +
274 1 < getText().length()); end--)
275 {
276 if (isspace(getText()[end])) break;
277 }
278
279 if (end >= getText().length()) end = getText().length() - 1;
280
281 portion = getText().substr(begin, end - begin + 1);
282
283 sampleLength += portion.length();
284
285 entities(portion, '&', "&amp;");
286 entities(portion, '\"', "&quot;");
287 entities(portion, '<', "&lt;");
288 entities(portion, '>', "&gt;");
289
290 sample += portion;
291
292 if (end + 1 < getText().length())
293 {
294 sample += " <strong>...</strong>";
295 }
296
297 break;
298 }
299 }
300
301 if (sample == "")
302 {
303 for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--)
304 {
305 if (isspace(getText()[end])) break;
306 }
307
308 sample = getText().substr(0, end + 1);
309
310 entities(sample, '&', "&amp;");
311 entities(sample, '\"', "&quot;");
312 entities(sample, '<', "&lt;");
313 entities(sample, '>', "&gt;");
314
315 if (end + 1 < getText().length())
316 {
317 sample += " <strong>...</strong>";
318 }
319 else if (sample == "")
320 {
321 sample = "<strong>...</strong>";
322 }
323 }
324 }
325
326 string Ranker::getTitle()
327 {
328 string title, portion;
329 size_t begin(0);
330
331 for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
332 itor != occurrencesTitle.end(); itor++)
333 {
334 unsigned found(itor->first), length(itor->second);
335
336 portion = Page::getTitle().substr(begin, found - begin);
337
338 entities(portion, '&', "&amp;");
339 entities(portion, '\"', "&quot;");
340 entities(portion, '<', "&lt;");
341 entities(portion, '>', "&gt;");
342
343 title += portion + "<strong>";
344
345 portion = Page::getTitle().substr(found, length);
346
347 entities(portion, '&', "&amp;");
348 entities(portion, '\"', "&quot;");
349 entities(portion, '<', "&lt;");
350 entities(portion, '>', "&gt;");
351
352 title += portion + "</strong>";
353
354 begin = found + length;
355 }
356
357 portion = Page::getTitle().substr(begin);
358
359 entities(portion, '&', "&amp;");
360 entities(portion, '\"', "&quot;");
361 entities(portion, '<', "&lt;");
362 entities(portion, '>', "&gt;");
363
364 title += portion;
365
366 return title;
367 }
368
369 string Ranker::getDescription()
370 {
371 string description, portion;
372 unsigned begin(0);
373
374 for (map<unsigned, unsigned>::iterator itor =
375 occurrencesDescription.begin(); itor != occurrencesDescription.end();
376 itor++)
377 {
378 unsigned found(itor->first), length(itor->second);
379
380 portion = Page::getDescription().substr(begin, found - begin);
381
382 entities(portion, '&', "&amp;");
383 entities(portion, '\"', "&quot;");
384 entities(portion, '<', "&lt;");
385 entities(portion, '>', "&gt;");
386
387 description += portion + "<strong>";
388
389 portion = Page::getDescription().substr(found, length);
390
391 entities(portion, '&', "&amp;");
392 entities(portion, '\"', "&quot;");
393 entities(portion, '<', "&lt;");
394 entities(portion, '>', "&gt;");
395
396 description += portion + "</strong>";
397
398 begin = found + length;
399 }
400
401 portion = Page::getDescription().substr(begin);
402
403 entities(portion, '&', "&amp;");
404 entities(portion, '\"', "&quot;");
405 entities(portion, '<', "&lt;");
406 entities(portion, '>', "&gt;");
407
408 description += portion;
409
410 return description;
411 }
412
413 bool Ranker::operator==(const unsigned number) const
414 {
415 return value == number;
416 }
417
418 bool Ranker::operator==(const Ranker& ranker) const
419 {
420 return value == ranker.value;
421 }
422
423 bool Ranker::operator!=(const unsigned number) const
424 {
425 return value != number;
426 }
427
428 bool Ranker::operator!=(const Ranker& ranker) const
429 {
430 return value != ranker.value;
431 }
432
433 bool Ranker::operator<(const unsigned number) const
434 {
435 return value < number;
436 }
437
438 bool Ranker::operator<(const Ranker& ranker) const
439 {
440 return value < ranker.value;
441 }
442
443 bool Ranker::operator>(const unsigned number) const
444 {
445 return value > number;
446 }
447
448 bool Ranker::operator >(const Ranker& ranker) const
449 {
450 return value > ranker.value;
451 }
452
453 void Ranker::rank()
454 {
455 lowerAddress = tolower(getAddress());
456
457 if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() -
458 site.length())
459 {
460 bool isRequired(required.size() > 0), isExcluded(excluded.size() > 0),
461 isEitherOr(eitherOr.size() > 0);
462
463 lowerURL = tolower(getURL());
464 lowerTitle = tolower(Page::getTitle());
465 lowerText = tolower(Page::getText());
466
467 if (isRequired) checkRequired();
468 if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
469 if (isEitherOr) checkEitherOr();
470
471 if (isRequired && isExcluded && isEitherOr)
472 {
473 value += requiredValue && !excludedValue && eitherOrValue ?
474 requiredValue + eitherOrValue : 0;
475 }
476 else if (isRequired && isExcluded)
477 {
478 value += requiredValue && !excludedValue ? requiredValue : 0;
479 }
480 else if (isRequired && isEitherOr)
481 {
482 value += requiredValue && eitherOrValue ? requiredValue +
483 eitherOrValue : 0;
484 }
485 else if (isExcluded && isEitherOr)
486 {
487 value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
488 }
489 else if (isRequired)
490 {
491 value += requiredValue;
492 }
493 else if (isEitherOr)
494 {
495 value += eitherOrValue;
496 }
497 else
498 {
499 // do nothing this is a bad search and warrants no results
500 }
501
502 if (value > 0)
503 {
504 string lowerDescription(tolower(Page::getDescription()));
505
506 for (unsigned index(0); index < required.size(); index++)
507 {
508 if (required[index].find("URL ") == 0)
509 {
510 value += find(required[index].substr(4), lowerDescription,
511 occurrencesDescription);
512 }
513 else if (required[index].find("TITLE ") == 0)
514 {
515 value += find(required[index].substr(6), lowerDescription,
516 occurrencesDescription);
517 }
518 else if (required[index].find("TEXT ") == 0)
519 {
520 value += find(required[index].substr(5), lowerDescription,
521 occurrencesDescription);
522 }
523 else
524 {
525 value += find(required[index], lowerDescription,
526 occurrencesDescription);
527 }
528 }
529
530 for (unsigned index1(0); index1 < eitherOr.size(); index1++)
531 {
532 vector<string> words;
533 unsigned begin(0), found;
534
535 do
536 {
537 found = eitherOr[index1].find(" OR ", begin);
538
539 if (found != string::npos)
540 {
541 words.push_back(eitherOr[index1].substr(begin, found -
542 begin));
543 }
544 else
545 {
546 words.push_back(eitherOr[index1].substr(begin));
547 }
548
549 begin = found + 4;
550 }
551 while (begin < eitherOr[index1].length() && found !=
552 string::npos);
553
554 for (unsigned number(0); number < words.size(); number++)
555 {
556 if (words[index1].find("URL ") == 0)
557 {
558 value += find(words[index1].substr(4),
559 lowerDescription, occurrencesDescription);
560 }
561 else if (words[index1].find("TITLE ") == 0)
562 {
563 value += find(words[index1].substr(6),
564 lowerDescription, occurrencesDescription);
565 }
566 else if (words[index1].find("TEXT ") == 0)
567 {
568 value += find(words[index1].substr(5),
569 lowerDescription, occurrencesDescription);
570 }
571 else
572 {
573 value += find(words[index1], lowerDescription,
574 occurrencesDescription);
575 }
576 }
577 }
578
579 for (unsigned index2(0); index2 < getHeadings().size(); index2++)
580 {
581 string lowerHeading = string(getHeadings()[index2].length(),
582 ' ');
583
584 for (unsigned number(0); number <
585 getHeadings()[index2].length(); number++)
586 {
587 lowerHeading[number] = tolower(
588 getHeadings()[index2][number]);
589 }
590
591 for (unsigned number0(0); number0 < required.size(); number0++)
592 {
593 if (required[number0].find("URL ") == 0)
594 {
595 value += find(required[number0].substr(4),
596 lowerHeading);
597 }
598 else if (required[number0].find("TITLE ") == 0)
599 {
600 value += find(required[number0].substr(6),
601 lowerHeading);
602 }
603 else if (required[number0].find("TEXT ") == 0)
604 {
605 value += find(required[number0].substr(5),
606 lowerHeading);
607 }
608 else
609 {
610 value += find(required[number0], lowerHeading);
611 }
612 }
613
614 for (unsigned number1(0); number1 < eitherOr.size(); number1++)
615 {
616 vector<string> words;
617 unsigned begin(0), found;
618
619 do
620 {
621 found = eitherOr[number1].find(" OR ", begin);
622
623 if (found != string::npos)
624 {
625 words.push_back(eitherOr[number1].substr(begin,
626 found - begin));
627 }
628 else
629 {
630 words.push_back(eitherOr[number1].substr(begin));
631 }
632
633 begin = found + 4;
634 }
635 while (begin < eitherOr[number1].length() && found !=
636 string::npos);
637
638 for (unsigned number(0); number < words.size(); number++)
639 {
640 if (words[number].find("URL ") == 0)
641 {
642 value += find(words[number].substr(4),
643 lowerHeading);
644 }
645 else if (words[number].find("TITLE ") == 0)
646 {
647 value += find(words[number].substr(6),
648 lowerHeading);
649 }
650 else if (words[number].find("TEXT ") == 0)
651 {
652 value += find(words[number].substr(5),
653 lowerHeading);
654 }
655 else
656 {
657 value += find(words[number], lowerHeading);
658 }
659 }
660 }
661 }
662 }
663 }
664 }
665
666 void Ranker::checkRequired()
667 {
668 vector<unsigned> inURLs, inTitles, inTexts;
669
670 for (unsigned index(0); index < required.size(); index++)
671 {
672 unsigned inURL(0), inTitle(0), inText(0);
673
674 if (required[index].find("URL ") == 0)
675 {
676 inURL = find(required[index].substr(4), lowerURL.substr(7));
677
678 if (inURL)
679 {
680 inTitle = find(required[index].substr(4), lowerTitle,
681 occurrencesTitle);
682 inText = find(required[index].substr(4), lowerText,
683 occurrencesText);
684
685 if (!inTitle) inTitle++;
686 if (!inText) inText++;
687 }
688 }
689 else if (required[index].find("TITLE ") == 0)
690 {
691 inTitle = find(required[index].substr(6), lowerTitle,
692 occurrencesTitle);
693
694 if (inTitle)
695 {
696 inURL = find(required[index].substr(6), lowerURL.substr(7));
697 inText = find(required[index].substr(6), lowerText,
698 occurrencesText);
699
700 if (!inURL) inURL++;
701 if (!inText) inText++;
702 }
703 }
704 else if (required[index].find("TEXT ") == 0)
705 {
706 inText = find(required[index].substr(5), lowerText,
707 occurrencesText);
708
709 if (inText)
710 {
711 inURL = find(required[index].substr(5), lowerURL.substr(7));
712 inTitle = find(required[index].substr(5), lowerTitle,
713 occurrencesTitle);
714
715 if (!inURL) inURL++;
716 if (!inTitle) inTitle++;
717 }
718 }
719 else
720 {
721 inURL = find(required[index], lowerURL.substr(7));
722 inTitle = find(required[index], lowerTitle, occurrencesTitle);
723 inText = find(required[index], lowerText, occurrencesText);
724 }
725
726 inURLs.push_back(inURL);
727 inTitles.push_back(inTitle);
728 inTexts.push_back(inText);
729 }
730
731 unsigned inURL(evaluate(inURLs)), inTitle(evaluate(inTitles)),
732 inText(evaluate(inTexts));
733
734 requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
735 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
736 inText : 0;
737 }
738
739 void Ranker::checkExcluded()
740 {
741 vector<unsigned> inURLs, inTitles, inTexts;
742
743 for (unsigned index(0); index < excluded.size(); index++)
744 {
745 unsigned inURL(0), inTitle(0), inText(0);
746
747 inURL = find(excluded[index], lowerURL.substr(7));
748 inTitle = find(excluded[index], lowerTitle);
749 inText = find(excluded[index], lowerText);
750
751 inURLs.push_back(inURL);
752 inTitles.push_back(inTitle);
753 inTexts.push_back(inText);
754 }
755
756 unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles),
757 inText(evaluate(inTexts));
758
759 excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
760 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
761 inText : 0;
762 }
763
764 void Ranker::checkEitherOr()
765 {
766 vector<unsigned> inURLs, inTitles, inTexts;
767
768 for (unsigned index(0); index < eitherOr.size(); index++)
769 {
770 vector<unsigned> inURLz, inTitlez, inTextz;
771 unsigned inURL(0), inTitle(0), inText(0);
772 vector<string> words;
773 unsigned begin(0), found;
774
775 do
776 {
777 found = eitherOr[index].find(" OR ", begin);
778
779 if (found != string::npos)
780 {
781 words.push_back(eitherOr[index].substr(begin, found - begin));
782 }
783 else
784 {
785 words.push_back(eitherOr[index].substr(begin));
786 }
787
788 begin = found + 4;
789 }
790 while (begin < eitherOr[index].length() && found != string::npos);
791
792 for (unsigned number(0); number < words.size(); number++)
793 {
794 unsigned inURL(0), inTitle(0), inText(0);
795
796 if (words[number].find("URL ") == 0)
797 {
798 inURL = find(words[number].substr(4), lowerURL.substr(7));
799
800 if (inURL)
801 {
802 inTitle = find(words[number].substr(4), lowerTitle,
803 occurrencesTitle);
804 inText = find(words[number].substr(4), lowerText,
805 occurrencesText);
806
807 if (!inTitle) inTitle++;
808 if (!inText) inText++;
809 }
810 }
811 else if (words[number].find("TITLE ") == 0)
812 {
813 inTitle = find(words[number].substr(6), lowerTitle,
814 occurrencesTitle);
815
816 if (inTitle)
817 {
818 inURL = find(words[number].substr(6), lowerURL.substr(7));
819 inText = find(words[number].substr(6), lowerText,
820 occurrencesText);
821
822 if (!inURL) inURL++;
823 if (!inText) inText++;
824 }
825 }
826 else if (words[number].find("TEXT ") == 0)
827 {
828 inText = find(words[number].substr(5), lowerText,
829 occurrencesText);
830
831 if (inText)
832 {
833 inURL = find(words[number].substr(5), lowerURL.substr(7));
834 inTitle = find(words[number].substr(5), lowerTitle,
835 occurrencesTitle);
836
837 if (!inURL) inURL++;
838 if (!inTitle) inTitle++;
839 }
840 }
841 else
842 {
843 inURL = find(words[number], lowerURL.substr(7));
844 inTitle = find(words[number], lowerTitle, occurrencesTitle);
845 inText = find(words[number], lowerText, occurrencesText);
846 }
847
848 inURLz.push_back(inURL);
849 inTitlez.push_back(inTitle);
850 inTextz.push_back(inText);
851 }
852
853 for (unsigned number0(0); number0 < inURLz.size(); number0++)
854 {
855 inURL += inURLz[number0];
856 }
857
858 for (unsigned number1(0); number1 < inTitlez.size(); number1++)
859 {
860 inTitle += inTitlez[number1];
861 }
862
863 for (unsigned number2(0); number2 < inTextz.size(); number2++)
864 {
865 inText += inTextz[number2];
866 }
867
868 inURLs.push_back(inURL);
869 inTitles.push_back(inTitle);
870 inTexts.push_back(inText);
871
872 inURLz.clear();
873 inTitlez.clear();
874 inTextz.clear();
875 words.clear();
876 }
877
878 unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles),
879 inText(evaluate(inTexts));
880
881 eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
882 || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
883 inText : 0;
884 }
885
886 unsigned Ranker::find(string word, const string& where)
887 {
888 unsigned value(0);
889
890 decrap(word);
891
892 if (word == "")
893 {
894 // this can happen if a word is all crap characters
895 value++;
896 }
897 else if (word.find_first_of(" \n\t") == string::npos)
898 {
899 unsigned begin(0), found;
900
901 do
902 {
903 found = where.find(word, begin);
904
905 if (found != string::npos)
906 {
907 bool isBefore, isAfter, before(false), after(false);
908
909 isBefore = found - 1 > 0;
910 isAfter = found + word.length() < where.length();
911
912 if (isBefore) before = isalnum(where[found - 1]) != 0;
913 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
914
915 if (!before && !after)
916 {
917 value++;
918 }
919 }
920
921 begin = found + word.length();
922 }
923 while (found != string::npos && begin < where.length());
924 }
925 else
926 {
927 value = phrase(word, where);
928 }
929
930 return value;
931 }
932
933 unsigned Ranker::find(string word, const string& where, map<unsigned,
934 unsigned>& occurrences)
935 {
936 unsigned value(0);
937
938 decrap(word);
939
940 if (word == "")
941 {
942 // this can happen if a word is all crap characters
943 value++;
944 }
945 else if (word.find_first_of(" \n ") == string::npos)
946 {
947 unsigned begin(0), found;
948
949 do
950 {
951 found = where.find(word, begin);
952
953 if (found != string::npos)
954 {
955 bool isBefore, isAfter, before(false), after(false);
956
957 isBefore = found - 1 > 0;
958 isAfter = found + word.length() < where.length();
959
960 if (isBefore) before = isalnum(where[found - 1]) != 0;
961 if (isAfter) after = isalnum(where[found + word.length()]) != 0;
962
963 if (!before && !after)
964 {
965 value++;
966
967 occurrences.insert(pair<unsigned, unsigned>(found,
968 word.length()));
969 }
970 }
971
972 begin = found + word.length();
973 }
974 while (found != string::npos && begin < where.length());
975 }
976 else
977 {
978 value = phrase(word, where, occurrences);
979 }
980
981 return value;
982 }
983
984 unsigned Ranker::phrase(const string& phrase, const string& where)
985 {
986 unsigned value(0);
987 vector<string> words;
988 unsigned begin(0), space;
989
990 do
991 {
992 space = phrase.find(' ', begin);
993
994 words.push_back(phrase.substr(begin, space - begin));
995
996 begin = space + 1;
997 }
998 while (space != string::npos && begin < phrase.length());
999
1000 begin = 0;
1001
1002 unsigned counter(0);
1003
1004 do
1005 {
1006 value += this->phrase(words, 0, begin, true, where);
1007 }
1008 while (begin < where.length());
1009
1010 return value;
1011 }
1012
1013 unsigned Ranker::phrase(const string& phrase, const string& where,
1014 map<unsigned, unsigned>& occurrences)
1015 {
1016 unsigned value(0);
1017 vector<string> words;
1018 unsigned begin(0), space;
1019
1020 do
1021 {
1022 space = phrase.find(' ', begin);
1023
1024 words.push_back(phrase.substr(begin, space - begin));
1025
1026 begin = space + 1;
1027 }
1028 while (space != string::npos && begin < phrase.length());
1029
1030 begin = 0;
1031
1032 do
1033 {
1034 value += this->phrase(words, 0, begin, true, where, occurrences);
1035 }
1036 while (begin < where.length());
1037
1038 return value;
1039 }
1040
1041 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1042 begin, bool start, const string& where)
1043 {
1044 unsigned value(0);
1045 bool end(!(word + 1 < words.size()));
1046 unsigned found(where.find(words[word], begin)), newBegin(found +
1047 words[word].length());
1048
1049 if (found != string::npos)
1050 {
1051 bool isBefore, isAfter, before(false), after(false);
1052
1053 isBefore = found - 1 > 0;
1054 isAfter = found + words[word].length() < where.length();
1055
1056 if (isBefore) before = isalnum(where[found - 1]) != 0;
1057 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1058
1059 if (!before && !after)
1060 {
1061 bool between(true);
1062
1063 if (!start)
1064 {
1065 for (unsigned index = begin + 1; index < found - 1; index++)
1066 {
1067 if (isalnum(where[index]))
1068 {
1069 between = false;
1070 break;
1071 }
1072 }
1073 }
1074
1075 if (between)
1076 {
1077 if (end)
1078 {
1079 begin = newBegin;
1080 value = 1;
1081 }
1082 else
1083 {
1084 value = phrase(words, (word + 1), newBegin, false, where);
1085 }
1086 }
1087 }
1088 }
1089
1090 if (start)
1091 {
1092 if (found != string::npos)
1093 {
1094 begin = newBegin;
1095 }
1096 else
1097 {
1098 begin = string::npos;
1099 }
1100 }
1101
1102 return value;
1103 }
1104
1105 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1106 begin, bool start, const string& where, map<unsigned, unsigned>&
1107 occurrences)
1108 {
1109 unsigned value(0);
1110 bool end(!(word + 1 < words.size()));
1111 unsigned found(where.find(words[word], begin)), newBegin(found +
1112 words[word].length());
1113
1114 if (found != string::npos)
1115 {
1116 bool isBefore, isAfter, before(false), after(false);
1117
1118 isBefore = found - 1 > 0;
1119 isAfter = found + words[word].length() < where.length();
1120
1121 if (isBefore) before = isalnum(where[found - 1]) != 0;
1122 if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1123
1124 if (!before && !after)
1125 {
1126 bool between(true);
1127
1128 if (!start)
1129 {
1130 for (unsigned index = begin + 1; index < found - 1; index++)
1131 {
1132 if (isalnum(where[index]))
1133 {
1134 between = false;
1135 break;
1136 }
1137 }
1138 }
1139
1140 if (between)
1141 {
1142 occurrences.insert(pair<unsigned, unsigned>(found,
1143 words[word].length()));
1144
1145 if (end)
1146 {
1147 begin = newBegin;
1148 value = 1;
1149 }
1150 else
1151 {
1152 value = phrase(words, (word + 1), newBegin, false, where,
1153 occurrences);
1154 }
1155 }
1156 }
1157 }
1158
1159 if (start)
1160 {
1161 if (found != string::npos)
1162 {
1163 begin = newBegin;
1164 }
1165 else
1166 {
1167 begin = string::npos;
1168 }
1169 }
1170
1171 return value;
1172 }
1173
1174 unsigned Ranker::evaluate(vector<unsigned>& ins)
1175 {
1176 unsigned in(0);
1177
1178 for (unsigned index(0); index < ins.size(); index++)
1179 {
1180 if (ins[index] > 0)
1181 {
1182 in += ins[index];
1183 }
1184 else
1185 {
1186 in = 0;
1187 break;
1188 }
1189 }
1190
1191 return in;
1192 }
1193
1194 void Ranker::decrap(string& crap)
1195 {
1196 unsigned begin(0), found;
1197
1198 do
1199 {
1200 // &, _, +, and # are not considered crap
1201 found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1202
1203 if (found != string::npos)
1204 {
1205 crap[found] = ' ';
1206 }
1207
1208 begin = found + 1;
1209 }
1210 while (found != string::npos && begin < crap.length());
1211
1212 normalize(crap);
1213 }

Properties

Name Value
svn:eol-style native
svn:keywords Id