ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Ranker.cpp
Revision: 355
Committed: 2004-06-04T04:08:28-07:00 (21 years ago) by Douglas Thrift
File size: 26960 byte(s)
Log Message:
I missed some C++ifying!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Ranker
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
51 Douglas Thrift 334 #include "Ranker.hpp"
52 douglas 1
53     void Ranker::rank(vector<string> query)
54     {
55     vector<string> prep;
56    
57 Douglas Thrift 349 for (size_t index(0); index < query.size(); index++)
58 douglas 1 {
59     if (query[index] == "allintitle:" && index == 0)
60     {
61     allIn = title;
62     }
63     else if (query[index] == "allinurl:" && index == 0)
64     {
65     allIn = url;
66     }
67     else if (query[index] == "allintext:" && index == 0)
68     {
69     allIn = text;
70     }
71     else if (query[index].find("site:") == 0 && query[index].size() > 5)
72     {
73     site = query[index].substr(5);
74     }
75     else if (query[index].find("intitle:") == 0 && query[index].size() > 8)
76     {
77     prep.push_back("TITLE " + query[index].substr(8));
78     }
79     else if (query[index].find("inurl:") == 0 && query[index].size() > 6)
80     {
81     prep.push_back("URL " + query[index].substr(6));
82     }
83     else if (query[index].find("intext:") == 0 && query[index].size() > 7)
84     {
85     prep.push_back("TEXT " + query[index].substr(7));
86     }
87     else
88     {
89     prep.push_back(query[index]);
90     }
91     }
92    
93     if (prep.size() > 0)
94     {
95 Douglas Thrift 349 bool or_(false);
96    
97     for (size_t index(0); index < prep.size(); index++)
98 douglas 1 {
99 Douglas Thrift 349 bool exclude(false);
100    
101 douglas 1 if (prep[index].find('+') == 0)
102     {
103     prep[index].erase(0, 1);
104     }
105     else if (prep[index].find('-') == 0)
106     {
107     exclude = true;
108 Douglas Thrift 349
109 douglas 1 prep[index].erase(0, 1);
110     }
111    
112     if (or_)
113     {
114     if (prep[index].find(" OR") == string::npos)
115     {
116     or_ = false;
117     }
118    
119     eitherOr[eitherOr.size() - 1] += ' ' + prep[index];
120     }
121     else if (exclude)
122     {
123     excluded.push_back(prep[index]);
124     }
125     else if (prep[index].find(" OR") != string::npos)
126     {
127     or_ = true;
128 Douglas Thrift 349
129 douglas 1 eitherOr.push_back(prep[index]);
130     }
131     else
132     {
133     required.push_back(prep[index]);
134     }
135     }
136     }
137    
138     rank();
139     }
140    
// Builds the HTML excerpt stored in `sample` from the page text.
//
// The excerpt starts at the text occurrence (from occurrencesText) that has
// the smallest gap to the occurrence after it, then walks forward through the
// following occurrences. Each match is wrapped in <strong>...</strong>, and
// " <strong>...</strong> " marks places where text was skipped. sampleLength
// counts the unescaped characters emitted and is compared against sampleMax.
// If nothing was produced, the excerpt falls back to the start of the text.
141     void Ranker::setSample()
142     {
143     map<unsigned, unsigned>::iterator itor;
144     multimap<unsigned, map<unsigned, unsigned>::iterator> distances;
145    
// Key every occurrence by the gap between its end (first + second) and the
// start of the next occurrence. The ++itor / itor-- pairs peek at the next
// element and step back, so the loop's own itor++ still visits each entry once.
146     for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++)
147     {
148     unsigned distance;
149    
150     if (++itor != occurrencesText.end())
151     {
152 Douglas Thrift 349 unsigned next(itor->first);
153 Douglas Thrift 341
154 douglas 1 itor--;
155 Douglas Thrift 349
156 douglas 1 distance = next - (itor->first + itor->second);
157     }
158     else
159     {
// Last occurrence: no following match, so it gets the largest key.
// NOTE(review): string::npos is a size_t value stored in an unsigned here;
// this narrows where size_t is wider than unsigned - confirm that is intended.
160 Douglas Thrift 339 distance = string::npos;
161 Douglas Thrift 349
162 douglas 1 itor--;
163     }
164    
165     distances.insert(pair<unsigned, map<unsigned,
166     unsigned>::iterator>(distance, itor));
167     }
168    
// The loop above leaves itor at end(); if any occurrence exists, restart from
// the one with the smallest gap (the multimap is ordered by key).
169     if (distances.begin() != distances.end())
170     {
171     itor = distances.begin()->second;
172     }
173    
174     string portion;
175 Douglas Thrift 349 size_t sampleLength(0), begin(0), end(string::npos);
176 Douglas Thrift 341
177 Douglas Thrift 340 while (sampleLength < sampleMax && itor != occurrencesText.end())
178 douglas 1 {
179 Douglas Thrift 349 unsigned found(itor->first), length(itor->second);
180 douglas 1
// Scan backwards from the match to choose where the leading context starts.
181 Douglas Thrift 349 for (unsigned index(found); index > begin; index--)
182 douglas 1 {
// Budget reached: move forward to just past the next whitespace so the
// excerpt does not start in the middle of a word.
// NOTE(review): sampleMax - sampleLength - length is unsigned arithmetic and
// would wrap if sampleLength + length exceeded sampleMax - confirm it cannot.
183 Douglas Thrift 340 if (found - index >= sampleMax - sampleLength - length)
184 douglas 1 {
185 Douglas Thrift 349 while (index < found)
186 douglas 1 {
187 Douglas Thrift 349 if (isspace(getText()[index++])) break;
188 douglas 1 }
189 Douglas Thrift 341
190 Douglas Thrift 349 begin = index;
191 Douglas Thrift 341
192 douglas 1 break;
193     }
// Otherwise stop at an upper-case letter that is not preceded by an
// alphanumeric character and is not the match position itself.
194     else if ((index > begin ? (isupper(getText()[index]) &&
195     !isalnum(getText()[index - 1])) : isupper(getText()[index])) &&
196     index != found)
197     {
198     begin = index;
199 Douglas Thrift 341
200 douglas 1 break;
201     }
202     }
203    
// end is the last position already emitted (string::npos before anything was
// emitted, so end + 1 is 0); a mismatch with begin means text was skipped.
204     if (end + 1 != begin) sample += " <strong>...</strong> ";
205    
// Leading context before the match; the entities() calls are given the
// characters &, ", < and > together with their HTML entity strings.
206     portion = getText().substr(begin, found - begin);
207 Douglas Thrift 349
208 douglas 1 sampleLength += portion.length();
209    
210     entities(portion, '&', "&amp;");
211     entities(portion, '\"', "&quot;");
212     entities(portion, '<', "&lt;");
213     entities(portion, '>', "&gt;");
214    
215     sample += portion + "<strong>";
216 Douglas Thrift 349
// The match itself, emitted inside the <strong> element.
217 douglas 1 portion = getText().substr(found, length);
218 Douglas Thrift 349
219 douglas 1 sampleLength += portion.length();
220    
221     entities(portion, '&', "&amp;");
222     entities(portion, '\"', "&quot;");
223     entities(portion, '<', "&lt;");
224     entities(portion, '>', "&gt;");
225    
226     sample += portion + "</strong>";
227 Douglas Thrift 349
228 douglas 1 begin = found + length;
229     end = begin - 1;
230    
231     if (++itor != occurrencesText.end())
232     {
// The next occurrence ends inside the remaining budget: emit the text in
// between and let the while loop handle that occurrence.
233 Douglas Thrift 340 if (itor->first + itor->second < begin + sampleMax - sampleLength)
234 douglas 1 {
235     portion = getText().substr(begin, itor->first - begin);
236 Douglas Thrift 349
237 douglas 1 sampleLength += portion.length();
238    
239     entities(portion, '&', "&amp;");
240     entities(portion, '\"', "&quot;");
241     entities(portion, '<', "&lt;");
242     entities(portion, '>', "&gt;");
243    
244     sample += portion;
245 Douglas Thrift 349
246 douglas 1 begin = itor->first;
247     end = begin - 1;
248     }
249     else
250     {
// The next occurrence does not fit: emit trailing context cut back to the
// last whitespace inside the budget, add an ellipsis and stop.
// NOTE(review): if sampleLength already exceeds sampleMax, the start value of
// end wraps below begin and end - begin + 1 becomes very large - confirm.
251 Douglas Thrift 340 for (end = begin + sampleMax - sampleLength; end > begin;
252     end--)
253 douglas 1 {
254     if (isspace(getText()[end])) break;
255     }
256    
257     portion = getText().substr(begin, end - begin + 1);
258 Douglas Thrift 349
259 douglas 1 sampleLength += portion.length();
260    
261     entities(portion, '&', "&amp;");
262     entities(portion, '\"', "&quot;");
263     entities(portion, '<', "&lt;");
264     entities(portion, '>', "&gt;");
265    
266     sample += portion + " <strong>...</strong>";
267    
268     break;
269     }
270     }
271     else
272     {
// No further occurrences: emit trailing context up to the budget, cut back to
// whitespace unless the end of the text is reached first.
273 Douglas Thrift 340 for (end = begin + sampleMax - sampleLength; end > begin && (end +
274     1 < getText().length()); end--)
275 douglas 1 {
276     if (isspace(getText()[end])) break;
277     }
278    
279     if (end >= getText().length()) end = getText().length() - 1;
280    
281     portion = getText().substr(begin, end - begin + 1);
282 Douglas Thrift 349
283 douglas 1 sampleLength += portion.length();
284    
285     entities(portion, '&', "&amp;");
286     entities(portion, '\"', "&quot;");
287     entities(portion, '<', "&lt;");
288     entities(portion, '>', "&gt;");
289    
290     sample += portion;
291    
// Ellipsis only when text remains after the excerpt.
292     if (end + 1 < getText().length())
293     {
294     sample += " <strong>...</strong>";
295     }
296    
297     break;
298     }
299     }
300 Douglas Thrift 341
// Fallback when nothing was emitted above: use the start of the text, cut
// back to whitespace within sampleMax characters.
301 Douglas Thrift 355 if (sample.empty())
302 Douglas Thrift 341 {
303     for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--)
304     {
305     if (isspace(getText()[end])) break;
306     }
307    
308     sample = getText().substr(0, end + 1);
309    
310     entities(sample, '&', "&amp;");
311     entities(sample, '\"', "&quot;");
312     entities(sample, '<', "&lt;");
313     entities(sample, '>', "&gt;");
314    
315     if (end + 1 < getText().length())
316     {
317     sample += " <strong>...</strong>";
318     }
// The text itself is empty: show a bare ellipsis rather than nothing.
319 Douglas Thrift 355 else if (sample.empty())
320 Douglas Thrift 342 {
321     sample = "<strong>...</strong>";
322     }
323 Douglas Thrift 341 }
324 douglas 1 }
325    
326     string Ranker::getTitle()
327     {
328     string title, portion;
329 Douglas Thrift 349 size_t begin(0);
330 douglas 1
331     for (map<unsigned, unsigned>::iterator itor = occurrencesTitle.begin();
332     itor != occurrencesTitle.end(); itor++)
333     {
334 Douglas Thrift 349 unsigned found(itor->first), length(itor->second);
335 douglas 1
336     portion = Page::getTitle().substr(begin, found - begin);
337    
338     entities(portion, '&', "&amp;");
339     entities(portion, '\"', "&quot;");
340     entities(portion, '<', "&lt;");
341     entities(portion, '>', "&gt;");
342    
343     title += portion + "<strong>";
344    
345     portion = Page::getTitle().substr(found, length);
346    
347     entities(portion, '&', "&amp;");
348     entities(portion, '\"', "&quot;");
349     entities(portion, '<', "&lt;");
350     entities(portion, '>', "&gt;");
351    
352     title += portion + "</strong>";
353    
354     begin = found + length;
355     }
356    
357     portion = Page::getTitle().substr(begin);
358    
359     entities(portion, '&', "&amp;");
360     entities(portion, '\"', "&quot;");
361     entities(portion, '<', "&lt;");
362     entities(portion, '>', "&gt;");
363    
364     title += portion;
365    
366     return title;
367     }
368    
369     string Ranker::getDescription()
370     {
371     string description, portion;
372 Douglas Thrift 349 unsigned begin(0);
373 douglas 1
374     for (map<unsigned, unsigned>::iterator itor =
375     occurrencesDescription.begin(); itor != occurrencesDescription.end();
376     itor++)
377     {
378 Douglas Thrift 349 unsigned found(itor->first), length(itor->second);
379 douglas 1
380     portion = Page::getDescription().substr(begin, found - begin);
381    
382     entities(portion, '&', "&amp;");
383     entities(portion, '\"', "&quot;");
384     entities(portion, '<', "&lt;");
385     entities(portion, '>', "&gt;");
386    
387     description += portion + "<strong>";
388    
389     portion = Page::getDescription().substr(found, length);
390    
391     entities(portion, '&', "&amp;");
392     entities(portion, '\"', "&quot;");
393     entities(portion, '<', "&lt;");
394     entities(portion, '>', "&gt;");
395    
396     description += portion + "</strong>";
397    
398     begin = found + length;
399     }
400    
401     portion = Page::getDescription().substr(begin);
402    
403     entities(portion, '&', "&amp;");
404     entities(portion, '\"', "&quot;");
405     entities(portion, '<', "&lt;");
406     entities(portion, '>', "&gt;");
407    
408     description += portion;
409    
410     return description;
411     }
412    
413     bool Ranker::operator==(const unsigned number) const
414     {
415     return value == number;
416     }
417    
418     bool Ranker::operator==(const Ranker& ranker) const
419     {
420     return value == ranker.value;
421     }
422    
423     bool Ranker::operator!=(const unsigned number) const
424     {
425     return value != number;
426     }
427    
428     bool Ranker::operator!=(const Ranker& ranker) const
429     {
430     return value != ranker.value;
431     }
432    
433     bool Ranker::operator<(const unsigned number) const
434     {
435     return value < number;
436     }
437    
438     bool Ranker::operator<(const Ranker& ranker) const
439     {
440     return value < ranker.value;
441     }
442    
443     bool Ranker::operator>(const unsigned number) const
444     {
445     return value > number;
446     }
447    
448     bool Ranker::operator >(const Ranker& ranker) const
449     {
450     return value > ranker.value;
451     }
452    
453     void Ranker::rank()
454     {
455 douglas 211 lowerAddress = tolower(getAddress());
456 douglas 1
457 Douglas Thrift 355 if (site.empty() || lowerAddress.rfind(site) == lowerAddress.length() -
458 douglas 1 site.length())
459     {
460 Douglas Thrift 349 bool isRequired(required.size() > 0), isExcluded(excluded.size() > 0),
461     isEitherOr(eitherOr.size() > 0);
462 douglas 1
463 douglas 211 lowerURL = tolower(getURL());
464     lowerTitle = tolower(Page::getTitle());
465     lowerText = tolower(Page::getText());
466 douglas 1
467     if (isRequired) checkRequired();
468     if (isExcluded && (isRequired || isEitherOr)) checkExcluded();
469     if (isEitherOr) checkEitherOr();
470    
471     if (isRequired && isExcluded && isEitherOr)
472     {
473     value += requiredValue && !excludedValue && eitherOrValue ?
474     requiredValue + eitherOrValue : 0;
475     }
476     else if (isRequired && isExcluded)
477     {
478     value += requiredValue && !excludedValue ? requiredValue : 0;
479     }
480     else if (isRequired && isEitherOr)
481     {
482     value += requiredValue && eitherOrValue ? requiredValue +
483     eitherOrValue : 0;
484     }
485     else if (isExcluded && isEitherOr)
486     {
487     value += !excludedValue && eitherOrValue ? eitherOrValue : 0;
488     }
489     else if (isRequired)
490     {
491     value += requiredValue;
492     }
493     else if (isEitherOr)
494     {
495     value += eitherOrValue;
496     }
497     else
498     {
499     // do nothing this is a bad search and warrants no results
500     }
501    
502     if (value > 0)
503     {
504 Douglas Thrift 349 string lowerDescription(tolower(Page::getDescription()));
505 douglas 1
506 Douglas Thrift 349 for (unsigned index(0); index < required.size(); index++)
507 douglas 1 {
508 douglas 211 if (required[index].find("URL ") == 0)
509 douglas 1 {
510 douglas 211 value += find(required[index].substr(4), lowerDescription,
511 douglas 1 occurrencesDescription);
512     }
513 douglas 211 else if (required[index].find("TITLE ") == 0)
514 douglas 1 {
515 douglas 211 value += find(required[index].substr(6), lowerDescription,
516 douglas 1 occurrencesDescription);
517     }
518 douglas 211 else if (required[index].find("TEXT ") == 0)
519 douglas 1 {
520 douglas 211 value += find(required[index].substr(5), lowerDescription,
521 douglas 1 occurrencesDescription);
522     }
523     else
524     {
525 douglas 211 value += find(required[index], lowerDescription,
526 douglas 1 occurrencesDescription);
527     }
528     }
529    
530 Douglas Thrift 349 for (unsigned index1(0); index1 < eitherOr.size(); index1++)
531 douglas 1 {
532     vector<string> words;
533 Douglas Thrift 349 unsigned begin(0), found;
534 douglas 1
535     do
536     {
537     found = eitherOr[index1].find(" OR ", begin);
538    
539     if (found != string::npos)
540     {
541     words.push_back(eitherOr[index1].substr(begin, found -
542     begin));
543     }
544     else
545     {
546     words.push_back(eitherOr[index1].substr(begin));
547     }
548    
549     begin = found + 4;
550     }
551     while (begin < eitherOr[index1].length() && found !=
552     string::npos);
553    
554 Douglas Thrift 349 for (unsigned number(0); number < words.size(); number++)
555 douglas 1 {
556     if (words[index1].find("URL ") == 0)
557     {
558 douglas 15 value += find(words[index1].substr(4),
559     lowerDescription, occurrencesDescription);
560 douglas 1 }
561     else if (words[index1].find("TITLE ") == 0)
562     {
563 douglas 15 value += find(words[index1].substr(6),
564     lowerDescription, occurrencesDescription);
565 douglas 1 }
566     else if (words[index1].find("TEXT ") == 0)
567     {
568 douglas 15 value += find(words[index1].substr(5),
569     lowerDescription, occurrencesDescription);
570 douglas 1 }
571     else
572     {
573     value += find(words[index1], lowerDescription,
574     occurrencesDescription);
575     }
576     }
577     }
578    
579 Douglas Thrift 349 for (unsigned index2(0); index2 < getHeadings().size(); index2++)
580 douglas 1 {
581     string lowerHeading = string(getHeadings()[index2].length(),
582     ' ');
583 Douglas Thrift 349
584     for (unsigned number(0); number <
585 douglas 1 getHeadings()[index2].length(); number++)
586     {
587     lowerHeading[number] = tolower(
588     getHeadings()[index2][number]);
589     }
590    
591 Douglas Thrift 349 for (unsigned number0(0); number0 < required.size(); number0++)
592 douglas 1 {
593     if (required[number0].find("URL ") == 0)
594     {
595 douglas 15 value += find(required[number0].substr(4),
596 douglas 1 lowerHeading);
597     }
598     else if (required[number0].find("TITLE ") == 0)
599     {
600 douglas 15 value += find(required[number0].substr(6),
601 douglas 1 lowerHeading);
602     }
603     else if (required[number0].find("TEXT ") == 0)
604     {
605 douglas 15 value += find(required[number0].substr(5),
606 douglas 1 lowerHeading);
607     }
608     else
609     {
610     value += find(required[number0], lowerHeading);
611     }
612     }
613    
614 Douglas Thrift 349 for (unsigned number1(0); number1 < eitherOr.size(); number1++)
615 douglas 1 {
616     vector<string> words;
617 Douglas Thrift 349 unsigned begin(0), found;
618 douglas 1
619     do
620     {
621     found = eitherOr[number1].find(" OR ", begin);
622    
623     if (found != string::npos)
624     {
625     words.push_back(eitherOr[number1].substr(begin,
626     found - begin));
627     }
628     else
629     {
630     words.push_back(eitherOr[number1].substr(begin));
631     }
632    
633     begin = found + 4;
634     }
635     while (begin < eitherOr[number1].length() && found !=
636     string::npos);
637    
638 Douglas Thrift 349 for (unsigned number(0); number < words.size(); number++)
639 douglas 1 {
640     if (words[number].find("URL ") == 0)
641     {
642 douglas 15 value += find(words[number].substr(4),
643 douglas 1 lowerHeading);
644     }
645     else if (words[number].find("TITLE ") == 0)
646     {
647 douglas 15 value += find(words[number].substr(6),
648 douglas 1 lowerHeading);
649     }
650     else if (words[number].find("TEXT ") == 0)
651     {
652 douglas 15 value += find(words[number].substr(5),
653 douglas 1 lowerHeading);
654     }
655     else
656     {
657     value += find(words[number], lowerHeading);
658     }
659     }
660     }
661     }
662     }
663     }
664     }
665    
666     void Ranker::checkRequired()
667     {
668     vector<unsigned> inURLs, inTitles, inTexts;
669    
670 Douglas Thrift 349 for (unsigned index(0); index < required.size(); index++)
671 douglas 1 {
672 Douglas Thrift 349 unsigned inURL(0), inTitle(0), inText(0);
673 douglas 1
674     if (required[index].find("URL ") == 0)
675     {
676 douglas 15 inURL = find(required[index].substr(4), lowerURL.substr(7));
677 douglas 1
678     if (inURL)
679     {
680 douglas 15 inTitle = find(required[index].substr(4), lowerTitle,
681 douglas 1 occurrencesTitle);
682 douglas 15 inText = find(required[index].substr(4), lowerText,
683 douglas 1 occurrencesText);
684    
685     if (!inTitle) inTitle++;
686     if (!inText) inText++;
687     }
688     }
689     else if (required[index].find("TITLE ") == 0)
690     {
691 douglas 15 inTitle = find(required[index].substr(6), lowerTitle,
692 douglas 1 occurrencesTitle);
693    
694     if (inTitle)
695     {
696 douglas 15 inURL = find(required[index].substr(6), lowerURL.substr(7));
697     inText = find(required[index].substr(6), lowerText,
698 douglas 1 occurrencesText);
699    
700     if (!inURL) inURL++;
701     if (!inText) inText++;
702     }
703     }
704     else if (required[index].find("TEXT ") == 0)
705     {
706 douglas 15 inText = find(required[index].substr(5), lowerText,
707 douglas 1 occurrencesText);
708    
709     if (inText)
710     {
711 douglas 15 inURL = find(required[index].substr(5), lowerURL.substr(7));
712     inTitle = find(required[index].substr(5), lowerTitle,
713 douglas 1 occurrencesTitle);
714    
715     if (!inURL) inURL++;
716     if (!inTitle) inTitle++;
717     }
718     }
719     else
720     {
721 douglas 15 inURL = find(required[index], lowerURL.substr(7));
722 douglas 1 inTitle = find(required[index], lowerTitle, occurrencesTitle);
723     inText = find(required[index], lowerText, occurrencesText);
724     }
725    
726     inURLs.push_back(inURL);
727     inTitles.push_back(inTitle);
728     inTexts.push_back(inText);
729     }
730    
731 Douglas Thrift 349 unsigned inURL(evaluate(inURLs)), inTitle(evaluate(inTitles)),
732     inText(evaluate(inTexts));
733 douglas 1
734     requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
735     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
736     inText : 0;
737     }
738    
739     void Ranker::checkExcluded()
740     {
741     vector<unsigned> inURLs, inTitles, inTexts;
742    
743 Douglas Thrift 349 for (unsigned index(0); index < excluded.size(); index++)
744 douglas 1 {
745 Douglas Thrift 349 unsigned inURL(0), inTitle(0), inText(0);
746 douglas 1
747 douglas 15 inURL = find(excluded[index], lowerURL.substr(7));
748 douglas 1 inTitle = find(excluded[index], lowerTitle);
749     inText = find(excluded[index], lowerText);
750    
751     inURLs.push_back(inURL);
752     inTitles.push_back(inTitle);
753     inTexts.push_back(inText);
754     }
755    
756 Douglas Thrift 349 unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles),
757     inText(evaluate(inTexts));
758 douglas 1
759     excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
760     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
761     inText : 0;
762     }
763    
764     void Ranker::checkEitherOr()
765     {
766     vector<unsigned> inURLs, inTitles, inTexts;
767    
768 Douglas Thrift 349 for (unsigned index(0); index < eitherOr.size(); index++)
769 douglas 1 {
770     vector<unsigned> inURLz, inTitlez, inTextz;
771 Douglas Thrift 349 unsigned inURL(0), inTitle(0), inText(0);
772 douglas 1 vector<string> words;
773 Douglas Thrift 349 unsigned begin(0), found;
774 douglas 1
775     do
776     {
777     found = eitherOr[index].find(" OR ", begin);
778    
779     if (found != string::npos)
780     {
781     words.push_back(eitherOr[index].substr(begin, found - begin));
782     }
783     else
784     {
785     words.push_back(eitherOr[index].substr(begin));
786     }
787    
788     begin = found + 4;
789     }
790     while (begin < eitherOr[index].length() && found != string::npos);
791    
792 Douglas Thrift 349 for (unsigned number(0); number < words.size(); number++)
793 douglas 1 {
794 Douglas Thrift 349 unsigned inURL(0), inTitle(0), inText(0);
795 douglas 1
796     if (words[number].find("URL ") == 0)
797     {
798 douglas 15 inURL = find(words[number].substr(4), lowerURL.substr(7));
799 douglas 1
800     if (inURL)
801     {
802 douglas 15 inTitle = find(words[number].substr(4), lowerTitle,
803 douglas 1 occurrencesTitle);
804 douglas 15 inText = find(words[number].substr(4), lowerText,
805 douglas 1 occurrencesText);
806    
807     if (!inTitle) inTitle++;
808     if (!inText) inText++;
809     }
810     }
811     else if (words[number].find("TITLE ") == 0)
812     {
813 douglas 15 inTitle = find(words[number].substr(6), lowerTitle,
814 douglas 1 occurrencesTitle);
815    
816     if (inTitle)
817     {
818 douglas 15 inURL = find(words[number].substr(6), lowerURL.substr(7));
819     inText = find(words[number].substr(6), lowerText,
820 douglas 1 occurrencesText);
821    
822     if (!inURL) inURL++;
823     if (!inText) inText++;
824     }
825     }
826     else if (words[number].find("TEXT ") == 0)
827     {
828 douglas 15 inText = find(words[number].substr(5), lowerText,
829 douglas 1 occurrencesText);
830    
831     if (inText)
832     {
833 douglas 15 inURL = find(words[number].substr(5), lowerURL.substr(7));
834     inTitle = find(words[number].substr(5), lowerTitle,
835 douglas 1 occurrencesTitle);
836    
837     if (!inURL) inURL++;
838     if (!inTitle) inTitle++;
839     }
840     }
841     else
842     {
843 douglas 15 inURL = find(words[number], lowerURL.substr(7));
844 douglas 1 inTitle = find(words[number], lowerTitle, occurrencesTitle);
845     inText = find(words[number], lowerText, occurrencesText);
846     }
847    
848     inURLz.push_back(inURL);
849     inTitlez.push_back(inTitle);
850     inTextz.push_back(inText);
851     }
852    
853 Douglas Thrift 349 for (unsigned number0(0); number0 < inURLz.size(); number0++)
854 douglas 1 {
855     inURL += inURLz[number0];
856     }
857    
858 Douglas Thrift 349 for (unsigned number1(0); number1 < inTitlez.size(); number1++)
859 douglas 1 {
860     inTitle += inTitlez[number1];
861     }
862    
863 Douglas Thrift 349 for (unsigned number2(0); number2 < inTextz.size(); number2++)
864 douglas 1 {
865     inText += inTextz[number2];
866     }
867    
868     inURLs.push_back(inURL);
869     inTitles.push_back(inTitle);
870     inTexts.push_back(inText);
871    
872     inURLz.clear();
873     inTitlez.clear();
874     inTextz.clear();
875     words.clear();
876     }
877    
878 Douglas Thrift 349 unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles),
879     inText(evaluate(inTexts));
880 douglas 1
881     eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title))
882     || (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle +
883     inText : 0;
884     }
885    
886 douglas 15 unsigned Ranker::find(string word, const string& where)
887 douglas 1 {
888 Douglas Thrift 349 unsigned value(0);
889 douglas 1
890     decrap(word);
891    
892 Douglas Thrift 355 if (word.empty())
893 douglas 1 {
894     // this can happen if a word is all crap characters
895     value++;
896     }
897 Douglas Thrift 349 else if (word.find_first_of(" \n\t") == string::npos)
898 douglas 1 {
899 Douglas Thrift 349 unsigned begin(0), found;
900    
901 douglas 1 do
902     {
903     found = where.find(word, begin);
904    
905     if (found != string::npos)
906     {
907 Douglas Thrift 349 bool isBefore, isAfter, before(false), after(false);
908    
909 douglas 1 isBefore = found - 1 > 0;
910     isAfter = found + word.length() < where.length();
911    
912     if (isBefore) before = isalnum(where[found - 1]) != 0;
913     if (isAfter) after = isalnum(where[found + word.length()]) != 0;
914    
915     if (!before && !after)
916     {
917     value++;
918     }
919     }
920    
921     begin = found + word.length();
922     }
923     while (found != string::npos && begin < where.length());
924     }
925     else
926     {
927     value = phrase(word, where);
928     }
929    
930     return value;
931     }
932    
933 douglas 15 unsigned Ranker::find(string word, const string& where, map<unsigned,
934     unsigned>& occurrences)
935 douglas 1 {
936 Douglas Thrift 349 unsigned value(0);
937 douglas 1
938     decrap(word);
939    
940 Douglas Thrift 355 if (word.empty())
941 douglas 1 {
942     // this can happen if a word is all crap characters
943     value++;
944     }
945     else if (word.find_first_of(" \n ") == string::npos)
946     {
947 Douglas Thrift 349 unsigned begin(0), found;
948    
949 douglas 1 do
950     {
951     found = where.find(word, begin);
952    
953     if (found != string::npos)
954     {
955 Douglas Thrift 349 bool isBefore, isAfter, before(false), after(false);
956    
957 douglas 1 isBefore = found - 1 > 0;
958     isAfter = found + word.length() < where.length();
959    
960     if (isBefore) before = isalnum(where[found - 1]) != 0;
961     if (isAfter) after = isalnum(where[found + word.length()]) != 0;
962    
963     if (!before && !after)
964     {
965     value++;
966    
967     occurrences.insert(pair<unsigned, unsigned>(found,
968     word.length()));
969     }
970     }
971    
972     begin = found + word.length();
973     }
974     while (found != string::npos && begin < where.length());
975     }
976     else
977     {
978     value = phrase(word, where, occurrences);
979     }
980    
981     return value;
982     }
983    
984 douglas 15 unsigned Ranker::phrase(const string& phrase, const string& where)
985 douglas 1 {
986 Douglas Thrift 349 unsigned value(0);
987 douglas 1 vector<string> words;
988 Douglas Thrift 349 unsigned begin(0), space;
989 douglas 1
990     do
991     {
992     space = phrase.find(' ', begin);
993    
994     words.push_back(phrase.substr(begin, space - begin));
995    
996     begin = space + 1;
997     }
998     while (space != string::npos && begin < phrase.length());
999    
1000     begin = 0;
1001 Douglas Thrift 349
1002     unsigned counter(0);
1003    
1004 douglas 1 do
1005     {
1006     value += this->phrase(words, 0, begin, true, where);
1007     }
1008     while (begin < where.length());
1009    
1010     return value;
1011     }
1012    
1013 douglas 15 unsigned Ranker::phrase(const string& phrase, const string& where,
1014     map<unsigned, unsigned>& occurrences)
1015 douglas 1 {
1016 Douglas Thrift 349 unsigned value(0);
1017 douglas 1 vector<string> words;
1018 Douglas Thrift 349 unsigned begin(0), space;
1019 douglas 1
1020     do
1021     {
1022     space = phrase.find(' ', begin);
1023    
1024     words.push_back(phrase.substr(begin, space - begin));
1025    
1026     begin = space + 1;
1027     }
1028     while (space != string::npos && begin < phrase.length());
1029    
1030     begin = 0;
1031 Douglas Thrift 349
1032 douglas 1 do
1033     {
1034     value += this->phrase(words, 0, begin, true, where, occurrences);
1035     }
1036     while (begin < where.length());
1037    
1038     return value;
1039     }
1040    
1041 douglas 15 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1042     begin, bool start, const string& where)
1043 douglas 1 {
1044 Douglas Thrift 349 unsigned value(0);
1045     bool end(!(word + 1 < words.size()));
1046     unsigned found(where.find(words[word], begin)), newBegin(found +
1047     words[word].length());
1048 douglas 1
1049     if (found != string::npos)
1050     {
1051 Douglas Thrift 349 bool isBefore, isAfter, before(false), after(false);
1052    
1053 douglas 1 isBefore = found - 1 > 0;
1054     isAfter = found + words[word].length() < where.length();
1055    
1056     if (isBefore) before = isalnum(where[found - 1]) != 0;
1057     if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1058    
1059     if (!before && !after)
1060     {
1061 Douglas Thrift 349 bool between(true);
1062    
1063 douglas 1 if (!start)
1064     {
1065     for (unsigned index = begin + 1; index < found - 1; index++)
1066     {
1067     if (isalnum(where[index]))
1068     {
1069     between = false;
1070     break;
1071     }
1072     }
1073     }
1074    
1075     if (between)
1076     {
1077     if (end)
1078     {
1079     begin = newBegin;
1080     value = 1;
1081     }
1082     else
1083     {
1084     value = phrase(words, (word + 1), newBegin, false, where);
1085     }
1086     }
1087     }
1088     }
1089    
1090     if (start)
1091     {
1092     if (found != string::npos)
1093     {
1094     begin = newBegin;
1095     }
1096     else
1097     {
1098     begin = string::npos;
1099     }
1100     }
1101    
1102     return value;
1103     }
1104    
1105 douglas 15 unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned&
1106     begin, bool start, const string& where, map<unsigned, unsigned>&
1107     occurrences)
1108 douglas 1 {
1109 Douglas Thrift 349 unsigned value(0);
1110     bool end(!(word + 1 < words.size()));
1111     unsigned found(where.find(words[word], begin)), newBegin(found +
1112     words[word].length());
1113 douglas 1
1114     if (found != string::npos)
1115     {
1116 Douglas Thrift 349 bool isBefore, isAfter, before(false), after(false);
1117    
1118 douglas 1 isBefore = found - 1 > 0;
1119     isAfter = found + words[word].length() < where.length();
1120    
1121     if (isBefore) before = isalnum(where[found - 1]) != 0;
1122     if (isAfter) after = isalnum(where[found + words[word].length()]) != 0;
1123    
1124     if (!before && !after)
1125     {
1126 Douglas Thrift 349 bool between(true);
1127    
1128 douglas 1 if (!start)
1129     {
1130     for (unsigned index = begin + 1; index < found - 1; index++)
1131     {
1132     if (isalnum(where[index]))
1133     {
1134     between = false;
1135     break;
1136     }
1137     }
1138     }
1139    
1140     if (between)
1141     {
1142     occurrences.insert(pair<unsigned, unsigned>(found,
1143     words[word].length()));
1144    
1145     if (end)
1146     {
1147     begin = newBegin;
1148     value = 1;
1149     }
1150     else
1151     {
1152     value = phrase(words, (word + 1), newBegin, false, where,
1153     occurrences);
1154     }
1155     }
1156     }
1157     }
1158    
1159     if (start)
1160     {
1161     if (found != string::npos)
1162     {
1163     begin = newBegin;
1164     }
1165     else
1166     {
1167     begin = string::npos;
1168     }
1169     }
1170    
1171     return value;
1172     }
1173    
1174     unsigned Ranker::evaluate(vector<unsigned>& ins)
1175     {
1176 Douglas Thrift 349 unsigned in(0);
1177 douglas 1
1178 Douglas Thrift 349 for (unsigned index(0); index < ins.size(); index++)
1179 douglas 1 {
1180     if (ins[index] > 0)
1181     {
1182     in += ins[index];
1183     }
1184     else
1185     {
1186     in = 0;
1187     break;
1188     }
1189     }
1190    
1191     return in;
1192     }
1193    
1194     void Ranker::decrap(string& crap)
1195     {
1196 Douglas Thrift 349 unsigned begin(0), found;
1197    
1198 douglas 1 do
1199     {
1200 douglas 46 // &, _, +, and # are not considered crap
1201     found = crap.find_first_of("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~", begin);
1202 douglas 1
1203     if (found != string::npos)
1204     {
1205     crap[found] = ' ';
1206     }
1207    
1208     begin = found + 1;
1209     }
1210     while (found != string::npos && begin < crap.length());
1211    
1212     normalize(crap);
1213     }

Properties

Name Value
svn:eol-style native
svn:keywords Id