/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission. For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Ranker
//
// Douglas Thrift
//
// $Id$
50 |
|
51 |
#include "Ranker.hpp" |
52 |
|
53 |
// Sorts the raw query terms into the site / allIn restrictions and the
// required, excluded and either-or term lists, then ranks the page with the
// no-argument rank().
//
// query: the query terms in the order they were given. The "allin...:"
//        operators are only honoured as the very first term.
void Ranker::rank(vector<string> query)
{
	vector<string> terms;

	// First pass: peel off the operators and tag field-restricted terms.
	for (size_t position(0); position < query.size(); ++position)
	{
		const string& term(query[position]);
		const bool leading(position == 0);

		if (leading && term == "allintitle:")
		{
			allIn = title;
		}
		else if (leading && term == "allinurl:")
		{
			allIn = url;
		}
		else if (leading && term == "allintext:")
		{
			allIn = text;
		}
		else if (term.size() > 5 && term.compare(0, 5, "site:") == 0)
		{
			site = term.substr(5);
		}
		else if (term.size() > 8 && term.compare(0, 8, "intitle:") == 0)
		{
			terms.push_back("TITLE " + term.substr(8));
		}
		else if (term.size() > 6 && term.compare(0, 6, "inurl:") == 0)
		{
			terms.push_back("URL " + term.substr(6));
		}
		else if (term.size() > 7 && term.compare(0, 7, "intext:") == 0)
		{
			terms.push_back("TEXT " + term.substr(7));
		}
		else
		{
			terms.push_back(term);
		}
	}

	// Second pass: strip a leading '+' or '-' and file each term. While an
	// either-or group is open, every term is appended to it (even a '-' one);
	// the group closes with the first appended term lacking " OR".
	bool groupOpen(false);

	for (size_t position(0); position < terms.size(); ++position)
	{
		string& term(terms[position]);
		bool negated(false);

		if (!term.empty() && term[0] == '+')
		{
			term.erase(0, 1);
		}
		else if (!term.empty() && term[0] == '-')
		{
			negated = true;

			term.erase(0, 1);
		}

		const bool continues(term.find(" OR") != string::npos);

		if (groupOpen)
		{
			groupOpen = continues;

			eitherOr[eitherOr.size() - 1] += ' ' + term;
		}
		else if (negated)
		{
			excluded.push_back(term);
		}
		else if (continues)
		{
			groupOpen = true;

			eitherOr.push_back(term);
		}
		else
		{
			required.push_back(term);
		}
	}

	rank();
}
140 |
|
141 |
void Ranker::setSample() |
142 |
{ |
143 |
map<unsigned, unsigned>::iterator itor; |
144 |
multimap<unsigned, map<unsigned, unsigned>::iterator> distances; |
145 |
|
146 |
for (itor = occurrencesText.begin(); itor != occurrencesText.end(); itor++) |
147 |
{ |
148 |
unsigned distance; |
149 |
|
150 |
if (++itor != occurrencesText.end()) |
151 |
{ |
152 |
unsigned next(itor->first); |
153 |
|
154 |
itor--; |
155 |
|
156 |
distance = next - (itor->first + itor->second); |
157 |
} |
158 |
else |
159 |
{ |
160 |
distance = string::npos; |
161 |
|
162 |
itor--; |
163 |
} |
164 |
|
165 |
distances.insert(pair<unsigned, map<unsigned, |
166 |
unsigned>::iterator>(distance, itor)); |
167 |
} |
168 |
|
169 |
if (distances.begin() != distances.end()) |
170 |
{ |
171 |
itor = distances.begin()->second; |
172 |
} |
173 |
|
174 |
string portion; |
175 |
size_t sampleLength(0), begin(0), end(string::npos); |
176 |
|
177 |
while (sampleLength < sampleMax && itor != occurrencesText.end()) |
178 |
{ |
179 |
unsigned found(itor->first), length(itor->second); |
180 |
|
181 |
for (unsigned index(found); index > begin; index--) |
182 |
{ |
183 |
if (found - index >= sampleMax - sampleLength - length) |
184 |
{ |
185 |
while (index < found) |
186 |
{ |
187 |
if (isspace(getText()[index++])) break; |
188 |
} |
189 |
|
190 |
begin = index; |
191 |
|
192 |
break; |
193 |
} |
194 |
else if ((index > begin ? (isupper(getText()[index]) && |
195 |
!isalnum(getText()[index - 1])) : isupper(getText()[index])) && |
196 |
index != found) |
197 |
{ |
198 |
begin = index; |
199 |
|
200 |
break; |
201 |
} |
202 |
} |
203 |
|
204 |
if (end + 1 != begin) sample += " <strong>...</strong> "; |
205 |
|
206 |
portion = getText().substr(begin, found - begin); |
207 |
|
208 |
sampleLength += portion.length(); |
209 |
|
210 |
entities(portion, '&', "&"); |
211 |
entities(portion, '\"', """); |
212 |
entities(portion, '<', "<"); |
213 |
entities(portion, '>', ">"); |
214 |
|
215 |
sample += portion + "<strong>"; |
216 |
|
217 |
portion = getText().substr(found, length); |
218 |
|
219 |
sampleLength += portion.length(); |
220 |
|
221 |
entities(portion, '&', "&"); |
222 |
entities(portion, '\"', """); |
223 |
entities(portion, '<', "<"); |
224 |
entities(portion, '>', ">"); |
225 |
|
226 |
sample += portion + "</strong>"; |
227 |
|
228 |
begin = found + length; |
229 |
end = begin - 1; |
230 |
|
231 |
if (++itor != occurrencesText.end()) |
232 |
{ |
233 |
if (itor->first + itor->second < begin + sampleMax - sampleLength) |
234 |
{ |
235 |
portion = getText().substr(begin, itor->first - begin); |
236 |
|
237 |
sampleLength += portion.length(); |
238 |
|
239 |
entities(portion, '&', "&"); |
240 |
entities(portion, '\"', """); |
241 |
entities(portion, '<', "<"); |
242 |
entities(portion, '>', ">"); |
243 |
|
244 |
sample += portion; |
245 |
|
246 |
begin = itor->first; |
247 |
end = begin - 1; |
248 |
} |
249 |
else |
250 |
{ |
251 |
for (end = begin + sampleMax - sampleLength; end > begin; |
252 |
end--) |
253 |
{ |
254 |
if (isspace(getText()[end])) break; |
255 |
} |
256 |
|
257 |
portion = getText().substr(begin, end - begin + 1); |
258 |
|
259 |
sampleLength += portion.length(); |
260 |
|
261 |
entities(portion, '&', "&"); |
262 |
entities(portion, '\"', """); |
263 |
entities(portion, '<', "<"); |
264 |
entities(portion, '>', ">"); |
265 |
|
266 |
sample += portion + " <strong>...</strong>"; |
267 |
|
268 |
break; |
269 |
} |
270 |
} |
271 |
else |
272 |
{ |
273 |
for (end = begin + sampleMax - sampleLength; end > begin && (end + |
274 |
1 < getText().length()); end--) |
275 |
{ |
276 |
if (isspace(getText()[end])) break; |
277 |
} |
278 |
|
279 |
if (end >= getText().length()) end = getText().length() - 1; |
280 |
|
281 |
portion = getText().substr(begin, end - begin + 1); |
282 |
|
283 |
sampleLength += portion.length(); |
284 |
|
285 |
entities(portion, '&', "&"); |
286 |
entities(portion, '\"', """); |
287 |
entities(portion, '<', "<"); |
288 |
entities(portion, '>', ">"); |
289 |
|
290 |
sample += portion; |
291 |
|
292 |
if (end + 1 < getText().length()) |
293 |
{ |
294 |
sample += " <strong>...</strong>"; |
295 |
} |
296 |
|
297 |
break; |
298 |
} |
299 |
} |
300 |
|
301 |
if (sample == "") |
302 |
{ |
303 |
for (end = sampleMax; end > 0 && (end + 1 < getText().length()); end--) |
304 |
{ |
305 |
if (isspace(getText()[end])) break; |
306 |
} |
307 |
|
308 |
sample = getText().substr(0, end + 1); |
309 |
|
310 |
entities(sample, '&', "&"); |
311 |
entities(sample, '\"', """); |
312 |
entities(sample, '<', "<"); |
313 |
entities(sample, '>', ">"); |
314 |
|
315 |
if (end + 1 < getText().length()) |
316 |
{ |
317 |
sample += " <strong>...</strong>"; |
318 |
} |
319 |
else if (sample == "") |
320 |
{ |
321 |
sample = "<strong>...</strong>"; |
322 |
} |
323 |
} |
324 |
} |
325 |
|
326 |
// Returns the page title as HTML: every recorded title occurrence
// (occurrencesTitle maps start offset -> length) is wrapped in <strong> tags
// and all title text is escaped through entities().
//
// NOTE(review): the entity replacement strings are restored to "&amp;",
// "&quot;", "&lt;" and "&gt;" here - confirm against entities(), which is
// declared outside this file.
string Ranker::getTitle()
{
	const string source(Page::getTitle());
	string marked, portion;
	string::size_type begin(0);

	for (map<unsigned, unsigned>::iterator itor(occurrencesTitle.begin());
		itor != occurrencesTitle.end(); ++itor)
	{
		const unsigned found(itor->first), length(itor->second);

		// An occurrence that starts inside the previously highlighted one
		// would make found - begin underflow and duplicate text; skip it.
		if (found < begin) continue;

		// Plain text between the previous occurrence and this one.
		portion = source.substr(begin, found - begin);

		entities(portion, '&', "&amp;");
		entities(portion, '\"', "&quot;");
		entities(portion, '<', "&lt;");
		entities(portion, '>', "&gt;");

		marked += portion + "<strong>";

		// The occurrence itself.
		portion = source.substr(found, length);

		entities(portion, '&', "&amp;");
		entities(portion, '\"', "&quot;");
		entities(portion, '<', "&lt;");
		entities(portion, '>', "&gt;");

		marked += portion + "</strong>";

		begin = found + length;
	}

	// Whatever follows the last occurrence.
	portion = source.substr(begin);

	entities(portion, '&', "&amp;");
	entities(portion, '\"', "&quot;");
	entities(portion, '<', "&lt;");
	entities(portion, '>', "&gt;");

	marked += portion;

	return marked;
}
368 |
|
369 |
// Returns the page description as HTML: every recorded description
// occurrence (occurrencesDescription maps start offset -> length) is wrapped
// in <strong> tags and all description text is escaped through entities().
//
// NOTE(review): the entity replacement strings are restored to "&amp;",
// "&quot;", "&lt;" and "&gt;" here - confirm against entities(), which is
// declared outside this file.
string Ranker::getDescription()
{
	const string source(Page::getDescription());
	string marked, portion;
	string::size_type begin(0);

	for (map<unsigned, unsigned>::iterator itor(
		occurrencesDescription.begin()); itor != occurrencesDescription.end();
		++itor)
	{
		const unsigned found(itor->first), length(itor->second);

		// An occurrence that starts inside the previously highlighted one
		// would make found - begin underflow and duplicate text; skip it.
		if (found < begin) continue;

		// Plain text between the previous occurrence and this one.
		portion = source.substr(begin, found - begin);

		entities(portion, '&', "&amp;");
		entities(portion, '\"', "&quot;");
		entities(portion, '<', "&lt;");
		entities(portion, '>', "&gt;");

		marked += portion + "<strong>";

		// The occurrence itself.
		portion = source.substr(found, length);

		entities(portion, '&', "&amp;");
		entities(portion, '\"', "&quot;");
		entities(portion, '<', "&lt;");
		entities(portion, '>', "&gt;");

		marked += portion + "</strong>";

		begin = found + length;
	}

	// Whatever follows the last occurrence.
	portion = source.substr(begin);

	entities(portion, '&', "&amp;");
	entities(portion, '\"', "&quot;");
	entities(portion, '<', "&lt;");
	entities(portion, '>', "&gt;");

	marked += portion;

	return marked;
}
412 |
|
413 |
bool Ranker::operator==(const unsigned number) const |
414 |
{ |
415 |
return value == number; |
416 |
} |
417 |
|
418 |
bool Ranker::operator==(const Ranker& ranker) const |
419 |
{ |
420 |
return value == ranker.value; |
421 |
} |
422 |
|
423 |
bool Ranker::operator!=(const unsigned number) const |
424 |
{ |
425 |
return value != number; |
426 |
} |
427 |
|
428 |
bool Ranker::operator!=(const Ranker& ranker) const |
429 |
{ |
430 |
return value != ranker.value; |
431 |
} |
432 |
|
433 |
bool Ranker::operator<(const unsigned number) const |
434 |
{ |
435 |
return value < number; |
436 |
} |
437 |
|
438 |
bool Ranker::operator<(const Ranker& ranker) const |
439 |
{ |
440 |
return value < ranker.value; |
441 |
} |
442 |
|
443 |
bool Ranker::operator>(const unsigned number) const |
444 |
{ |
445 |
return value > number; |
446 |
} |
447 |
|
448 |
bool Ranker::operator >(const Ranker& ranker) const |
449 |
{ |
450 |
return value > ranker.value; |
451 |
} |
452 |
|
453 |
void Ranker::rank() |
454 |
{ |
455 |
lowerAddress = tolower(getAddress()); |
456 |
|
457 |
if (site == "" || lowerAddress.rfind(site) == lowerAddress.length() - |
458 |
site.length()) |
459 |
{ |
460 |
bool isRequired(required.size() > 0), isExcluded(excluded.size() > 0), |
461 |
isEitherOr(eitherOr.size() > 0); |
462 |
|
463 |
lowerURL = tolower(getURL()); |
464 |
lowerTitle = tolower(Page::getTitle()); |
465 |
lowerText = tolower(Page::getText()); |
466 |
|
467 |
if (isRequired) checkRequired(); |
468 |
if (isExcluded && (isRequired || isEitherOr)) checkExcluded(); |
469 |
if (isEitherOr) checkEitherOr(); |
470 |
|
471 |
if (isRequired && isExcluded && isEitherOr) |
472 |
{ |
473 |
value += requiredValue && !excludedValue && eitherOrValue ? |
474 |
requiredValue + eitherOrValue : 0; |
475 |
} |
476 |
else if (isRequired && isExcluded) |
477 |
{ |
478 |
value += requiredValue && !excludedValue ? requiredValue : 0; |
479 |
} |
480 |
else if (isRequired && isEitherOr) |
481 |
{ |
482 |
value += requiredValue && eitherOrValue ? requiredValue + |
483 |
eitherOrValue : 0; |
484 |
} |
485 |
else if (isExcluded && isEitherOr) |
486 |
{ |
487 |
value += !excludedValue && eitherOrValue ? eitherOrValue : 0; |
488 |
} |
489 |
else if (isRequired) |
490 |
{ |
491 |
value += requiredValue; |
492 |
} |
493 |
else if (isEitherOr) |
494 |
{ |
495 |
value += eitherOrValue; |
496 |
} |
497 |
else |
498 |
{ |
499 |
// do nothing this is a bad search and warrants no results |
500 |
} |
501 |
|
502 |
if (value > 0) |
503 |
{ |
504 |
string lowerDescription(tolower(Page::getDescription())); |
505 |
|
506 |
for (unsigned index(0); index < required.size(); index++) |
507 |
{ |
508 |
if (required[index].find("URL ") == 0) |
509 |
{ |
510 |
value += find(required[index].substr(4), lowerDescription, |
511 |
occurrencesDescription); |
512 |
} |
513 |
else if (required[index].find("TITLE ") == 0) |
514 |
{ |
515 |
value += find(required[index].substr(6), lowerDescription, |
516 |
occurrencesDescription); |
517 |
} |
518 |
else if (required[index].find("TEXT ") == 0) |
519 |
{ |
520 |
value += find(required[index].substr(5), lowerDescription, |
521 |
occurrencesDescription); |
522 |
} |
523 |
else |
524 |
{ |
525 |
value += find(required[index], lowerDescription, |
526 |
occurrencesDescription); |
527 |
} |
528 |
} |
529 |
|
530 |
for (unsigned index1(0); index1 < eitherOr.size(); index1++) |
531 |
{ |
532 |
vector<string> words; |
533 |
unsigned begin(0), found; |
534 |
|
535 |
do |
536 |
{ |
537 |
found = eitherOr[index1].find(" OR ", begin); |
538 |
|
539 |
if (found != string::npos) |
540 |
{ |
541 |
words.push_back(eitherOr[index1].substr(begin, found - |
542 |
begin)); |
543 |
} |
544 |
else |
545 |
{ |
546 |
words.push_back(eitherOr[index1].substr(begin)); |
547 |
} |
548 |
|
549 |
begin = found + 4; |
550 |
} |
551 |
while (begin < eitherOr[index1].length() && found != |
552 |
string::npos); |
553 |
|
554 |
for (unsigned number(0); number < words.size(); number++) |
555 |
{ |
556 |
if (words[index1].find("URL ") == 0) |
557 |
{ |
558 |
value += find(words[index1].substr(4), |
559 |
lowerDescription, occurrencesDescription); |
560 |
} |
561 |
else if (words[index1].find("TITLE ") == 0) |
562 |
{ |
563 |
value += find(words[index1].substr(6), |
564 |
lowerDescription, occurrencesDescription); |
565 |
} |
566 |
else if (words[index1].find("TEXT ") == 0) |
567 |
{ |
568 |
value += find(words[index1].substr(5), |
569 |
lowerDescription, occurrencesDescription); |
570 |
} |
571 |
else |
572 |
{ |
573 |
value += find(words[index1], lowerDescription, |
574 |
occurrencesDescription); |
575 |
} |
576 |
} |
577 |
} |
578 |
|
579 |
for (unsigned index2(0); index2 < getHeadings().size(); index2++) |
580 |
{ |
581 |
string lowerHeading = string(getHeadings()[index2].length(), |
582 |
' '); |
583 |
|
584 |
for (unsigned number(0); number < |
585 |
getHeadings()[index2].length(); number++) |
586 |
{ |
587 |
lowerHeading[number] = tolower( |
588 |
getHeadings()[index2][number]); |
589 |
} |
590 |
|
591 |
for (unsigned number0(0); number0 < required.size(); number0++) |
592 |
{ |
593 |
if (required[number0].find("URL ") == 0) |
594 |
{ |
595 |
value += find(required[number0].substr(4), |
596 |
lowerHeading); |
597 |
} |
598 |
else if (required[number0].find("TITLE ") == 0) |
599 |
{ |
600 |
value += find(required[number0].substr(6), |
601 |
lowerHeading); |
602 |
} |
603 |
else if (required[number0].find("TEXT ") == 0) |
604 |
{ |
605 |
value += find(required[number0].substr(5), |
606 |
lowerHeading); |
607 |
} |
608 |
else |
609 |
{ |
610 |
value += find(required[number0], lowerHeading); |
611 |
} |
612 |
} |
613 |
|
614 |
for (unsigned number1(0); number1 < eitherOr.size(); number1++) |
615 |
{ |
616 |
vector<string> words; |
617 |
unsigned begin(0), found; |
618 |
|
619 |
do |
620 |
{ |
621 |
found = eitherOr[number1].find(" OR ", begin); |
622 |
|
623 |
if (found != string::npos) |
624 |
{ |
625 |
words.push_back(eitherOr[number1].substr(begin, |
626 |
found - begin)); |
627 |
} |
628 |
else |
629 |
{ |
630 |
words.push_back(eitherOr[number1].substr(begin)); |
631 |
} |
632 |
|
633 |
begin = found + 4; |
634 |
} |
635 |
while (begin < eitherOr[number1].length() && found != |
636 |
string::npos); |
637 |
|
638 |
for (unsigned number(0); number < words.size(); number++) |
639 |
{ |
640 |
if (words[number].find("URL ") == 0) |
641 |
{ |
642 |
value += find(words[number].substr(4), |
643 |
lowerHeading); |
644 |
} |
645 |
else if (words[number].find("TITLE ") == 0) |
646 |
{ |
647 |
value += find(words[number].substr(6), |
648 |
lowerHeading); |
649 |
} |
650 |
else if (words[number].find("TEXT ") == 0) |
651 |
{ |
652 |
value += find(words[number].substr(5), |
653 |
lowerHeading); |
654 |
} |
655 |
else |
656 |
{ |
657 |
value += find(words[number], lowerHeading); |
658 |
} |
659 |
} |
660 |
} |
661 |
} |
662 |
} |
663 |
} |
664 |
} |
665 |
|
666 |
void Ranker::checkRequired() |
667 |
{ |
668 |
vector<unsigned> inURLs, inTitles, inTexts; |
669 |
|
670 |
for (unsigned index(0); index < required.size(); index++) |
671 |
{ |
672 |
unsigned inURL(0), inTitle(0), inText(0); |
673 |
|
674 |
if (required[index].find("URL ") == 0) |
675 |
{ |
676 |
inURL = find(required[index].substr(4), lowerURL.substr(7)); |
677 |
|
678 |
if (inURL) |
679 |
{ |
680 |
inTitle = find(required[index].substr(4), lowerTitle, |
681 |
occurrencesTitle); |
682 |
inText = find(required[index].substr(4), lowerText, |
683 |
occurrencesText); |
684 |
|
685 |
if (!inTitle) inTitle++; |
686 |
if (!inText) inText++; |
687 |
} |
688 |
} |
689 |
else if (required[index].find("TITLE ") == 0) |
690 |
{ |
691 |
inTitle = find(required[index].substr(6), lowerTitle, |
692 |
occurrencesTitle); |
693 |
|
694 |
if (inTitle) |
695 |
{ |
696 |
inURL = find(required[index].substr(6), lowerURL.substr(7)); |
697 |
inText = find(required[index].substr(6), lowerText, |
698 |
occurrencesText); |
699 |
|
700 |
if (!inURL) inURL++; |
701 |
if (!inText) inText++; |
702 |
} |
703 |
} |
704 |
else if (required[index].find("TEXT ") == 0) |
705 |
{ |
706 |
inText = find(required[index].substr(5), lowerText, |
707 |
occurrencesText); |
708 |
|
709 |
if (inText) |
710 |
{ |
711 |
inURL = find(required[index].substr(5), lowerURL.substr(7)); |
712 |
inTitle = find(required[index].substr(5), lowerTitle, |
713 |
occurrencesTitle); |
714 |
|
715 |
if (!inURL) inURL++; |
716 |
if (!inTitle) inTitle++; |
717 |
} |
718 |
} |
719 |
else |
720 |
{ |
721 |
inURL = find(required[index], lowerURL.substr(7)); |
722 |
inTitle = find(required[index], lowerTitle, occurrencesTitle); |
723 |
inText = find(required[index], lowerText, occurrencesText); |
724 |
} |
725 |
|
726 |
inURLs.push_back(inURL); |
727 |
inTitles.push_back(inTitle); |
728 |
inTexts.push_back(inText); |
729 |
} |
730 |
|
731 |
unsigned inURL(evaluate(inURLs)), inTitle(evaluate(inTitles)), |
732 |
inText(evaluate(inTexts)); |
733 |
|
734 |
requiredValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) |
735 |
|| (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + |
736 |
inText : 0; |
737 |
} |
738 |
|
739 |
void Ranker::checkExcluded() |
740 |
{ |
741 |
vector<unsigned> inURLs, inTitles, inTexts; |
742 |
|
743 |
for (unsigned index(0); index < excluded.size(); index++) |
744 |
{ |
745 |
unsigned inURL(0), inTitle(0), inText(0); |
746 |
|
747 |
inURL = find(excluded[index], lowerURL.substr(7)); |
748 |
inTitle = find(excluded[index], lowerTitle); |
749 |
inText = find(excluded[index], lowerText); |
750 |
|
751 |
inURLs.push_back(inURL); |
752 |
inTitles.push_back(inTitle); |
753 |
inTexts.push_back(inText); |
754 |
} |
755 |
|
756 |
unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles), |
757 |
inText(evaluate(inTexts)); |
758 |
|
759 |
excludedValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) |
760 |
|| (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + |
761 |
inText : 0; |
762 |
} |
763 |
|
764 |
void Ranker::checkEitherOr() |
765 |
{ |
766 |
vector<unsigned> inURLs, inTitles, inTexts; |
767 |
|
768 |
for (unsigned index(0); index < eitherOr.size(); index++) |
769 |
{ |
770 |
vector<unsigned> inURLz, inTitlez, inTextz; |
771 |
unsigned inURL(0), inTitle(0), inText(0); |
772 |
vector<string> words; |
773 |
unsigned begin(0), found; |
774 |
|
775 |
do |
776 |
{ |
777 |
found = eitherOr[index].find(" OR ", begin); |
778 |
|
779 |
if (found != string::npos) |
780 |
{ |
781 |
words.push_back(eitherOr[index].substr(begin, found - begin)); |
782 |
} |
783 |
else |
784 |
{ |
785 |
words.push_back(eitherOr[index].substr(begin)); |
786 |
} |
787 |
|
788 |
begin = found + 4; |
789 |
} |
790 |
while (begin < eitherOr[index].length() && found != string::npos); |
791 |
|
792 |
for (unsigned number(0); number < words.size(); number++) |
793 |
{ |
794 |
unsigned inURL(0), inTitle(0), inText(0); |
795 |
|
796 |
if (words[number].find("URL ") == 0) |
797 |
{ |
798 |
inURL = find(words[number].substr(4), lowerURL.substr(7)); |
799 |
|
800 |
if (inURL) |
801 |
{ |
802 |
inTitle = find(words[number].substr(4), lowerTitle, |
803 |
occurrencesTitle); |
804 |
inText = find(words[number].substr(4), lowerText, |
805 |
occurrencesText); |
806 |
|
807 |
if (!inTitle) inTitle++; |
808 |
if (!inText) inText++; |
809 |
} |
810 |
} |
811 |
else if (words[number].find("TITLE ") == 0) |
812 |
{ |
813 |
inTitle = find(words[number].substr(6), lowerTitle, |
814 |
occurrencesTitle); |
815 |
|
816 |
if (inTitle) |
817 |
{ |
818 |
inURL = find(words[number].substr(6), lowerURL.substr(7)); |
819 |
inText = find(words[number].substr(6), lowerText, |
820 |
occurrencesText); |
821 |
|
822 |
if (!inURL) inURL++; |
823 |
if (!inText) inText++; |
824 |
} |
825 |
} |
826 |
else if (words[number].find("TEXT ") == 0) |
827 |
{ |
828 |
inText = find(words[number].substr(5), lowerText, |
829 |
occurrencesText); |
830 |
|
831 |
if (inText) |
832 |
{ |
833 |
inURL = find(words[number].substr(5), lowerURL.substr(7)); |
834 |
inTitle = find(words[number].substr(5), lowerTitle, |
835 |
occurrencesTitle); |
836 |
|
837 |
if (!inURL) inURL++; |
838 |
if (!inTitle) inTitle++; |
839 |
} |
840 |
} |
841 |
else |
842 |
{ |
843 |
inURL = find(words[number], lowerURL.substr(7)); |
844 |
inTitle = find(words[number], lowerTitle, occurrencesTitle); |
845 |
inText = find(words[number], lowerText, occurrencesText); |
846 |
} |
847 |
|
848 |
inURLz.push_back(inURL); |
849 |
inTitlez.push_back(inTitle); |
850 |
inTextz.push_back(inText); |
851 |
} |
852 |
|
853 |
for (unsigned number0(0); number0 < inURLz.size(); number0++) |
854 |
{ |
855 |
inURL += inURLz[number0]; |
856 |
} |
857 |
|
858 |
for (unsigned number1(0); number1 < inTitlez.size(); number1++) |
859 |
{ |
860 |
inTitle += inTitlez[number1]; |
861 |
} |
862 |
|
863 |
for (unsigned number2(0); number2 < inTextz.size(); number2++) |
864 |
{ |
865 |
inText += inTextz[number2]; |
866 |
} |
867 |
|
868 |
inURLs.push_back(inURL); |
869 |
inTitles.push_back(inTitle); |
870 |
inTexts.push_back(inText); |
871 |
|
872 |
inURLz.clear(); |
873 |
inTitlez.clear(); |
874 |
inTextz.clear(); |
875 |
words.clear(); |
876 |
} |
877 |
|
878 |
unsigned inURL(evaluate(inURLs)), inTitle = evaluate(inTitles), |
879 |
inText(evaluate(inTexts)); |
880 |
|
881 |
eitherOrValue += (inURL && (allIn == url)) || (inTitle && (allIn == title)) |
882 |
|| (inText && ((allIn == text) || (allIn == all))) ? inURL + inTitle + |
883 |
inText : 0; |
884 |
} |
885 |
|
886 |
// Counts the whole-word occurrences of `word` in `where`.
//
// word:  the search term; it is cleaned with decrap() first. A term
//        containing whitespace is handed to phrase().
// where: the (already lower-cased) text to search.
// Returns the number of matches whose neighbouring characters are not
// alphanumeric; a term that is empty after cleaning counts as one match.
unsigned Ranker::find(string word, const string& where)
{
	decrap(word);

	if (word == "")
	{
		// this can happen if a word is all crap characters
		return 1;
	}

	if (word.find_first_of(" \n\t") != string::npos)
	{
		return phrase(word, where);
	}

	unsigned value(0);
	// size_type, not unsigned: a truncated npos never compares equal to
	// string::npos where size_t is wider than unsigned.
	string::size_type begin(0), found;

	while (begin < where.length() && (found = where.find(word, begin)) !=
		string::npos)
	{
		const string::size_type after(found + word.length());

		// A match at offset 0 has no character before it, and a match that
		// ends the text has none after it.
		const bool before(found > 0 && isalnum(static_cast<unsigned char>(
			where[found - 1])) != 0);
		const bool behind(after < where.length() && isalnum(
			static_cast<unsigned char>(where[after])) != 0);

		if (!before && !behind)
		{
			value++;
		}

		begin = after;
	}

	return value;
}
932 |
|
933 |
// Counts the whole-word occurrences of `word` in `where` and records each
// match for highlighting.
//
// word:        the search term; it is cleaned with decrap() first. A term
//              containing whitespace is handed to phrase().
// where:       the (already lower-cased) text to search.
// occurrences: receives (start offset, length) for every counted match.
// Returns the number of matches whose neighbouring characters are not
// alphanumeric; a term that is empty after cleaning counts as one match.
unsigned Ranker::find(string word, const string& where, map<unsigned,
	unsigned>& occurrences)
{
	decrap(word);

	if (word == "")
	{
		// this can happen if a word is all crap characters
		return 1;
	}

	// Same whitespace set as the overload without occurrences.
	if (word.find_first_of(" \n\t") != string::npos)
	{
		return phrase(word, where, occurrences);
	}

	unsigned value(0);
	// size_type, not unsigned: a truncated npos never compares equal to
	// string::npos where size_t is wider than unsigned.
	string::size_type begin(0), found;

	while (begin < where.length() && (found = where.find(word, begin)) !=
		string::npos)
	{
		const string::size_type after(found + word.length());

		// A match at offset 0 has no character before it, and a match that
		// ends the text has none after it.
		const bool before(found > 0 && isalnum(static_cast<unsigned char>(
			where[found - 1])) != 0);
		const bool behind(after < where.length() && isalnum(
			static_cast<unsigned char>(where[after])) != 0);

		if (!before && !behind)
		{
			value++;

			occurrences.insert(pair<unsigned, unsigned>(
				static_cast<unsigned>(found),
				static_cast<unsigned>(word.length())));
		}

		begin = after;
	}

	return value;
}
983 |
|
984 |
unsigned Ranker::phrase(const string& phrase, const string& where) |
985 |
{ |
986 |
unsigned value(0); |
987 |
vector<string> words; |
988 |
unsigned begin(0), space; |
989 |
|
990 |
do |
991 |
{ |
992 |
space = phrase.find(' ', begin); |
993 |
|
994 |
words.push_back(phrase.substr(begin, space - begin)); |
995 |
|
996 |
begin = space + 1; |
997 |
} |
998 |
while (space != string::npos && begin < phrase.length()); |
999 |
|
1000 |
begin = 0; |
1001 |
|
1002 |
unsigned counter(0); |
1003 |
|
1004 |
do |
1005 |
{ |
1006 |
value += this->phrase(words, 0, begin, true, where); |
1007 |
} |
1008 |
while (begin < where.length()); |
1009 |
|
1010 |
return value; |
1011 |
} |
1012 |
|
1013 |
unsigned Ranker::phrase(const string& phrase, const string& where, |
1014 |
map<unsigned, unsigned>& occurrences) |
1015 |
{ |
1016 |
unsigned value(0); |
1017 |
vector<string> words; |
1018 |
unsigned begin(0), space; |
1019 |
|
1020 |
do |
1021 |
{ |
1022 |
space = phrase.find(' ', begin); |
1023 |
|
1024 |
words.push_back(phrase.substr(begin, space - begin)); |
1025 |
|
1026 |
begin = space + 1; |
1027 |
} |
1028 |
while (space != string::npos && begin < phrase.length()); |
1029 |
|
1030 |
begin = 0; |
1031 |
|
1032 |
do |
1033 |
{ |
1034 |
value += this->phrase(words, 0, begin, true, where, occurrences); |
1035 |
} |
1036 |
while (begin < where.length()); |
1037 |
|
1038 |
return value; |
1039 |
} |
1040 |
|
1041 |
unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned& |
1042 |
begin, bool start, const string& where) |
1043 |
{ |
1044 |
unsigned value(0); |
1045 |
bool end(!(word + 1 < words.size())); |
1046 |
unsigned found(where.find(words[word], begin)), newBegin(found + |
1047 |
words[word].length()); |
1048 |
|
1049 |
if (found != string::npos) |
1050 |
{ |
1051 |
bool isBefore, isAfter, before(false), after(false); |
1052 |
|
1053 |
isBefore = found - 1 > 0; |
1054 |
isAfter = found + words[word].length() < where.length(); |
1055 |
|
1056 |
if (isBefore) before = isalnum(where[found - 1]) != 0; |
1057 |
if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; |
1058 |
|
1059 |
if (!before && !after) |
1060 |
{ |
1061 |
bool between(true); |
1062 |
|
1063 |
if (!start) |
1064 |
{ |
1065 |
for (unsigned index = begin + 1; index < found - 1; index++) |
1066 |
{ |
1067 |
if (isalnum(where[index])) |
1068 |
{ |
1069 |
between = false; |
1070 |
break; |
1071 |
} |
1072 |
} |
1073 |
} |
1074 |
|
1075 |
if (between) |
1076 |
{ |
1077 |
if (end) |
1078 |
{ |
1079 |
begin = newBegin; |
1080 |
value = 1; |
1081 |
} |
1082 |
else |
1083 |
{ |
1084 |
value = phrase(words, (word + 1), newBegin, false, where); |
1085 |
} |
1086 |
} |
1087 |
} |
1088 |
} |
1089 |
|
1090 |
if (start) |
1091 |
{ |
1092 |
if (found != string::npos) |
1093 |
{ |
1094 |
begin = newBegin; |
1095 |
} |
1096 |
else |
1097 |
{ |
1098 |
begin = string::npos; |
1099 |
} |
1100 |
} |
1101 |
|
1102 |
return value; |
1103 |
} |
1104 |
|
1105 |
unsigned Ranker::phrase(const vector<string>& words, unsigned word, unsigned& |
1106 |
begin, bool start, const string& where, map<unsigned, unsigned>& |
1107 |
occurrences) |
1108 |
{ |
1109 |
unsigned value(0); |
1110 |
bool end(!(word + 1 < words.size())); |
1111 |
unsigned found(where.find(words[word], begin)), newBegin(found + |
1112 |
words[word].length()); |
1113 |
|
1114 |
if (found != string::npos) |
1115 |
{ |
1116 |
bool isBefore, isAfter, before(false), after(false); |
1117 |
|
1118 |
isBefore = found - 1 > 0; |
1119 |
isAfter = found + words[word].length() < where.length(); |
1120 |
|
1121 |
if (isBefore) before = isalnum(where[found - 1]) != 0; |
1122 |
if (isAfter) after = isalnum(where[found + words[word].length()]) != 0; |
1123 |
|
1124 |
if (!before && !after) |
1125 |
{ |
1126 |
bool between(true); |
1127 |
|
1128 |
if (!start) |
1129 |
{ |
1130 |
for (unsigned index = begin + 1; index < found - 1; index++) |
1131 |
{ |
1132 |
if (isalnum(where[index])) |
1133 |
{ |
1134 |
between = false; |
1135 |
break; |
1136 |
} |
1137 |
} |
1138 |
} |
1139 |
|
1140 |
if (between) |
1141 |
{ |
1142 |
occurrences.insert(pair<unsigned, unsigned>(found, |
1143 |
words[word].length())); |
1144 |
|
1145 |
if (end) |
1146 |
{ |
1147 |
begin = newBegin; |
1148 |
value = 1; |
1149 |
} |
1150 |
else |
1151 |
{ |
1152 |
value = phrase(words, (word + 1), newBegin, false, where, |
1153 |
occurrences); |
1154 |
} |
1155 |
} |
1156 |
} |
1157 |
} |
1158 |
|
1159 |
if (start) |
1160 |
{ |
1161 |
if (found != string::npos) |
1162 |
{ |
1163 |
begin = newBegin; |
1164 |
} |
1165 |
else |
1166 |
{ |
1167 |
begin = string::npos; |
1168 |
} |
1169 |
} |
1170 |
|
1171 |
return value; |
1172 |
} |
1173 |
|
1174 |
// Adds up the hit counts in `ins`, but only if every one of them is
// non-zero: a single zero entry makes the result zero. An empty list also
// gives zero.
unsigned Ranker::evaluate(vector<unsigned>& ins)
{
	unsigned total(0);

	for (vector<unsigned>::const_iterator hits(ins.begin()); hits != ins.end();
		++hits)
	{
		if (*hits == 0)
		{
			return 0;
		}

		total += *hits;
	}

	return total;
}
1193 |
|
1194 |
// Replaces every punctuation character listed below with a space, in place,
// and then passes the string to normalize().
//
// crap: the search term to clean; modified in place.
void Ranker::decrap(string& crap)
{
	// &, _, +, and # are not considered crap
	static const char* const punctuation("!\"$%\'()*,-./:;<=>?@[\\]^`{|}~");

	// size_type, not unsigned: with a truncated npos the loop would write to
	// crap[found] out of bounds where size_t is wider than unsigned.
	for (string::size_type found(crap.find_first_of(punctuation)); found !=
		string::npos; found = crap.find_first_of(punctuation, found + 1))
	{
		crap[found] = ' ';
	}

	normalize(crap);
}