ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 15
Committed: 2002-12-09T09:46:18-08:00 (22 years, 6 months ago) by douglas
File size: 11976 byte(s)
Log Message:
Figured out and fixed fred problems.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Processor
46 //
47 // Douglas Thrift
48 //
49 // Processor.cpp
50
51 #include "Processor.h"
52
53 Processor::Processor()
54 {
55 page = new Page();
56 }
57
// Release the Page allocated by the constructor (or by reset()).
Processor::~Processor()
{
	delete page;
}
62
63 bool Processor::process(HttpHandler& http, URL& url)
64 {
65 string title, description, text;
66 vector<string> headings;
67
68 if (html(http))
69 {
70 if (!process(http, url, title, description, text, headings)) return
71 false;
72
73 entities(title, "&nbsp;", ' ');
74 entities(title, "&lt;", '<');
75 entities(title, "&gt;", '>');
76 entities(title, "&quot;", '\"');
77 entities(title, "&amp;", '&');
78
79 entities(description, "&nbsp;", ' ');
80 entities(description, "&lt;", '<');
81 entities(description, "&gt;", '>');
82 entities(description, "&quot;", '\"');
83 entities(description, "&amp;", '&');
84
85 entities(text, "&nbsp;", ' ');
86 entities(text, "&lt;", '<');
87 entities(text, "&gt;", '>');
88 entities(text, "&quot;", '\"');
89 entities(text, "&amp;", '&');
90
91 for (int index = 0; index < headings.size(); index++)
92 {
93 entities(headings[index], "&nbsp;", ' ');
94 entities(headings[index], "&lt;", '<');
95 entities(headings[index], "&gt;", '>');
96 entities(headings[index], "&quot;", '\"');
97 entities(headings[index], "&amp;", '&');
98 }
99
100 normalize(title);
101 normalize(description);
102 normalize(text);
103 for (int index0 = 0; index0 < headings.size(); index0++)
104 {
105 normalize(headings[index0]);
106 }
107 }
108 else
109 {
110 bool knowSize = page->getSize() > 0;
111
112 string line;
113 while (http.good())
114 {
115 http.getline(line);
116
117 text += line + "\n";
118
119 if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
120 }
121
122 normalize(text);
123 }
124
125 page->setURL(url);
126 page->setTitle(title);
127 page->setDescription(description);
128 page->setText(text);
129 page->setHeadings(headings);
130
131 return true;
132 }
133
134 void Processor::reset()
135 {
136 links.clear();
137 delete page;
138 page = new Page();
139 }
140
141 bool Processor::process(HttpHandler& http, URL& url, string& title, string&
142 description, string& text, vector<string>& headings)
143 {
144 bool inHtml = false, inHead = false, inTitle = false, inBody = false,
145 inHeading = false, inComment = false, knowSize = page->getSize() > 0,
146 follow = true, answer = true;
147 unsigned startComment = 0, finishComment = 0;
148 string line;
149 while (http.good())
150 {
151 http.getline(line);
152 string heading;
153
154 unsigned begin = 0;
155 while (begin < line.length())
156 {
157 unsigned open = line.find('<', begin);
158 unsigned close = line.find('>', begin);
159
160 string next;
161 while (close == string::npos)
162 {
163 http.getline(next);
164 line += '\n' + next;
165 close = line.find('>', begin);
166 }
167
168 // strangely this is necessary sometimes
169 if (open == string::npos) open = line.find('<', begin);
170
171 string between = line.substr(begin, open - begin);
172 string tag = getTag(line, open, close);
173 string lowerTag(tag.length(), ' ');
174
175 for (unsigned index = 0; index < tag.length(); index++)
176 {
177 lowerTag[index] = tolower(tag[index]);
178 }
179
180 if (inHtml && !inComment)
181 {
182 if (inHead && inTitle)
183 {
184 title += between + "\n";
185 }
186
187 if (inBody)
188 {
189 text += between + "\n";
190 }
191
192 if (inBody && inHeading)
193 {
194 heading += between + "\n";
195 }
196 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
197 == 0) || (lowerTag.find("meta ") == 0)) && inHead)
198 {
199 if (lowerTag.find("name=robots") != string::npos ||
200 lowerTag.find("name=\"robots\"") != string::npos)
201 {
202 unsigned start = lowerTag.find("content=\"") + 9;
203 unsigned finish = lowerTag.find('\"', start);
204
205 string robots = lowerTag.substr(start, finish - start);
206
207 if ((robots.find("noindex") != string::npos &&
208 robots.find("nofollow") != string::npos) ||
209 robots.find("none") != string::npos)
210 {
211 answer = false;
212 follow = false;
213 links.clear();
214
215 return answer;
216 }
217 else if (robots.find("noindex") != string::npos)
218 {
219 answer = false;
220 }
221 else if (robots.find("nofollow") != string::npos)
222 {
223 follow = false;
224 links.clear();
225 }
226 }
227 else if (lowerTag.find("name=description") != string::npos
228 || lowerTag.find("name=\"description\"") !=
229 string::npos)
230 {
231 unsigned start = lowerTag.find("content=\"") + 9;
232 unsigned finish = lowerTag.find('\"', start);
233
234 description = tag.substr(start, finish - start);
235 }
236 }
237
238 if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
239 || (lowerTag.find("a ") == 0)) && inBody && follow)
240 {
241 if (lowerTag.find("href=\"") != string::npos)
242 {
243 unsigned start = lowerTag.find("href=\"") + 6;
244 unsigned finish = lowerTag.find('\"', start);
245
246 string link = getLink(tag.substr(start, finish -
247 start), url);
248
249 if (link != "bad link") links.insert(link);
250 }
251 else if (lowerTag.find("href=") != string::npos)
252 {
253 unsigned start = lowerTag.find("href=") + 5;
254 unsigned finish = lowerTag.find(' ', start);
255
256 if (finish < close)
257 {
258 string link = getLink(tag.substr(start, finish -
259 start), url);
260
261 if (link != "bad link") links.insert(link);
262 }
263 else
264 {
265 string link = getLink(tag.substr(start, close -
266 start), url);
267
268 if (link != "bad link") links.insert(link);
269 }
270 }
271 }
272
273 if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
274 0) || (lowerTag.find("img ")) && inBody)
275 {
276 if (lowerTag.find("alt=\"") != string::npos)
277 {
278 unsigned start = lowerTag.find("alt=\"") + 5;
279 unsigned finish = lowerTag.find('\"', start);
280
281 text += tag.substr(start, finish - start) + ' ';
282 if (inHeading) heading += tag.substr(start, finish -
283 start) + ' ';
284 }
285 else if (lowerTag.find("alt=") != string::npos)
286 {
287 unsigned start = lowerTag.find("alt=") + 4;
288 unsigned finish = lowerTag.find(' ', start);
289
290 if (finish < close)
291 {
292 text += tag.substr(start, finish - start) + ' ';
293 if (inHeading) heading += tag.substr(start, finish
294 - start) + ' ';
295 }
296 else
297 {
298 text += tag.substr(start, close - start) + ' ';
299 if (inHeading) heading += tag.substr(start, close -
300 start) + ' ';
301 }
302 }
303 }
304 }
305
306 if (lowerTag.find("html") == 0) inHtml = true;
307 if (lowerTag.find("/html") == 0) inHtml = false;
308
309 if (lowerTag.find("head") == 0) inHead = true;
310 if (lowerTag.find("/head") == 0) inHead = false;
311
312 if (lowerTag.find("title") == 0) inTitle = true;
313 if (lowerTag.find("/title") == 0) inTitle = false;
314
315 if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
316 inBody = true;
317 if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
318 inBody = false;
319
320 if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
321 lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
322 lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
323 inHeading = true;
324 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
325 lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
326 lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
327 {
328 if (heading != "") headings.push_back(heading);
329 inHeading = false;
330 }
331
332 if (lowerTag.find("!--") == 0)
333 {
334 startComment = open;
335 inComment = true;
336 }
337 if (line.find("-->", begin) >= startComment && line.find("-->",
338 begin) != string::npos)
339 {
340 finishComment = line.find("-->", begin) + 3;
341 inComment = false;
342 }
343
344 if (close == string::npos)
345 {
346 begin = close;
347 }
348 else
349 {
350 begin = close + 1;
351 }
352 }
353
354 startComment = 0;
355 finishComment = 0;
356
357 if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
358 }
359
360 return answer;
361 }
362
363 bool Processor::html(HttpHandler& http)
364 {
365 bool answer = false;
366
367 string line;
368 http.getline(line);
369
370 while (http.good())
371 {
372 string field;
373 http.getline(field, ' ');
374 if (field == "") break;
375 http.getline(line);
376
377 if (field == "Content-Type:" || field == "Content-type:")
378 {
379 if (line.find("text/html") != string::npos)
380 {
381 answer = true;
382 }
383 }
384
385 if (field == "Content-Length:" || field == "Content-length:")
386 {
387 page->setSize(strtoul(line.c_str(), 0, 0));
388 }
389 }
390
391 return answer;
392 }
393
394 string Processor::getTag(const string& line, unsigned open, unsigned close)
395 {
396 string tag = line.substr(open + 1, close - open - 1);
397
398 return tag;
399 }
400
// Resolve a raw href value against the page's URL, producing an absolute
// http:// hyperlink.  Fragments are stripped; mailto:, empty, and
// non-http schemes yield the sentinel "bad link", which callers use to
// skip the link.
//
// link - the raw href attribute value (taken by value; modified locally)
// url  - the URL of the page the link appeared on
string Processor::getLink(string link, URL& url)
{
	string hyperlink = "bad link";

	// Drop any fragment: erase from '#' to the end.
	if (link.find('#') != string::npos)
	{
		unsigned pound = link.find('#');
		link.erase(pound);
	}

	if (link.find("://") != string::npos)
	{
		// Absolute URL: only http:// is accepted.
		if (link.find("http://") == 0) hyperlink = link;
	}
	else if (link.find("mailto:") == 0)
	{
		// do nothing we are not evil spammers!
	}
	else if (link.find("//") == 0)
	{
		// Scheme-relative link: assume http.
		hyperlink = "http:" + link;
	}
	else if (link.find('/') == 0)
	{
		// Host-relative link: keep scheme+host from the page URL (the
		// '/' search starts at 7 to skip past "http://").
		hyperlink = url.getURL();

		unsigned path = hyperlink.find('/', 7);
		hyperlink.erase(path);

		hyperlink += link;
	}
	else if (link == "")
	{
		// a blank link is useless
	}
	else
	{
		// Document-relative link: start from the page URL minus its
		// path, then rebuild the path from the page's directory.
		hyperlink = url.getURL();
		string path = url.getPath();

		unsigned cutoff = hyperlink.rfind(path);
		hyperlink.erase(cutoff);

		// Trim the path to its directory (keep the trailing '/').
		unsigned dir = path.rfind('/') + 1;
		path.erase(dir);

		// Each leading "../" pops one directory off the path.
		while (link.find("../") == 0)
		{
			unsigned dot = path.rfind('/') - 1;
			unsigned up = path.rfind('/', dot) + 1;

			path.erase(up);
			link.erase(0, 3);
		}
		// Leading "./" segments are no-ops; strip them.
		while (link.find("./") == 0)
		{
			link.erase(0, 2);
		}

		hyperlink += path + link;
	}

	return hyperlink;
}