ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 15
Committed: 2002-12-09T09:46:18-08:00 (22 years, 6 months ago) by douglas
File size: 11976 byte(s)
Log Message:
Figured out and fixed fred problems.

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Processor
46     //
47     // Douglas Thrift
48     //
49     // Processor.cpp
50    
51     #include "Processor.h"
52    
53     Processor::Processor()
54     {
55     page = new Page();
56     }
57    
58     Processor::~Processor()
59     {
60     delete page;
61     }
62    
63     bool Processor::process(HttpHandler& http, URL& url)
64     {
65     string title, description, text;
66     vector<string> headings;
67    
68     if (html(http))
69     {
70     if (!process(http, url, title, description, text, headings)) return
71     false;
72    
73     entities(title, "&nbsp;", ' ');
74     entities(title, "&lt;", '<');
75     entities(title, "&gt;", '>');
76     entities(title, "&quot;", '\"');
77     entities(title, "&amp;", '&');
78    
79     entities(description, "&nbsp;", ' ');
80     entities(description, "&lt;", '<');
81     entities(description, "&gt;", '>');
82     entities(description, "&quot;", '\"');
83     entities(description, "&amp;", '&');
84    
85     entities(text, "&nbsp;", ' ');
86     entities(text, "&lt;", '<');
87     entities(text, "&gt;", '>');
88     entities(text, "&quot;", '\"');
89     entities(text, "&amp;", '&');
90    
91     for (int index = 0; index < headings.size(); index++)
92     {
93     entities(headings[index], "&nbsp;", ' ');
94     entities(headings[index], "&lt;", '<');
95     entities(headings[index], "&gt;", '>');
96     entities(headings[index], "&quot;", '\"');
97     entities(headings[index], "&amp;", '&');
98     }
99    
100     normalize(title);
101     normalize(description);
102     normalize(text);
103     for (int index0 = 0; index0 < headings.size(); index0++)
104     {
105     normalize(headings[index0]);
106     }
107     }
108     else
109     {
110     bool knowSize = page->getSize() > 0;
111    
112     string line;
113     while (http.good())
114     {
115     http.getline(line);
116    
117     text += line + "\n";
118    
119     if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
120     }
121    
122     normalize(text);
123     }
124    
125     page->setURL(url);
126     page->setTitle(title);
127     page->setDescription(description);
128     page->setText(text);
129     page->setHeadings(headings);
130    
131     return true;
132     }
133    
134     void Processor::reset()
135     {
136     links.clear();
137     delete page;
138     page = new Page();
139     }
140    
141     bool Processor::process(HttpHandler& http, URL& url, string& title, string&
142     description, string& text, vector<string>& headings)
143     {
144     bool inHtml = false, inHead = false, inTitle = false, inBody = false,
145     inHeading = false, inComment = false, knowSize = page->getSize() > 0,
146     follow = true, answer = true;
147     unsigned startComment = 0, finishComment = 0;
148     string line;
149     while (http.good())
150     {
151     http.getline(line);
152     string heading;
153    
154     unsigned begin = 0;
155     while (begin < line.length())
156     {
157     unsigned open = line.find('<', begin);
158     unsigned close = line.find('>', begin);
159    
160     string next;
161     while (close == string::npos)
162     {
163     http.getline(next);
164     line += '\n' + next;
165     close = line.find('>', begin);
166     }
167    
168     // strangely this is necessary sometimes
169     if (open == string::npos) open = line.find('<', begin);
170    
171     string between = line.substr(begin, open - begin);
172     string tag = getTag(line, open, close);
173     string lowerTag(tag.length(), ' ');
174    
175     for (unsigned index = 0; index < tag.length(); index++)
176     {
177     lowerTag[index] = tolower(tag[index]);
178     }
179    
180     if (inHtml && !inComment)
181     {
182     if (inHead && inTitle)
183     {
184     title += between + "\n";
185     }
186    
187     if (inBody)
188     {
189     text += between + "\n";
190     }
191    
192     if (inBody && inHeading)
193     {
194     heading += between + "\n";
195     }
196     if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
197     == 0) || (lowerTag.find("meta ") == 0)) && inHead)
198     {
199     if (lowerTag.find("name=robots") != string::npos ||
200     lowerTag.find("name=\"robots\"") != string::npos)
201     {
202     unsigned start = lowerTag.find("content=\"") + 9;
203     unsigned finish = lowerTag.find('\"', start);
204    
205     string robots = lowerTag.substr(start, finish - start);
206    
207     if ((robots.find("noindex") != string::npos &&
208     robots.find("nofollow") != string::npos) ||
209     robots.find("none") != string::npos)
210     {
211     answer = false;
212     follow = false;
213     links.clear();
214    
215     return answer;
216     }
217     else if (robots.find("noindex") != string::npos)
218     {
219     answer = false;
220     }
221     else if (robots.find("nofollow") != string::npos)
222     {
223     follow = false;
224     links.clear();
225     }
226     }
227     else if (lowerTag.find("name=description") != string::npos
228     || lowerTag.find("name=\"description\"") !=
229     string::npos)
230     {
231     unsigned start = lowerTag.find("content=\"") + 9;
232     unsigned finish = lowerTag.find('\"', start);
233    
234     description = tag.substr(start, finish - start);
235     }
236     }
237    
238     if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
239     || (lowerTag.find("a ") == 0)) && inBody && follow)
240     {
241     if (lowerTag.find("href=\"") != string::npos)
242     {
243     unsigned start = lowerTag.find("href=\"") + 6;
244     unsigned finish = lowerTag.find('\"', start);
245    
246 douglas 15 string link = getLink(tag.substr(start, finish -
247     start), url);
248 douglas 1
249     if (link != "bad link") links.insert(link);
250     }
251     else if (lowerTag.find("href=") != string::npos)
252     {
253     unsigned start = lowerTag.find("href=") + 5;
254     unsigned finish = lowerTag.find(' ', start);
255    
256     if (finish < close)
257     {
258 douglas 15 string link = getLink(tag.substr(start, finish -
259     start), url);
260 douglas 1
261     if (link != "bad link") links.insert(link);
262     }
263     else
264     {
265 douglas 15 string link = getLink(tag.substr(start, close -
266     start), url);
267 douglas 1
268     if (link != "bad link") links.insert(link);
269     }
270     }
271     }
272    
273     if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
274     0) || (lowerTag.find("img ")) && inBody)
275     {
276     if (lowerTag.find("alt=\"") != string::npos)
277     {
278     unsigned start = lowerTag.find("alt=\"") + 5;
279     unsigned finish = lowerTag.find('\"', start);
280    
281     text += tag.substr(start, finish - start) + ' ';
282     if (inHeading) heading += tag.substr(start, finish -
283     start) + ' ';
284     }
285     else if (lowerTag.find("alt=") != string::npos)
286     {
287     unsigned start = lowerTag.find("alt=") + 4;
288     unsigned finish = lowerTag.find(' ', start);
289    
290     if (finish < close)
291     {
292     text += tag.substr(start, finish - start) + ' ';
293     if (inHeading) heading += tag.substr(start, finish
294     - start) + ' ';
295     }
296     else
297     {
298     text += tag.substr(start, close - start) + ' ';
299     if (inHeading) heading += tag.substr(start, close -
300     start) + ' ';
301     }
302     }
303     }
304     }
305    
306     if (lowerTag.find("html") == 0) inHtml = true;
307     if (lowerTag.find("/html") == 0) inHtml = false;
308    
309     if (lowerTag.find("head") == 0) inHead = true;
310     if (lowerTag.find("/head") == 0) inHead = false;
311    
312     if (lowerTag.find("title") == 0) inTitle = true;
313     if (lowerTag.find("/title") == 0) inTitle = false;
314    
315     if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
316     inBody = true;
317     if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
318     inBody = false;
319    
320     if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
321     lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
322     lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
323     inHeading = true;
324     if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
325     lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
326     lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
327     {
328     if (heading != "") headings.push_back(heading);
329     inHeading = false;
330     }
331    
332     if (lowerTag.find("!--") == 0)
333     {
334     startComment = open;
335     inComment = true;
336     }
337     if (line.find("-->", begin) >= startComment && line.find("-->",
338     begin) != string::npos)
339     {
340     finishComment = line.find("-->", begin) + 3;
341     inComment = false;
342     }
343    
344     if (close == string::npos)
345     {
346     begin = close;
347     }
348     else
349     {
350     begin = close + 1;
351     }
352     }
353    
354     startComment = 0;
355     finishComment = 0;
356    
357     if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
358     }
359    
360     return answer;
361     }
362    
363     bool Processor::html(HttpHandler& http)
364     {
365     bool answer = false;
366    
367     string line;
368     http.getline(line);
369    
370     while (http.good())
371     {
372     string field;
373     http.getline(field, ' ');
374     if (field == "") break;
375     http.getline(line);
376    
377     if (field == "Content-Type:" || field == "Content-type:")
378     {
379     if (line.find("text/html") != string::npos)
380     {
381     answer = true;
382     }
383     }
384    
385     if (field == "Content-Length:" || field == "Content-length:")
386     {
387     page->setSize(strtoul(line.c_str(), 0, 0));
388     }
389     }
390    
391     return answer;
392     }
393    
394 douglas 15 string Processor::getTag(const string& line, unsigned open, unsigned close)
395 douglas 1 {
396     string tag = line.substr(open + 1, close - open - 1);
397    
398     return tag;
399     }
400    
401 douglas 15 string Processor::getLink(string link, URL& url)
402 douglas 1 {
403     string hyperlink = "bad link";
404    
405     if (link.find('#') != string::npos)
406     {
407     unsigned pound = link.find('#');
408     link.erase(pound);
409     }
410    
411     if (link.find("://") != string::npos)
412     {
413     if (link.find("http://") == 0) hyperlink = link;
414     }
415     else if (link.find("mailto:") == 0)
416     {
417     // do nothing we are not evil spammers!
418     }
419     else if (link.find("//") == 0)
420     {
421     hyperlink = "http:" + link;
422     }
423     else if (link.find('/') == 0)
424     {
425     hyperlink = url.getURL();
426    
427     unsigned path = hyperlink.find('/', 7);
428     hyperlink.erase(path);
429    
430     hyperlink += link;
431     }
432     else if (link == "")
433     {
434     // a blank link is useless
435     }
436     else
437     {
438     hyperlink = url.getURL();
439     string path = url.getPath();
440    
441     unsigned cutoff = hyperlink.rfind(path);
442     hyperlink.erase(cutoff);
443    
444     unsigned dir = path.rfind('/') + 1;
445     path.erase(dir);
446    
447     while (link.find("../") == 0)
448     {
449     unsigned dot = path.rfind('/') - 1;
450     unsigned up = path.rfind('/', dot) + 1;
451    
452     path.erase(up);
453     link.erase(0, 3);
454     }
455     while (link.find("./") == 0)
456     {
457     link.erase(0, 2);
458     }
459    
460     hyperlink += path + link;
461     }
462    
463     return hyperlink;
464     }