ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 334
Committed: 2004-04-05T16:37:41-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 10138 byte(s)
Log Message:
Ah, I just love Subversion!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Processor
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
51 Douglas Thrift 334 #include "Processor.hpp"
52 douglas 1
53     Processor::Processor()
54     {
55     page = new Page();
56     }
57    
58     Processor::~Processor()
59     {
60     delete page;
61     }
62    
63     bool Processor::process(HttpHandler& http, URL& url)
64     {
65     string title, description, text;
66     vector<string> headings;
67    
68 douglas 17 if (http.contentType().find("text/html") == 0)
69 douglas 1 {
70     if (!process(http, url, title, description, text, headings)) return
71     false;
72    
73     entities(title, "&nbsp;", ' ');
74     entities(title, "&lt;", '<');
75     entities(title, "&gt;", '>');
76     entities(title, "&quot;", '\"');
77     entities(title, "&amp;", '&');
78     entities(description, "&nbsp;", ' ');
79     entities(description, "&lt;", '<');
80     entities(description, "&gt;", '>');
81     entities(description, "&quot;", '\"');
82     entities(description, "&amp;", '&');
83     entities(text, "&nbsp;", ' ');
84     entities(text, "&lt;", '<');
85     entities(text, "&gt;", '>');
86     entities(text, "&quot;", '\"');
87     entities(text, "&amp;", '&');
88    
89     for (int index = 0; index < headings.size(); index++)
90     {
91     entities(headings[index], "&nbsp;", ' ');
92     entities(headings[index], "&lt;", '<');
93     entities(headings[index], "&gt;", '>');
94     entities(headings[index], "&quot;", '\"');
95     entities(headings[index], "&amp;", '&');
96     }
97    
98     normalize(title);
99     normalize(description);
100     normalize(text);
101 douglas 316
102 douglas 1 for (int index0 = 0; index0 < headings.size(); index0++)
103     {
104     normalize(headings[index0]);
105     }
106     }
107     else
108     {
109     string line;
110 douglas 316
111 douglas 1 while (http.good())
112     {
113     http.getline(line);
114    
115     text += line + "\n";
116     }
117    
118     normalize(text);
119     }
120    
121 douglas 17 page->setSize(http.contentLength());
122 douglas 1 page->setURL(url);
123     page->setTitle(title);
124     page->setDescription(description);
125     page->setText(text);
126     page->setHeadings(headings);
127    
128     return true;
129     }
130    
131     void Processor::reset()
132     {
133     links.clear();
134 douglas 316
135 douglas 1 delete page;
136 douglas 316
137 douglas 1 page = new Page();
138     }
139    
140     bool Processor::process(HttpHandler& http, URL& url, string& title, string&
141     description, string& text, vector<string>& headings)
142     {
143     bool inHtml = false, inHead = false, inTitle = false, inBody = false,
144 douglas 19 inHeading = false, inComment = false, follow = true, answer = true;
145 douglas 315 string line, heading;
146    
147 douglas 1 while (http.good())
148     {
149     http.getline(line);
150    
151 douglas 316 unsigned begin = 0, startComment = 0;
152    
153 douglas 1 while (begin < line.length())
154     {
155     unsigned open = line.find('<', begin);
156     unsigned close = line.find('>', begin);
157 douglas 316 string next;
158 douglas 1
159 douglas 21 while (close == string::npos && http.good())
160 douglas 1 {
161     http.getline(next);
162     line += '\n' + next;
163     close = line.find('>', begin);
164     }
165    
166     // strangely this is necessary sometimes
167     if (open == string::npos) open = line.find('<', begin);
168    
169     string between = line.substr(begin, open - begin);
170     string tag = getTag(line, open, close);
171     string lowerTag(tag.length(), ' ');
172    
173     for (unsigned index = 0; index < tag.length(); index++)
174     {
175     lowerTag[index] = tolower(tag[index]);
176     }
177    
178     if (inHtml && !inComment)
179     {
180     if (inHead && inTitle)
181     {
182     title += between + "\n";
183     }
184    
185     if (inBody)
186     {
187     text += between + "\n";
188     }
189    
190     if (inBody && inHeading)
191     {
192     heading += between + "\n";
193     }
194 douglas 316
195 douglas 1 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
196     == 0) || (lowerTag.find("meta ") == 0)) && inHead)
197     {
198     if (lowerTag.find("name=robots") != string::npos ||
199     lowerTag.find("name=\"robots\"") != string::npos)
200     {
201     unsigned start = lowerTag.find("content=\"") + 9;
202     unsigned finish = lowerTag.find('\"', start);
203    
204     string robots = lowerTag.substr(start, finish - start);
205    
206     if ((robots.find("noindex") != string::npos &&
207     robots.find("nofollow") != string::npos) ||
208     robots.find("none") != string::npos)
209     {
210     answer = false;
211     follow = false;
212     links.clear();
213    
214     return answer;
215     }
216     else if (robots.find("noindex") != string::npos)
217     {
218     answer = false;
219     }
220     else if (robots.find("nofollow") != string::npos)
221     {
222     follow = false;
223     links.clear();
224     }
225     }
226     else if (lowerTag.find("name=description") != string::npos
227     || lowerTag.find("name=\"description\"") !=
228     string::npos)
229     {
230     unsigned start = lowerTag.find("content=\"") + 9;
231     unsigned finish = lowerTag.find('\"', start);
232    
233     description = tag.substr(start, finish - start);
234     }
235     }
236    
237     if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
238     || (lowerTag.find("a ") == 0)) && inBody && follow)
239     {
240     if (lowerTag.find("href=\"") != string::npos)
241     {
242     unsigned start = lowerTag.find("href=\"") + 6;
243     unsigned finish = lowerTag.find('\"', start);
244    
245 douglas 15 string link = getLink(tag.substr(start, finish -
246     start), url);
247 douglas 1
248 douglas 17 if (link != "") links.insert(link);
249 douglas 1 }
250     else if (lowerTag.find("href=") != string::npos)
251     {
252     unsigned start = lowerTag.find("href=") + 5;
253     unsigned finish = lowerTag.find(' ', start);
254    
255     if (finish < close)
256     {
257 douglas 15 string link = getLink(tag.substr(start, finish -
258     start), url);
259 douglas 1
260 douglas 17 if (link != "") links.insert(link);
261 douglas 1 }
262     else
263     {
264 douglas 15 string link = getLink(tag.substr(start, close -
265     start), url);
266 douglas 1
267 douglas 17 if (link != "") links.insert(link);
268 douglas 1 }
269     }
270     }
271    
272     if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
273     0) || (lowerTag.find("img ")) && inBody)
274     {
275     if (lowerTag.find("alt=\"") != string::npos)
276     {
277     unsigned start = lowerTag.find("alt=\"") + 5;
278     unsigned finish = lowerTag.find('\"', start);
279    
280     text += tag.substr(start, finish - start) + ' ';
281     if (inHeading) heading += tag.substr(start, finish -
282     start) + ' ';
283     }
284     else if (lowerTag.find("alt=") != string::npos)
285     {
286     unsigned start = lowerTag.find("alt=") + 4;
287     unsigned finish = lowerTag.find(' ', start);
288    
289     if (finish < close)
290     {
291     text += tag.substr(start, finish - start) + ' ';
292     if (inHeading) heading += tag.substr(start, finish
293     - start) + ' ';
294     }
295     else
296     {
297     text += tag.substr(start, close - start) + ' ';
298     if (inHeading) heading += tag.substr(start, close -
299     start) + ' ';
300     }
301     }
302     }
303     }
304    
305     if (lowerTag.find("html") == 0) inHtml = true;
306     if (lowerTag.find("/html") == 0) inHtml = false;
307     if (lowerTag.find("head") == 0) inHead = true;
308     if (lowerTag.find("/head") == 0) inHead = false;
309     if (lowerTag.find("title") == 0) inTitle = true;
310     if (lowerTag.find("/title") == 0) inTitle = false;
311     if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
312     inBody = true;
313     if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
314     inBody = false;
315    
316     if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
317     lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
318     lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
319 douglas 315 {
320     heading = "";
321 douglas 1 inHeading = true;
322 douglas 315 }
323 douglas 316
324 douglas 1 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
325     lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
326     lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
327     {
328     if (heading != "") headings.push_back(heading);
329 douglas 316
330 douglas 1 inHeading = false;
331     }
332    
333     if (lowerTag.find("!--") == 0)
334     {
335     startComment = open;
336     inComment = true;
337     }
338 douglas 316
339 douglas 1 if (line.find("-->", begin) >= startComment && line.find("-->",
340     begin) != string::npos)
341     {
342 douglas 316 close = line.find("-->", begin) + 3;
343 douglas 1 inComment = false;
344     }
345    
346     if (close == string::npos)
347     {
348     begin = close;
349     }
350     else
351     {
352     begin = close + 1;
353     }
354     }
355     }
356    
357     return answer;
358     }
359    
360 douglas 15 string Processor::getTag(const string& line, unsigned open, unsigned close)
361 douglas 1 {
362     string tag = line.substr(open + 1, close - open - 1);
363    
364     return tag;
365     }

Properties

Name Value
svn:eol-style native
svn:keywords Id