ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 355
Committed: 2004-06-04T04:08:28-07:00 (21 years ago) by Douglas Thrift
File size: 9803 byte(s)
Log Message:
I missed some C++ifying!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Processor
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
51 Douglas Thrift 334 #include "Processor.hpp"
52 douglas 1
53     bool Processor::process(HttpHandler& http, URL& url)
54     {
55     string title, description, text;
56     vector<string> headings;
57    
58 douglas 17 if (http.contentType().find("text/html") == 0)
59 douglas 1 {
60     if (!process(http, url, title, description, text, headings)) return
61     false;
62    
63     entities(title, "&nbsp;", ' ');
64     entities(title, "&lt;", '<');
65     entities(title, "&gt;", '>');
66     entities(title, "&quot;", '\"');
67     entities(title, "&amp;", '&');
68     entities(description, "&nbsp;", ' ');
69     entities(description, "&lt;", '<');
70     entities(description, "&gt;", '>');
71     entities(description, "&quot;", '\"');
72     entities(description, "&amp;", '&');
73     entities(text, "&nbsp;", ' ');
74     entities(text, "&lt;", '<');
75     entities(text, "&gt;", '>');
76     entities(text, "&quot;", '\"');
77     entities(text, "&amp;", '&');
78    
79 Douglas Thrift 348 for (size_t index(0); index < headings.size(); index++)
80 douglas 1 {
81     entities(headings[index], "&nbsp;", ' ');
82     entities(headings[index], "&lt;", '<');
83     entities(headings[index], "&gt;", '>');
84     entities(headings[index], "&quot;", '\"');
85     entities(headings[index], "&amp;", '&');
86     }
87    
88     normalize(title);
89     normalize(description);
90     normalize(text);
91 douglas 316
92 Douglas Thrift 348 for (size_t index0(0); index0 < headings.size(); index0++)
93 douglas 1 {
94     normalize(headings[index0]);
95     }
96     }
97     else
98     {
99     string line;
100 douglas 316
101 douglas 1 while (http.good())
102     {
103     http.getline(line);
104    
105     text += line + "\n";
106     }
107    
108     normalize(text);
109     }
110    
111 Douglas Thrift 348 page.setSize(http.contentLength());
112     page.setURL(url);
113     page.setTitle(title);
114     page.setDescription(description);
115     page.setText(text);
116     page.setHeadings(headings);
117 douglas 1
118     return true;
119     }
120    
121     void Processor::reset()
122     {
123     links.clear();
124 douglas 316
125 Douglas Thrift 348 page = Page();
126 douglas 1 }
127    
128     bool Processor::process(HttpHandler& http, URL& url, string& title, string&
129     description, string& text, vector<string>& headings)
130     {
131 Douglas Thrift 348 bool inHtml(false), inHead(false), inTitle(false), inBody(false),
132     inHeading(false), inComment(false), follow(true), answer(true);
133 douglas 315 string line, heading;
134    
135 douglas 1 while (http.good())
136     {
137     http.getline(line);
138    
139 Douglas Thrift 348 unsigned begin(0), startComment(0);
140 douglas 316
141 douglas 1 while (begin < line.length())
142     {
143 Douglas Thrift 348 unsigned open(line.find('<', begin)), close(line.find('>', begin));
144 douglas 316 string next;
145 douglas 1
146 douglas 21 while (close == string::npos && http.good())
147 douglas 1 {
148     http.getline(next);
149 Douglas Thrift 348
150 douglas 1 line += '\n' + next;
151 Douglas Thrift 348
152 douglas 1 close = line.find('>', begin);
153     }
154    
155     // strangely this is necessary sometimes
156     if (open == string::npos) open = line.find('<', begin);
157    
158 Douglas Thrift 348 string between(line.substr(begin, open - begin)), tag(getTag(line,
159     open, close)), lowerTag(tolower(tag));
160 douglas 1
161     if (inHtml && !inComment)
162     {
163     if (inHead && inTitle)
164     {
165 Douglas Thrift 348 title += between + '\n';
166 douglas 1 }
167    
168     if (inBody)
169     {
170 Douglas Thrift 348 text += between + '\n';
171 douglas 1 }
172    
173     if (inBody && inHeading)
174     {
175 Douglas Thrift 348 heading += between + '\n';
176 douglas 1 }
177 douglas 316
178 douglas 1 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
179     == 0) || (lowerTag.find("meta ") == 0)) && inHead)
180     {
181     if (lowerTag.find("name=robots") != string::npos ||
182     lowerTag.find("name=\"robots\"") != string::npos)
183     {
184 Douglas Thrift 348 unsigned start(lowerTag.find("content=\"") + 9),
185     finish(lowerTag.find('\"', start));
186     string robots(lowerTag.substr(start, finish - start));
187 douglas 1
188     if ((robots.find("noindex") != string::npos &&
189     robots.find("nofollow") != string::npos) ||
190     robots.find("none") != string::npos)
191     {
192     answer = false;
193     follow = false;
194 Douglas Thrift 348
195 douglas 1 links.clear();
196    
197     return answer;
198     }
199     else if (robots.find("noindex") != string::npos)
200     {
201     answer = false;
202     }
203     else if (robots.find("nofollow") != string::npos)
204     {
205     follow = false;
206 Douglas Thrift 348
207 douglas 1 links.clear();
208     }
209     }
210     else if (lowerTag.find("name=description") != string::npos
211     || lowerTag.find("name=\"description\"") !=
212     string::npos)
213     {
214 Douglas Thrift 348 unsigned start(lowerTag.find("content=\"") + 9),
215     finish(lowerTag.find('\"', start));
216 douglas 1
217     description = tag.substr(start, finish - start);
218     }
219     }
220    
221     if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
222     || (lowerTag.find("a ") == 0)) && inBody && follow)
223     {
224     if (lowerTag.find("href=\"") != string::npos)
225     {
226 Douglas Thrift 348 unsigned start(lowerTag.find("href=\"") + 6),
227     finish(lowerTag.find('\"', start));
228 Douglas Thrift 355 string link(getLink(tag.substr(start, finish -
229     start), url));
230 douglas 1
231 Douglas Thrift 355 if (!link.empty()) links.insert(link);
232 douglas 1 }
233     else if (lowerTag.find("href=") != string::npos)
234     {
235 Douglas Thrift 348 unsigned start(lowerTag.find("href=") + 5),
236     finish(lowerTag.find(' ', start));
237 douglas 1
238     if (finish < close)
239     {
240 Douglas Thrift 355 string link(getLink(tag.substr(start, finish -
241     start), url));
242 douglas 1
243 Douglas Thrift 355 if (!link.empty()) links.insert(link);
244 douglas 1 }
245     else
246     {
247 Douglas Thrift 355 string link(getLink(tag.substr(start, close -
248     start), url));
249 douglas 1
250 Douglas Thrift 355 if (!link.empty()) links.insert(link);
251 douglas 1 }
252     }
253     }
254    
255     if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
256     0) || (lowerTag.find("img ")) && inBody)
257     {
258     if (lowerTag.find("alt=\"") != string::npos)
259     {
260 Douglas Thrift 348 unsigned start(lowerTag.find("alt=\"") + 5),
261     finish(lowerTag.find('\"', start));
262 douglas 1
263     text += tag.substr(start, finish - start) + ' ';
264 Douglas Thrift 348
265 douglas 1 if (inHeading) heading += tag.substr(start, finish -
266     start) + ' ';
267     }
268     else if (lowerTag.find("alt=") != string::npos)
269     {
270 Douglas Thrift 348 unsigned start(lowerTag.find("alt=") + 4),
271     finish(lowerTag.find(' ', start));
272 douglas 1
273     if (finish < close)
274     {
275     text += tag.substr(start, finish - start) + ' ';
276 Douglas Thrift 348
277 douglas 1 if (inHeading) heading += tag.substr(start, finish
278     - start) + ' ';
279     }
280     else
281     {
282     text += tag.substr(start, close - start) + ' ';
283 Douglas Thrift 348
284 douglas 1 if (inHeading) heading += tag.substr(start, close -
285     start) + ' ';
286     }
287     }
288     }
289     }
290    
291     if (lowerTag.find("html") == 0) inHtml = true;
292     if (lowerTag.find("/html") == 0) inHtml = false;
293     if (lowerTag.find("head") == 0) inHead = true;
294     if (lowerTag.find("/head") == 0) inHead = false;
295     if (lowerTag.find("title") == 0) inTitle = true;
296     if (lowerTag.find("/title") == 0) inTitle = false;
297     if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
298     inBody = true;
299     if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
300     inBody = false;
301    
302     if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
303     lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
304     lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
305 douglas 315 {
306 Douglas Thrift 355 heading.erase();
307    
308 douglas 1 inHeading = true;
309 douglas 315 }
310 douglas 316
311 douglas 1 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
312     lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
313     lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
314     {
315 Douglas Thrift 355 if (!heading.empty()) headings.push_back(heading);
316 douglas 316
317 douglas 1 inHeading = false;
318     }
319    
320     if (lowerTag.find("!--") == 0)
321     {
322     startComment = open;
323     inComment = true;
324     }
325 douglas 316
326 douglas 1 if (line.find("-->", begin) >= startComment && line.find("-->",
327     begin) != string::npos)
328     {
329 douglas 316 close = line.find("-->", begin) + 3;
330 douglas 1 inComment = false;
331     }
332    
333     if (close == string::npos)
334     {
335     begin = close;
336     }
337     else
338     {
339     begin = close + 1;
340     }
341     }
342     }
343    
344     return answer;
345     }
346    
347 douglas 15 string Processor::getTag(const string& line, unsigned open, unsigned close)
348 douglas 1 {
349 Douglas Thrift 348 return line.substr(open + 1, close - open - 1);
350 douglas 1 }

Properties

Name Value
svn:eol-style native
svn:keywords Id