ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 365
Committed: 2008-08-11T15:16:35-07:00 (16 years, 10 months ago) by douglas
File size: 10176 byte(s)
Log Message:
64bit cleanify!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Processor
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
51 Douglas Thrift 334 #include "Processor.hpp"
52 douglas 1
53     bool Processor::process(HttpHandler& http, URL& url)
54     {
55     string title, description, text;
56     vector<string> headings;
57    
58 douglas 17 if (http.contentType().find("text/html") == 0)
59 douglas 1 {
60     if (!process(http, url, title, description, text, headings)) return
61     false;
62    
63     entities(title, "&nbsp;", ' ');
64     entities(title, "&lt;", '<');
65     entities(title, "&gt;", '>');
66     entities(title, "&quot;", '\"');
67     entities(title, "&amp;", '&');
68     entities(description, "&nbsp;", ' ');
69     entities(description, "&lt;", '<');
70     entities(description, "&gt;", '>');
71     entities(description, "&quot;", '\"');
72     entities(description, "&amp;", '&');
73     entities(text, "&nbsp;", ' ');
74     entities(text, "&lt;", '<');
75     entities(text, "&gt;", '>');
76     entities(text, "&quot;", '\"');
77     entities(text, "&amp;", '&');
78    
79 Douglas Thrift 348 for (size_t index(0); index < headings.size(); index++)
80 douglas 1 {
81     entities(headings[index], "&nbsp;", ' ');
82     entities(headings[index], "&lt;", '<');
83     entities(headings[index], "&gt;", '>');
84     entities(headings[index], "&quot;", '\"');
85     entities(headings[index], "&amp;", '&');
86     }
87    
88     normalize(title);
89     normalize(description);
90     normalize(text);
91 douglas 316
92 Douglas Thrift 348 for (size_t index0(0); index0 < headings.size(); index0++)
93 douglas 1 {
94     normalize(headings[index0]);
95     }
96     }
97     else
98     {
99     string line;
100 douglas 316
101 douglas 1 while (http.good())
102     {
103     http.getline(line);
104    
105     text += line + "\n";
106     }
107    
108     normalize(text);
109     }
110    
111 Douglas Thrift 348 page.setSize(http.contentLength());
112     page.setURL(url);
113     page.setTitle(title);
114     page.setDescription(description);
115     page.setText(text);
116     page.setHeadings(headings);
117 douglas 1
118     return true;
119     }
120    
121     void Processor::reset()
122     {
123     links.clear();
124 douglas 316
125 Douglas Thrift 348 page = Page();
126 douglas 1 }
127    
128     bool Processor::process(HttpHandler& http, URL& url, string& title, string&
129     description, string& text, vector<string>& headings)
130     {
131 Douglas Thrift 348 bool inHtml(false), inHead(false), inTitle(false), inBody(false),
132     inHeading(false), inComment(false), follow(true), answer(true);
133 douglas 315 string line, heading;
134    
135 douglas 1 while (http.good())
136     {
137     http.getline(line);
138    
139 douglas 365 size_t begin(0), startComment(0);
140 douglas 316
141 douglas 1 while (begin < line.length())
142     {
143 douglas 365 size_t open(line.find('<', begin)), close(line.find('>', begin));
144 douglas 316 string next;
145 douglas 1
146 douglas 21 while (close == string::npos && http.good())
147 douglas 1 {
148     http.getline(next);
149 Douglas Thrift 348
150 douglas 1 line += '\n' + next;
151 Douglas Thrift 348
152 douglas 1 close = line.find('>', begin);
153     }
154    
155     // strangely this is necessary sometimes
156     if (open == string::npos) open = line.find('<', begin);
157    
158 Douglas Thrift 348 string between(line.substr(begin, open - begin)), tag(getTag(line,
159     open, close)), lowerTag(tolower(tag));
160 douglas 1
161     if (inHtml && !inComment)
162     {
163     if (inHead && inTitle)
164     {
165 Douglas Thrift 348 title += between + '\n';
166 douglas 1 }
167    
168     if (inBody)
169     {
170 Douglas Thrift 348 text += between + '\n';
171 douglas 1 }
172    
173     if (inBody && inHeading)
174     {
175 Douglas Thrift 348 heading += between + '\n';
176 douglas 1 }
177 douglas 316
178 douglas 1 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
179     == 0) || (lowerTag.find("meta ") == 0)) && inHead)
180     {
181     if (lowerTag.find("name=robots") != string::npos ||
182 douglas 364 lowerTag.find("name=\"robots\"") != string::npos ||
183     lowerTag.find("name='robots'") != string::npos)
184 douglas 1 {
185 douglas 365 size_t start(lowerTag.find("content=\"") + 9),
186 Douglas Thrift 348 finish(lowerTag.find('\"', start));
187     string robots(lowerTag.substr(start, finish - start));
188 douglas 1
189     if ((robots.find("noindex") != string::npos &&
190     robots.find("nofollow") != string::npos) ||
191     robots.find("none") != string::npos)
192     {
193     answer = false;
194     follow = false;
195 Douglas Thrift 348
196 douglas 1 links.clear();
197    
198     return answer;
199     }
200     else if (robots.find("noindex") != string::npos)
201     {
202     answer = false;
203     }
204     else if (robots.find("nofollow") != string::npos)
205     {
206     follow = false;
207 Douglas Thrift 348
208 douglas 1 links.clear();
209     }
210     }
211     else if (lowerTag.find("name=description") != string::npos
212 douglas 364 || lowerTag.find("name=\"description\"") != string::npos
213     || lowerTag.find("name='description'") != string::npos)
214 douglas 1 {
215 douglas 365 size_t start(lowerTag.find("content=\"") + 9),
216 Douglas Thrift 348 finish(lowerTag.find('\"', start));
217 douglas 1
218     description = tag.substr(start, finish - start);
219     }
220     }
221    
222     if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
223     || (lowerTag.find("a ") == 0)) && inBody && follow)
224     {
225     if (lowerTag.find("href=\"") != string::npos)
226     {
227 douglas 365 size_t start(lowerTag.find("href=\"") + 6),
228 Douglas Thrift 348 finish(lowerTag.find('\"', start));
229 Douglas Thrift 355 string link(getLink(tag.substr(start, finish -
230     start), url));
231 douglas 1
232 Douglas Thrift 355 if (!link.empty()) links.insert(link);
233 douglas 1 }
234 douglas 364 else if (lowerTag.find("href='") != string::npos)
235     {
236 douglas 365 size_t start(lowerTag.find("href='") + 6),
237 douglas 364 finish(lowerTag.find('\'', start));
238     string link(getLink(tag.substr(start, finish -
239     start), url));
240    
241     if (!link.empty()) links.insert(link);
242     }
243 douglas 1 else if (lowerTag.find("href=") != string::npos)
244     {
245 douglas 365 size_t start(lowerTag.find("href=") + 5),
246 Douglas Thrift 348 finish(lowerTag.find(' ', start));
247 douglas 1
248     if (finish < close)
249     {
250 Douglas Thrift 355 string link(getLink(tag.substr(start, finish -
251     start), url));
252 douglas 1
253 Douglas Thrift 355 if (!link.empty()) links.insert(link);
254 douglas 1 }
255     else
256     {
257 Douglas Thrift 355 string link(getLink(tag.substr(start, close -
258     start), url));
259 douglas 1
260 Douglas Thrift 355 if (!link.empty()) links.insert(link);
261 douglas 1 }
262     }
263     }
264    
265     if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
266     0) || (lowerTag.find("img ")) && inBody)
267     {
268     if (lowerTag.find("alt=\"") != string::npos)
269     {
270 douglas 365 size_t start(lowerTag.find("alt=\"") + 5),
271 Douglas Thrift 348 finish(lowerTag.find('\"', start));
272 douglas 1
273     text += tag.substr(start, finish - start) + ' ';
274 Douglas Thrift 348
275 douglas 1 if (inHeading) heading += tag.substr(start, finish -
276     start) + ' ';
277     }
278     else if (lowerTag.find("alt=") != string::npos)
279     {
280 douglas 365 size_t start(lowerTag.find("alt=") + 4),
281 Douglas Thrift 348 finish(lowerTag.find(' ', start));
282 douglas 1
283     if (finish < close)
284     {
285     text += tag.substr(start, finish - start) + ' ';
286 Douglas Thrift 348
287 douglas 1 if (inHeading) heading += tag.substr(start, finish
288     - start) + ' ';
289     }
290     else
291     {
292     text += tag.substr(start, close - start) + ' ';
293 Douglas Thrift 348
294 douglas 1 if (inHeading) heading += tag.substr(start, close -
295     start) + ' ';
296     }
297     }
298     }
299     }
300    
301     if (lowerTag.find("html") == 0) inHtml = true;
302     if (lowerTag.find("/html") == 0) inHtml = false;
303     if (lowerTag.find("head") == 0) inHead = true;
304     if (lowerTag.find("/head") == 0) inHead = false;
305     if (lowerTag.find("title") == 0) inTitle = true;
306     if (lowerTag.find("/title") == 0) inTitle = false;
307     if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
308     inBody = true;
309     if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
310     inBody = false;
311    
312     if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
313     lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
314     lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
315 douglas 315 {
316 Douglas Thrift 355 heading.erase();
317    
318 douglas 1 inHeading = true;
319 douglas 315 }
320 douglas 316
321 douglas 1 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
322     lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
323     lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
324     {
325 Douglas Thrift 355 if (!heading.empty()) headings.push_back(heading);
326 douglas 316
327 douglas 1 inHeading = false;
328     }
329    
330     if (lowerTag.find("!--") == 0)
331     {
332     startComment = open;
333     inComment = true;
334     }
335 douglas 316
336 douglas 1 if (line.find("-->", begin) >= startComment && line.find("-->",
337     begin) != string::npos)
338     {
339 Douglas Thrift 360 close = line.find("-->", begin) + 2;
340 douglas 1 inComment = false;
341     }
342    
343     if (close == string::npos)
344     {
345     begin = close;
346     }
347     else
348     {
349     begin = close + 1;
350     }
351     }
352     }
353    
354     return answer;
355     }
356    
357 douglas 365 string Processor::getTag(const string& line, size_t open, size_t close)
358 douglas 1 {
359 Douglas Thrift 348 return line.substr(open + 1, close - open - 1);
360 douglas 1 }

Properties

Name Value
svn:eol-style native
svn:keywords Id