ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/Search/trunk/Processor.cpp
Revision: 372
Committed: 2008-08-23T04:00:12-07:00 (16 years, 9 months ago) by douglas
File size: 10182 byte(s)
Log Message:
Update copyright dates.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Processor
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Processor.hpp"
52
53 bool Processor::process(HttpHandler& http, URL& url)
54 {
55 string title, description, text;
56 vector<string> headings;
57
58 if (http.contentType().find("text/html") == 0)
59 {
60 if (!process(http, url, title, description, text, headings)) return
61 false;
62
63 entities(title, "&nbsp;", ' ');
64 entities(title, "&lt;", '<');
65 entities(title, "&gt;", '>');
66 entities(title, "&quot;", '\"');
67 entities(title, "&amp;", '&');
68 entities(description, "&nbsp;", ' ');
69 entities(description, "&lt;", '<');
70 entities(description, "&gt;", '>');
71 entities(description, "&quot;", '\"');
72 entities(description, "&amp;", '&');
73 entities(text, "&nbsp;", ' ');
74 entities(text, "&lt;", '<');
75 entities(text, "&gt;", '>');
76 entities(text, "&quot;", '\"');
77 entities(text, "&amp;", '&');
78
79 for (size_t index(0); index < headings.size(); index++)
80 {
81 entities(headings[index], "&nbsp;", ' ');
82 entities(headings[index], "&lt;", '<');
83 entities(headings[index], "&gt;", '>');
84 entities(headings[index], "&quot;", '\"');
85 entities(headings[index], "&amp;", '&');
86 }
87
88 normalize(title);
89 normalize(description);
90 normalize(text);
91
92 for (size_t index0(0); index0 < headings.size(); index0++)
93 {
94 normalize(headings[index0]);
95 }
96 }
97 else
98 {
99 string line;
100
101 while (http.good())
102 {
103 http.getline(line);
104
105 text += line + "\n";
106 }
107
108 normalize(text);
109 }
110
111 page.setSize(http.contentLength());
112 page.setURL(url);
113 page.setTitle(title);
114 page.setDescription(description);
115 page.setText(text);
116 page.setHeadings(headings);
117
118 return true;
119 }
120
121 void Processor::reset()
122 {
123 links.clear();
124
125 page = Page();
126 }
127
128 bool Processor::process(HttpHandler& http, URL& url, string& title, string&
129 description, string& text, vector<string>& headings)
130 {
131 bool inHtml(false), inHead(false), inTitle(false), inBody(false),
132 inHeading(false), inComment(false), follow(true), answer(true);
133 string line, heading;
134
135 while (http.good())
136 {
137 http.getline(line);
138
139 size_t begin(0), startComment(0);
140
141 while (begin < line.length())
142 {
143 size_t open(line.find('<', begin)), close(line.find('>', begin));
144 string next;
145
146 while (close == string::npos && http.good())
147 {
148 http.getline(next);
149
150 line += '\n' + next;
151
152 close = line.find('>', begin);
153 }
154
155 // strangely this is necessary sometimes
156 if (open == string::npos) open = line.find('<', begin);
157
158 string between(line.substr(begin, open - begin)), tag(getTag(line,
159 open, close)), lowerTag(tolower(tag));
160
161 if (inHtml && !inComment)
162 {
163 if (inHead && inTitle)
164 {
165 title += between + '\n';
166 }
167
168 if (inBody)
169 {
170 text += between + '\n';
171 }
172
173 if (inBody && inHeading)
174 {
175 heading += between + '\n';
176 }
177
178 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
179 == 0) || (lowerTag.find("meta ") == 0)) && inHead)
180 {
181 if (lowerTag.find("name=robots") != string::npos ||
182 lowerTag.find("name=\"robots\"") != string::npos ||
183 lowerTag.find("name='robots'") != string::npos)
184 {
185 size_t start(lowerTag.find("content=\"") + 9),
186 finish(lowerTag.find('\"', start));
187 string robots(lowerTag.substr(start, finish - start));
188
189 if ((robots.find("noindex") != string::npos &&
190 robots.find("nofollow") != string::npos) ||
191 robots.find("none") != string::npos)
192 {
193 answer = false;
194 follow = false;
195
196 links.clear();
197
198 return answer;
199 }
200 else if (robots.find("noindex") != string::npos)
201 {
202 answer = false;
203 }
204 else if (robots.find("nofollow") != string::npos)
205 {
206 follow = false;
207
208 links.clear();
209 }
210 }
211 else if (lowerTag.find("name=description") != string::npos
212 || lowerTag.find("name=\"description\"") != string::npos
213 || lowerTag.find("name='description'") != string::npos)
214 {
215 size_t start(lowerTag.find("content=\"") + 9),
216 finish(lowerTag.find('\"', start));
217
218 description = tag.substr(start, finish - start);
219 }
220 }
221
222 if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
223 || (lowerTag.find("a ") == 0)) && inBody && follow)
224 {
225 if (lowerTag.find("href=\"") != string::npos)
226 {
227 size_t start(lowerTag.find("href=\"") + 6),
228 finish(lowerTag.find('\"', start));
229 string link(getLink(tag.substr(start, finish -
230 start), url));
231
232 if (!link.empty()) links.insert(link);
233 }
234 else if (lowerTag.find("href='") != string::npos)
235 {
236 size_t start(lowerTag.find("href='") + 6),
237 finish(lowerTag.find('\'', start));
238 string link(getLink(tag.substr(start, finish -
239 start), url));
240
241 if (!link.empty()) links.insert(link);
242 }
243 else if (lowerTag.find("href=") != string::npos)
244 {
245 size_t start(lowerTag.find("href=") + 5),
246 finish(lowerTag.find(' ', start));
247
248 if (finish < close)
249 {
250 string link(getLink(tag.substr(start, finish -
251 start), url));
252
253 if (!link.empty()) links.insert(link);
254 }
255 else
256 {
257 string link(getLink(tag.substr(start, close -
258 start), url));
259
260 if (!link.empty()) links.insert(link);
261 }
262 }
263 }
264
265 if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
266 0) || (lowerTag.find("img ")) && inBody)
267 {
268 if (lowerTag.find("alt=\"") != string::npos)
269 {
270 size_t start(lowerTag.find("alt=\"") + 5),
271 finish(lowerTag.find('\"', start));
272
273 text += tag.substr(start, finish - start) + ' ';
274
275 if (inHeading) heading += tag.substr(start, finish -
276 start) + ' ';
277 }
278 else if (lowerTag.find("alt=") != string::npos)
279 {
280 size_t start(lowerTag.find("alt=") + 4),
281 finish(lowerTag.find(' ', start));
282
283 if (finish < close)
284 {
285 text += tag.substr(start, finish - start) + ' ';
286
287 if (inHeading) heading += tag.substr(start, finish
288 - start) + ' ';
289 }
290 else
291 {
292 text += tag.substr(start, close - start) + ' ';
293
294 if (inHeading) heading += tag.substr(start, close -
295 start) + ' ';
296 }
297 }
298 }
299 }
300
301 if (lowerTag.find("html") == 0) inHtml = true;
302 if (lowerTag.find("/html") == 0) inHtml = false;
303 if (lowerTag.find("head") == 0) inHead = true;
304 if (lowerTag.find("/head") == 0) inHead = false;
305 if (lowerTag.find("title") == 0) inTitle = true;
306 if (lowerTag.find("/title") == 0) inTitle = false;
307 if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
308 inBody = true;
309 if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
310 inBody = false;
311
312 if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
313 lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
314 lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
315 {
316 heading.erase();
317
318 inHeading = true;
319 }
320
321 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
322 lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
323 lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
324 {
325 if (!heading.empty()) headings.push_back(heading);
326
327 inHeading = false;
328 }
329
330 if (lowerTag.find("!--") == 0)
331 {
332 startComment = open;
333 inComment = true;
334 }
335
336 if (line.find("-->", begin) >= startComment && line.find("-->",
337 begin) != string::npos)
338 {
339 close = line.find("-->", begin) + 2;
340 inComment = false;
341 }
342
343 if (close == string::npos)
344 {
345 begin = close;
346 }
347 else
348 {
349 begin = close + 1;
350 }
351 }
352 }
353
354 return answer;
355 }
356
357 string Processor::getTag(const string& line, size_t open, size_t close)
358 {
359 return line.substr(open + 1, close - open - 1);
360 }

Properties

Name Value
svn:eol-style native
svn:keywords Id