ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 360
Committed: 2004-08-19T20:09:52-07:00 (20 years, 10 months ago) by Douglas Thrift
File size: 9803 byte(s)
Log Message:
Did a lot of stuff, including fixing a bug.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Processor
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Processor.hpp"
52
53 bool Processor::process(HttpHandler& http, URL& url)
54 {
55 string title, description, text;
56 vector<string> headings;
57
58 if (http.contentType().find("text/html") == 0)
59 {
60 if (!process(http, url, title, description, text, headings)) return
61 false;
62
63 entities(title, "&nbsp;", ' ');
64 entities(title, "&lt;", '<');
65 entities(title, "&gt;", '>');
66 entities(title, "&quot;", '\"');
67 entities(title, "&amp;", '&');
68 entities(description, "&nbsp;", ' ');
69 entities(description, "&lt;", '<');
70 entities(description, "&gt;", '>');
71 entities(description, "&quot;", '\"');
72 entities(description, "&amp;", '&');
73 entities(text, "&nbsp;", ' ');
74 entities(text, "&lt;", '<');
75 entities(text, "&gt;", '>');
76 entities(text, "&quot;", '\"');
77 entities(text, "&amp;", '&');
78
79 for (size_t index(0); index < headings.size(); index++)
80 {
81 entities(headings[index], "&nbsp;", ' ');
82 entities(headings[index], "&lt;", '<');
83 entities(headings[index], "&gt;", '>');
84 entities(headings[index], "&quot;", '\"');
85 entities(headings[index], "&amp;", '&');
86 }
87
88 normalize(title);
89 normalize(description);
90 normalize(text);
91
92 for (size_t index0(0); index0 < headings.size(); index0++)
93 {
94 normalize(headings[index0]);
95 }
96 }
97 else
98 {
99 string line;
100
101 while (http.good())
102 {
103 http.getline(line);
104
105 text += line + "\n";
106 }
107
108 normalize(text);
109 }
110
111 page.setSize(http.contentLength());
112 page.setURL(url);
113 page.setTitle(title);
114 page.setDescription(description);
115 page.setText(text);
116 page.setHeadings(headings);
117
118 return true;
119 }
120
121 void Processor::reset()
122 {
123 links.clear();
124
125 page = Page();
126 }
127
128 bool Processor::process(HttpHandler& http, URL& url, string& title, string&
129 description, string& text, vector<string>& headings)
130 {
131 bool inHtml(false), inHead(false), inTitle(false), inBody(false),
132 inHeading(false), inComment(false), follow(true), answer(true);
133 string line, heading;
134
135 while (http.good())
136 {
137 http.getline(line);
138
139 unsigned begin(0), startComment(0);
140
141 while (begin < line.length())
142 {
143 unsigned open(line.find('<', begin)), close(line.find('>', begin));
144 string next;
145
146 while (close == string::npos && http.good())
147 {
148 http.getline(next);
149
150 line += '\n' + next;
151
152 close = line.find('>', begin);
153 }
154
155 // strangely this is necessary sometimes
156 if (open == string::npos) open = line.find('<', begin);
157
158 string between(line.substr(begin, open - begin)), tag(getTag(line,
159 open, close)), lowerTag(tolower(tag));
160
161 if (inHtml && !inComment)
162 {
163 if (inHead && inTitle)
164 {
165 title += between + '\n';
166 }
167
168 if (inBody)
169 {
170 text += between + '\n';
171 }
172
173 if (inBody && inHeading)
174 {
175 heading += between + '\n';
176 }
177
178 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
179 == 0) || (lowerTag.find("meta ") == 0)) && inHead)
180 {
181 if (lowerTag.find("name=robots") != string::npos ||
182 lowerTag.find("name=\"robots\"") != string::npos)
183 {
184 unsigned start(lowerTag.find("content=\"") + 9),
185 finish(lowerTag.find('\"', start));
186 string robots(lowerTag.substr(start, finish - start));
187
188 if ((robots.find("noindex") != string::npos &&
189 robots.find("nofollow") != string::npos) ||
190 robots.find("none") != string::npos)
191 {
192 answer = false;
193 follow = false;
194
195 links.clear();
196
197 return answer;
198 }
199 else if (robots.find("noindex") != string::npos)
200 {
201 answer = false;
202 }
203 else if (robots.find("nofollow") != string::npos)
204 {
205 follow = false;
206
207 links.clear();
208 }
209 }
210 else if (lowerTag.find("name=description") != string::npos
211 || lowerTag.find("name=\"description\"") !=
212 string::npos)
213 {
214 unsigned start(lowerTag.find("content=\"") + 9),
215 finish(lowerTag.find('\"', start));
216
217 description = tag.substr(start, finish - start);
218 }
219 }
220
221 if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
222 || (lowerTag.find("a ") == 0)) && inBody && follow)
223 {
224 if (lowerTag.find("href=\"") != string::npos)
225 {
226 unsigned start(lowerTag.find("href=\"") + 6),
227 finish(lowerTag.find('\"', start));
228 string link(getLink(tag.substr(start, finish -
229 start), url));
230
231 if (!link.empty()) links.insert(link);
232 }
233 else if (lowerTag.find("href=") != string::npos)
234 {
235 unsigned start(lowerTag.find("href=") + 5),
236 finish(lowerTag.find(' ', start));
237
238 if (finish < close)
239 {
240 string link(getLink(tag.substr(start, finish -
241 start), url));
242
243 if (!link.empty()) links.insert(link);
244 }
245 else
246 {
247 string link(getLink(tag.substr(start, close -
248 start), url));
249
250 if (!link.empty()) links.insert(link);
251 }
252 }
253 }
254
255 if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
256 0) || (lowerTag.find("img ")) && inBody)
257 {
258 if (lowerTag.find("alt=\"") != string::npos)
259 {
260 unsigned start(lowerTag.find("alt=\"") + 5),
261 finish(lowerTag.find('\"', start));
262
263 text += tag.substr(start, finish - start) + ' ';
264
265 if (inHeading) heading += tag.substr(start, finish -
266 start) + ' ';
267 }
268 else if (lowerTag.find("alt=") != string::npos)
269 {
270 unsigned start(lowerTag.find("alt=") + 4),
271 finish(lowerTag.find(' ', start));
272
273 if (finish < close)
274 {
275 text += tag.substr(start, finish - start) + ' ';
276
277 if (inHeading) heading += tag.substr(start, finish
278 - start) + ' ';
279 }
280 else
281 {
282 text += tag.substr(start, close - start) + ' ';
283
284 if (inHeading) heading += tag.substr(start, close -
285 start) + ' ';
286 }
287 }
288 }
289 }
290
291 if (lowerTag.find("html") == 0) inHtml = true;
292 if (lowerTag.find("/html") == 0) inHtml = false;
293 if (lowerTag.find("head") == 0) inHead = true;
294 if (lowerTag.find("/head") == 0) inHead = false;
295 if (lowerTag.find("title") == 0) inTitle = true;
296 if (lowerTag.find("/title") == 0) inTitle = false;
297 if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
298 inBody = true;
299 if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
300 inBody = false;
301
302 if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
303 lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
304 lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
305 {
306 heading.erase();
307
308 inHeading = true;
309 }
310
311 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
312 lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
313 lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
314 {
315 if (!heading.empty()) headings.push_back(heading);
316
317 inHeading = false;
318 }
319
320 if (lowerTag.find("!--") == 0)
321 {
322 startComment = open;
323 inComment = true;
324 }
325
326 if (line.find("-->", begin) >= startComment && line.find("-->",
327 begin) != string::npos)
328 {
329 close = line.find("-->", begin) + 2;
330 inComment = false;
331 }
332
333 if (close == string::npos)
334 {
335 begin = close;
336 }
337 else
338 {
339 begin = close + 1;
340 }
341 }
342 }
343
344 return answer;
345 }
346
347 string Processor::getTag(const string& line, unsigned open, unsigned close)
348 {
349 return line.substr(open + 1, close - open - 1);
350 }

Properties

Name Value
svn:eol-style native
svn:keywords Id