ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Processor.cpp
Revision: 334
Committed: 2004-04-05T16:37:41-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 10138 byte(s)
Log Message:
Ah, I just love Subversion!

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Processor
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Processor.hpp"
52
53 Processor::Processor()
54 {
55 page = new Page();
56 }
57
58 Processor::~Processor()
59 {
60 delete page;
61 }
62
63 bool Processor::process(HttpHandler& http, URL& url)
64 {
65 string title, description, text;
66 vector<string> headings;
67
68 if (http.contentType().find("text/html") == 0)
69 {
70 if (!process(http, url, title, description, text, headings)) return
71 false;
72
73 entities(title, "&nbsp;", ' ');
74 entities(title, "&lt;", '<');
75 entities(title, "&gt;", '>');
76 entities(title, "&quot;", '\"');
77 entities(title, "&amp;", '&');
78 entities(description, "&nbsp;", ' ');
79 entities(description, "&lt;", '<');
80 entities(description, "&gt;", '>');
81 entities(description, "&quot;", '\"');
82 entities(description, "&amp;", '&');
83 entities(text, "&nbsp;", ' ');
84 entities(text, "&lt;", '<');
85 entities(text, "&gt;", '>');
86 entities(text, "&quot;", '\"');
87 entities(text, "&amp;", '&');
88
89 for (int index = 0; index < headings.size(); index++)
90 {
91 entities(headings[index], "&nbsp;", ' ');
92 entities(headings[index], "&lt;", '<');
93 entities(headings[index], "&gt;", '>');
94 entities(headings[index], "&quot;", '\"');
95 entities(headings[index], "&amp;", '&');
96 }
97
98 normalize(title);
99 normalize(description);
100 normalize(text);
101
102 for (int index0 = 0; index0 < headings.size(); index0++)
103 {
104 normalize(headings[index0]);
105 }
106 }
107 else
108 {
109 string line;
110
111 while (http.good())
112 {
113 http.getline(line);
114
115 text += line + "\n";
116 }
117
118 normalize(text);
119 }
120
121 page->setSize(http.contentLength());
122 page->setURL(url);
123 page->setTitle(title);
124 page->setDescription(description);
125 page->setText(text);
126 page->setHeadings(headings);
127
128 return true;
129 }
130
131 void Processor::reset()
132 {
133 links.clear();
134
135 delete page;
136
137 page = new Page();
138 }
139
140 bool Processor::process(HttpHandler& http, URL& url, string& title, string&
141 description, string& text, vector<string>& headings)
142 {
143 bool inHtml = false, inHead = false, inTitle = false, inBody = false,
144 inHeading = false, inComment = false, follow = true, answer = true;
145 string line, heading;
146
147 while (http.good())
148 {
149 http.getline(line);
150
151 unsigned begin = 0, startComment = 0;
152
153 while (begin < line.length())
154 {
155 unsigned open = line.find('<', begin);
156 unsigned close = line.find('>', begin);
157 string next;
158
159 while (close == string::npos && http.good())
160 {
161 http.getline(next);
162 line += '\n' + next;
163 close = line.find('>', begin);
164 }
165
166 // strangely this is necessary sometimes
167 if (open == string::npos) open = line.find('<', begin);
168
169 string between = line.substr(begin, open - begin);
170 string tag = getTag(line, open, close);
171 string lowerTag(tag.length(), ' ');
172
173 for (unsigned index = 0; index < tag.length(); index++)
174 {
175 lowerTag[index] = tolower(tag[index]);
176 }
177
178 if (inHtml && !inComment)
179 {
180 if (inHead && inTitle)
181 {
182 title += between + "\n";
183 }
184
185 if (inBody)
186 {
187 text += between + "\n";
188 }
189
190 if (inBody && inHeading)
191 {
192 heading += between + "\n";
193 }
194
195 if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
196 == 0) || (lowerTag.find("meta ") == 0)) && inHead)
197 {
198 if (lowerTag.find("name=robots") != string::npos ||
199 lowerTag.find("name=\"robots\"") != string::npos)
200 {
201 unsigned start = lowerTag.find("content=\"") + 9;
202 unsigned finish = lowerTag.find('\"', start);
203
204 string robots = lowerTag.substr(start, finish - start);
205
206 if ((robots.find("noindex") != string::npos &&
207 robots.find("nofollow") != string::npos) ||
208 robots.find("none") != string::npos)
209 {
210 answer = false;
211 follow = false;
212 links.clear();
213
214 return answer;
215 }
216 else if (robots.find("noindex") != string::npos)
217 {
218 answer = false;
219 }
220 else if (robots.find("nofollow") != string::npos)
221 {
222 follow = false;
223 links.clear();
224 }
225 }
226 else if (lowerTag.find("name=description") != string::npos
227 || lowerTag.find("name=\"description\"") !=
228 string::npos)
229 {
230 unsigned start = lowerTag.find("content=\"") + 9;
231 unsigned finish = lowerTag.find('\"', start);
232
233 description = tag.substr(start, finish - start);
234 }
235 }
236
237 if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
238 || (lowerTag.find("a ") == 0)) && inBody && follow)
239 {
240 if (lowerTag.find("href=\"") != string::npos)
241 {
242 unsigned start = lowerTag.find("href=\"") + 6;
243 unsigned finish = lowerTag.find('\"', start);
244
245 string link = getLink(tag.substr(start, finish -
246 start), url);
247
248 if (link != "") links.insert(link);
249 }
250 else if (lowerTag.find("href=") != string::npos)
251 {
252 unsigned start = lowerTag.find("href=") + 5;
253 unsigned finish = lowerTag.find(' ', start);
254
255 if (finish < close)
256 {
257 string link = getLink(tag.substr(start, finish -
258 start), url);
259
260 if (link != "") links.insert(link);
261 }
262 else
263 {
264 string link = getLink(tag.substr(start, close -
265 start), url);
266
267 if (link != "") links.insert(link);
268 }
269 }
270 }
271
272 if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
273 0) || (lowerTag.find("img ")) && inBody)
274 {
275 if (lowerTag.find("alt=\"") != string::npos)
276 {
277 unsigned start = lowerTag.find("alt=\"") + 5;
278 unsigned finish = lowerTag.find('\"', start);
279
280 text += tag.substr(start, finish - start) + ' ';
281 if (inHeading) heading += tag.substr(start, finish -
282 start) + ' ';
283 }
284 else if (lowerTag.find("alt=") != string::npos)
285 {
286 unsigned start = lowerTag.find("alt=") + 4;
287 unsigned finish = lowerTag.find(' ', start);
288
289 if (finish < close)
290 {
291 text += tag.substr(start, finish - start) + ' ';
292 if (inHeading) heading += tag.substr(start, finish
293 - start) + ' ';
294 }
295 else
296 {
297 text += tag.substr(start, close - start) + ' ';
298 if (inHeading) heading += tag.substr(start, close -
299 start) + ' ';
300 }
301 }
302 }
303 }
304
305 if (lowerTag.find("html") == 0) inHtml = true;
306 if (lowerTag.find("/html") == 0) inHtml = false;
307 if (lowerTag.find("head") == 0) inHead = true;
308 if (lowerTag.find("/head") == 0) inHead = false;
309 if (lowerTag.find("title") == 0) inTitle = true;
310 if (lowerTag.find("/title") == 0) inTitle = false;
311 if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
312 inBody = true;
313 if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
314 inBody = false;
315
316 if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
317 lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
318 lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
319 {
320 heading = "";
321 inHeading = true;
322 }
323
324 if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
325 lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
326 lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
327 {
328 if (heading != "") headings.push_back(heading);
329
330 inHeading = false;
331 }
332
333 if (lowerTag.find("!--") == 0)
334 {
335 startComment = open;
336 inComment = true;
337 }
338
339 if (line.find("-->", begin) >= startComment && line.find("-->",
340 begin) != string::npos)
341 {
342 close = line.find("-->", begin) + 3;
343 inComment = false;
344 }
345
346 if (close == string::npos)
347 {
348 begin = close;
349 }
350 else
351 {
352 begin = close + 1;
353 }
354 }
355 }
356
357 return answer;
358 }
359
360 string Processor::getTag(const string& line, unsigned open, unsigned close)
361 {
362 string tag = line.substr(open + 1, close - open - 1);
363
364 return tag;
365 }

Properties

Name Value
svn:eol-style native
svn:keywords Id