ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 25
Committed: 2002-12-22T23:32:58-08:00 (22 years, 6 months ago) by douglas
File size: 7516 byte(s)
Log Message:
Added "referer" handling to Indexer and HttpHandler.handle().

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // Indexer.cpp
50
#include "Indexer.h"

#include <cctype>
#include <sstream>
52
53 Indexer::Indexer(string& indexFile, set<string>& domains,
54 set<string>& restrictions)
55 {
56 this->indexFile = indexFile;
57 this->domains = domains;
58 this->restrictions = restrictions;
59 }
60
61 void Indexer::index(string& begin)
62 {
63 ofstream fout(indexFile.c_str());
64
65 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
66 << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
67 << "<index>\n";
68
69 URL first(begin);
70
71 index(first, fout);
72
73 fout << "</index>\n";
74
75 fout.close();
76 }
77
78 void Indexer::index(URL& url, ofstream& fout, const string referer)
79 {
80 if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
81 url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
82 pages.end())
83 {
84 if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
85 url.getPort() : "") == checked.end())
86 {
87 robots(url);
88 }
89
90 if (!restricted(url))
91 {
92 if (http.handle(url, referer, true))
93 {
94 if (http.contentType().find("text/plain") == 0 ||
95 http.contentType().find("text/html") == 0)
96 {
97 http.clear();
98 if (!http.handle(url, referer)) exit(1);
99
100 cout << "Indexing " << url << "..." << flush;
101
102 if (processor.process(http, url))
103 {
104 Page page = processor.getPage();
105 fout << page << "\n";
106
107 cout << "done.\n";
108 }
109 else
110 {
111 cout << "canceled.\n";
112 }
113
114 pages.insert(url.getURL());
115 Set pageLinks = processor.getLinks();
116 processor.reset();
117
118 for (SetIterator link = pageLinks.begin(); link !=
119 pageLinks.end(); link++)
120 {
121 if (pages.find(*link) == pages.end())
122 {
123 links.push(URL(*link));
124 referers.push(url.getURL());
125 }
126 }
127 }
128 else
129 {
130 // unhandled content
131 }
132 }
133 else if (http.redirect() != "")
134 {
135 if (pages.find(http.redirect()) == pages.end())
136 {
137 links.push(URL(http.redirect()));
138 referers.push(url.getURL());
139 }
140 }
141
142 http.clear();
143 }
144 }
145
146 if (!links.empty())
147 {
148 URL next = links.front();
149 links.pop();
150
151 string referer = referers.front();
152 referers.pop();
153
154 if (debug) cerr << "next = " << next << "\n";
155
156 index(next, fout, referer);
157 }
158 }
159
160 bool Indexer::restricted(URL& url)
161 {
162 bool answer = false;
163
164 for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
165 itor++)
166 {
167 URL checker = *itor;
168
169 if (url.getAddress() == checker.getAddress() && url.getPort() ==
170 checker.getPort())
171 {
172 if (url.getPath().find(checker.getPath()) == 0)
173 {
174 answer = true;
175 break;
176 }
177 }
178 }
179
180 return answer;
181 }
182
183 void Indexer::robots(URL& url)
184 {
185 URL robots = url;
186 robots.setPath("/robots.txt");
187
188 if (http.handle(robots))
189 {
190 cout << "Checking " << robots << "..." << flush;
191
192 string line;
193
194 bool record = false, hasVersion = false, hasName = false, hasAll =
195 false;
196 robot state = none;
197 Set restrictionsVersion, restrictionsName, restrictionsAll;
198
199 while (http.good())
200 {
201 http.getline(line);
202
203 unsigned comment = line.find('#');
204 if (comment != string::npos) line.erase(comment);
205
206 if (line == "" && comment == string::npos) record = false;
207 if (line == "") continue;
208
209 unsigned colon = line.find(':');
210
211 string field = line.substr(0, colon);
212 string value = line.substr(colon + 1);
213
214 normalize(value);
215
216 if (field == "User-agent" && value == agent(true))
217 {
218 state = version;
219 record = true;
220 hasVersion = true;
221 }
222 else if (field == "User-agent" && value == agent(false))
223 {
224 state = name;
225 record = true;
226 hasName = true;
227 }
228 else if (field == "User-agent" && value == "*")
229 {
230 state = all;
231 record = true;
232 hasAll = true;
233 }
234 else if (field == "Disallow" && record && value == "")
235 {
236 // no restrictions
237 }
238 else if (field == "Disallow" && record)
239 {
240 URL restriction = robots;
241 restriction.setPath(value);
242
243 switch (state)
244 {
245 case version:
246 restrictionsVersion.insert(restriction.getURL());
247 break;
248 case name:
249 restrictionsName.insert(restriction.getURL());
250 break;
251 case all:
252 restrictionsAll.insert(restriction.getURL());
253 break;
254 default:
255 break;
256 }
257 }
258 }
259
260 if (hasVersion)
261 {
262 state = version;
263 }
264 else if (hasName)
265 {
266 state = name;
267 }
268 else if (hasAll)
269 {
270 state = all;
271 }
272 else
273 {
274 state = none;
275 }
276
277 SetIterator itor;
278 switch (state)
279 {
280 case version:
281 for (itor = restrictionsVersion.begin(); itor !=
282 restrictionsVersion.end(); itor++)
283 {
284 restrictions.insert(*itor);
285 }
286 break;
287 case name:
288 for (itor = restrictionsName.begin(); itor !=
289 restrictionsName.end(); itor++)
290 {
291 restrictions.insert(*itor);
292 }
293 break;
294 case all:
295 for (itor = restrictionsAll.begin(); itor !=
296 restrictionsAll.end(); itor++)
297 {
298 restrictions.insert(*itor);
299 }
300 break;
301 default:
302 break;
303 }
304
305 cout << "done.\n";
306 }
307
308 http.clear();
309
310 checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
311 url.getPort() : "");
312 }