ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 199
Committed: 2003-07-15T00:22:06-07:00 (21 years, 11 months ago) by douglas
File size: 8575 byte(s)
Log Message:
Did more OpenSSL stuff, like the version output.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // $Id: Indexer.cpp,v 1.13 2003/07/15 07:22:06 douglas Exp $
50
#include "Indexer.h"

#include <cctype>
#include <sstream>
52
53 #ifndef _WIN32
54 #include <unistd.h>
55 #else // _WIN32
56 inline int unlink(const char* filename) { return DeleteFile(filename); }
57 #endif // _WIN32
58
59 Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
60 restrictions)
61 {
62 this->indexFile = indexFile;
63 this->domains = domains;
64 this->restrictions = restrictions;
65 }
66
67 void Indexer::index(string& begin)
68 {
69 unsigned separator = indexFile.rfind(slash);
70 string dtd = separator != string::npos ? indexFile.substr(0, separator) +
71 slash + "index.dtd" : "index.dtd";
72
73 ifstream fin(dtd.c_str());
74
75 if (!fin.is_open())
76 {
77 ofstream fout(dtd.c_str());
78
79 fout << "<!ELEMENT index (page*)>\n"
80 << "<!ELEMENT page (address, port?, tls?, path, title?, descriptio"
81 << "n?, keywords?, text, heading*)>\n"
82 << "<!ELEMENT address (#PCDATA)>\n"
83 << "<!ELEMENT port (#PCDATA)>\n"
84 << "<!ELEMENT tls (#PCDATA)>\n"
85 << "<!ELEMENT path (#PCDATA)>\n"
86 << "<!ELEMENT size (#PCDATA)>\n"
87 << "<!ELEMENT title (#PCDATA)>\n"
88 << "<!ELEMENT description (#PCDATA)>\n"
89 << "<!ELEMENT text (#PCDATA)>\n"
90 << "<!ELEMENT heading (#PCDATA)>\n";
91
92 fout.close();
93 }
94
95 fin.close();
96
97 string lock = indexFile + ".lock";
98
99 ofstream fout(lock.c_str());
100 fout.close();
101 fout.open(indexFile.c_str());
102
103 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
104 << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
105 << "<index>\n";
106
107 URL first(begin);
108
109 index(first, fout);
110
111 fout << "</index>\n";
112
113 fout.close();
114
115 unlink(lock.c_str());
116 }
117
118 void Indexer::index(URL& url, ofstream& fout, const string referer)
119 {
120 if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
121 url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
122 pages.end())
123 {
124 if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
125 url.getPort() : "") == checked.end())
126 {
127 robots(url);
128 }
129
130 if (!restricted(url))
131 {
132 if (http.handle(url, referer, true))
133 {
134 if (http.contentType().find("text/plain") == 0 ||
135 http.contentType().find("text/html") == 0)
136 {
137 http.clear();
138 if (!http.handle(url, referer)) exit(1);
139
140 cout << "Indexing " << url << "..." << flush;
141
142 if (processor.process(http, url))
143 {
144 Page page = processor.getPage();
145 fout << page << "\n";
146
147 cout << "done.\n";
148 }
149 else
150 {
151 cout << "canceled.\n";
152 }
153
154 pages.insert(url.getURL());
155 Set pageLinks = processor.getLinks();
156 processor.reset();
157
158 for (SetIterator link = pageLinks.begin(); link !=
159 pageLinks.end(); link++)
160 {
161 if (pages.find(*link) == pages.end())
162 {
163 links.push(URL(*link));
164 referers.push(url.getURL());
165 }
166 }
167 }
168 else
169 {
170 // unhandled content
171 }
172 }
173 else if (http.redirect() != "")
174 {
175 if (pages.find(http.redirect()) == pages.end())
176 {
177 links.push(URL(http.redirect()));
178 referers.push(url.getURL());
179 }
180 }
181
182 http.clear();
183 }
184 }
185
186 if (!links.empty())
187 {
188 URL next = links.front();
189 links.pop();
190
191 string referer = referers.front();
192 referers.pop();
193
194 if (debug) cerr << "next = " << next << "\n";
195
196 index(next, fout, referer);
197 }
198 }
199
200 bool Indexer::restricted(URL& url)
201 {
202 bool answer = false;
203
204 for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
205 itor++)
206 {
207 URL checker = *itor;
208
209 if (url.getAddress() == checker.getAddress() && url.getPort() ==
210 checker.getPort())
211 {
212 if (url.getPath().find(checker.getPath()) == 0)
213 {
214 answer = true;
215 break;
216 }
217 }
218 }
219
220 return answer;
221 }
222
223 void Indexer::robots(URL& url)
224 {
225 URL robots = url;
226 robots.setPath("/robots.txt");
227
228 if (http.handle(robots))
229 {
230 cout << "Checking " << robots << "..." << flush;
231
232 string line;
233
234 bool record = false, hasVersion = false, hasName = false, hasAll =
235 false;
236 robot state = none;
237 Set restrictionsVersion, restrictionsName, restrictionsAll;
238
239 while (http.good())
240 {
241 http.getline(line);
242
243 unsigned comment = line.find('#');
244 if (comment != string::npos) line.erase(comment);
245
246 if (line == "" && comment == string::npos) record = false;
247 if (line == "") continue;
248
249 unsigned colon = line.find(':');
250
251 string field = line.substr(0, colon);
252 string value = line.substr(colon + 1);
253
254 normalize(value);
255
256 if (field == "User-agent" && value == agent(true))
257 {
258 state = version;
259 record = true;
260 hasVersion = true;
261 }
262 else if (field == "User-agent" && value == agent(false))
263 {
264 state = name;
265 record = true;
266 hasName = true;
267 }
268 else if (field == "User-agent" && value == "*")
269 {
270 state = all;
271 record = true;
272 hasAll = true;
273 }
274 else if (field == "Disallow" && record && value == "")
275 {
276 // no restrictions
277 }
278 else if (field == "Disallow" && record)
279 {
280 URL restriction = robots;
281 restriction.setPath(value);
282
283 switch (state)
284 {
285 case version:
286 restrictionsVersion.insert(restriction.getURL());
287 break;
288 case name:
289 restrictionsName.insert(restriction.getURL());
290 break;
291 case all:
292 restrictionsAll.insert(restriction.getURL());
293 break;
294 default:
295 break;
296 }
297 }
298 }
299
300 if (hasVersion)
301 {
302 state = version;
303 }
304 else if (hasName)
305 {
306 state = name;
307 }
308 else if (hasAll)
309 {
310 state = all;
311 }
312 else
313 {
314 state = none;
315 }
316
317 SetIterator itor;
318 switch (state)
319 {
320 case version:
321 for (itor = restrictionsVersion.begin(); itor !=
322 restrictionsVersion.end(); itor++)
323 {
324 restrictions.insert(*itor);
325 }
326 break;
327 case name:
328 for (itor = restrictionsName.begin(); itor !=
329 restrictionsName.end(); itor++)
330 {
331 restrictions.insert(*itor);
332 }
333 break;
334 case all:
335 for (itor = restrictionsAll.begin(); itor !=
336 restrictionsAll.end(); itor++)
337 {
338 restrictions.insert(*itor);
339 }
340 break;
341 default:
342 break;
343 }
344
345 cout << "done.\n";
346 }
347
348 http.clear();
349
350 checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
351 url.getPort() : "");
352 }