ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 37
Committed: 2003-01-16T22:24:01-08:00 (22 years, 5 months ago) by douglas
File size: 8421 byte(s)
Log Message:
Added index file locking during indexing and handling during searching.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // Indexer.cpp
50
#include "Indexer.h"

#include <cctype>
#include <sstream>

#ifndef _WIN32
#include <unistd.h>
#endif // _WIN32
56
57 Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
58 restrictions)
59 {
60 this->indexFile = indexFile;
61 this->domains = domains;
62 this->restrictions = restrictions;
63 }
64
65 void Indexer::index(string& begin)
66 {
67 unsigned separator = indexFile.rfind(slash);
68 string dtd = separator != string::npos ? indexFile.substr(0, separator) +
69 slash + "index.dtd" : "index.dtd";
70
71 ifstream fin(dtd.c_str());
72
73 if (!fin.is_open())
74 {
75 ofstream fout(dtd.c_str());
76
77 fout << "<!ELEMENT index (page*)>\n"
78 << "<!ELEMENT page (address, port?, path, title?, description?, ke"
79 << "ywords?, text,\n"
80 << " heading*)\n"
81 << ">\n"
82 << "<!ELEMENT address (#PCDATA)>\n"
83 << "<!ELEMENT port (#PCDATA)>\n"
84 << "<!ELEMENT path (#PCDATA)>\n"
85 << "<!ELEMENT size (#PCDATA)>\n"
86 << "<!ELEMENT title (#PCDATA)>\n"
87 << "<!ELEMENT description (#PCDATA)>\n"
88 << "<!ELEMENT text (#PCDATA)>\n"
89 << "<!ELEMENT heading (#PCDATA)>\n";
90
91 fout.close();
92 }
93
94 fin.close();
95
96 string lock = indexFile + ".lock";
97
98 ofstream fout(lock.c_str());
99 fout.close();
100 fout.open(indexFile.c_str());
101
102 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
103 << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
104 << "<index>\n";
105
106 URL first(begin);
107
108 index(first, fout);
109
110 fout << "</index>\n";
111
112 fout.close();
113
114 unlink(lock.c_str());
115 }
116
117 void Indexer::index(URL& url, ofstream& fout, const string referer)
118 {
119 if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
120 url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
121 pages.end())
122 {
123 if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
124 url.getPort() : "") == checked.end())
125 {
126 robots(url);
127 }
128
129 if (!restricted(url))
130 {
131 if (http.handle(url, referer, true))
132 {
133 if (http.contentType().find("text/plain") == 0 ||
134 http.contentType().find("text/html") == 0)
135 {
136 http.clear();
137 if (!http.handle(url, referer)) exit(1);
138
139 cout << "Indexing " << url << "..." << flush;
140
141 if (processor.process(http, url))
142 {
143 Page page = processor.getPage();
144 fout << page << "\n";
145
146 cout << "done.\n";
147 }
148 else
149 {
150 cout << "canceled.\n";
151 }
152
153 pages.insert(url.getURL());
154 Set pageLinks = processor.getLinks();
155 processor.reset();
156
157 for (SetIterator link = pageLinks.begin(); link !=
158 pageLinks.end(); link++)
159 {
160 if (pages.find(*link) == pages.end())
161 {
162 links.push(URL(*link));
163 referers.push(url.getURL());
164 }
165 }
166 }
167 else
168 {
169 // unhandled content
170 }
171 }
172 else if (http.redirect() != "")
173 {
174 if (pages.find(http.redirect()) == pages.end())
175 {
176 links.push(URL(http.redirect()));
177 referers.push(url.getURL());
178 }
179 }
180
181 http.clear();
182 }
183 }
184
185 if (!links.empty())
186 {
187 URL next = links.front();
188 links.pop();
189
190 string referer = referers.front();
191 referers.pop();
192
193 if (debug) cerr << "next = " << next << "\n";
194
195 index(next, fout, referer);
196 }
197 }
198
199 bool Indexer::restricted(URL& url)
200 {
201 bool answer = false;
202
203 for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
204 itor++)
205 {
206 URL checker = *itor;
207
208 if (url.getAddress() == checker.getAddress() && url.getPort() ==
209 checker.getPort())
210 {
211 if (url.getPath().find(checker.getPath()) == 0)
212 {
213 answer = true;
214 break;
215 }
216 }
217 }
218
219 return answer;
220 }
221
222 void Indexer::robots(URL& url)
223 {
224 URL robots = url;
225 robots.setPath("/robots.txt");
226
227 if (http.handle(robots))
228 {
229 cout << "Checking " << robots << "..." << flush;
230
231 string line;
232
233 bool record = false, hasVersion = false, hasName = false, hasAll =
234 false;
235 robot state = none;
236 Set restrictionsVersion, restrictionsName, restrictionsAll;
237
238 while (http.good())
239 {
240 http.getline(line);
241
242 unsigned comment = line.find('#');
243 if (comment != string::npos) line.erase(comment);
244
245 if (line == "" && comment == string::npos) record = false;
246 if (line == "") continue;
247
248 unsigned colon = line.find(':');
249
250 string field = line.substr(0, colon);
251 string value = line.substr(colon + 1);
252
253 normalize(value);
254
255 if (field == "User-agent" && value == agent(true))
256 {
257 state = version;
258 record = true;
259 hasVersion = true;
260 }
261 else if (field == "User-agent" && value == agent(false))
262 {
263 state = name;
264 record = true;
265 hasName = true;
266 }
267 else if (field == "User-agent" && value == "*")
268 {
269 state = all;
270 record = true;
271 hasAll = true;
272 }
273 else if (field == "Disallow" && record && value == "")
274 {
275 // no restrictions
276 }
277 else if (field == "Disallow" && record)
278 {
279 URL restriction = robots;
280 restriction.setPath(value);
281
282 switch (state)
283 {
284 case version:
285 restrictionsVersion.insert(restriction.getURL());
286 break;
287 case name:
288 restrictionsName.insert(restriction.getURL());
289 break;
290 case all:
291 restrictionsAll.insert(restriction.getURL());
292 break;
293 default:
294 break;
295 }
296 }
297 }
298
299 if (hasVersion)
300 {
301 state = version;
302 }
303 else if (hasName)
304 {
305 state = name;
306 }
307 else if (hasAll)
308 {
309 state = all;
310 }
311 else
312 {
313 state = none;
314 }
315
316 SetIterator itor;
317 switch (state)
318 {
319 case version:
320 for (itor = restrictionsVersion.begin(); itor !=
321 restrictionsVersion.end(); itor++)
322 {
323 restrictions.insert(*itor);
324 }
325 break;
326 case name:
327 for (itor = restrictionsName.begin(); itor !=
328 restrictionsName.end(); itor++)
329 {
330 restrictions.insert(*itor);
331 }
332 break;
333 case all:
334 for (itor = restrictionsAll.begin(); itor !=
335 restrictionsAll.end(); itor++)
336 {
337 restrictions.insert(*itor);
338 }
339 break;
340 default:
341 break;
342 }
343
344 cout << "done.\n";
345 }
346
347 http.clear();
348
349 checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
350 url.getPort() : "");
351 }