ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 35
Committed: 2003-01-16T17:27:03-08:00 (22 years, 5 months ago) by douglas
File size: 8266 byte(s)
Log Message:
Embedded DTD into the program and removed it from tree.
Added usage() to bad argument handlers.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // Indexer.cpp
50
#include "Indexer.h"

#include <sstream>
52
53 Indexer::Indexer(string& indexFile, set<string>& domains,
54 set<string>& restrictions)
55 {
56 this->indexFile = indexFile;
57 this->domains = domains;
58 this->restrictions = restrictions;
59 }
60
61 void Indexer::index(string& begin)
62 {
63 unsigned separator = indexFile.rfind(slash);
64 string dtd = separator != string::npos ? indexFile.substr(0, separator) +
65 slash + "index.dtd" : "index.dtd";
66
67 ifstream fin(dtd.c_str());
68
69 if (!fin.is_open())
70 {
71 ofstream fout(dtd.c_str());
72
73 fout << "<!ELEMENT index (page*)>\n"
74 << "<!ELEMENT page (address, port?, path, title?, description?, ke"
75 << "ywords?, text,\n"
76 << " heading*)\n"
77 << ">\n"
78 << "<!ELEMENT address (#PCDATA)>\n"
79 << "<!ELEMENT port (#PCDATA)>\n"
80 << "<!ELEMENT path (#PCDATA)>\n"
81 << "<!ELEMENT size (#PCDATA)>\n"
82 << "<!ELEMENT title (#PCDATA)>\n"
83 << "<!ELEMENT description (#PCDATA)>\n"
84 << "<!ELEMENT text (#PCDATA)>\n"
85 << "<!ELEMENT heading (#PCDATA)>\n";
86
87 fout.close();
88 }
89
90 fin.close();
91
92 ofstream fout(indexFile.c_str());
93
94 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
95 << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
96 << "<index>\n";
97
98 URL first(begin);
99
100 index(first, fout);
101
102 fout << "</index>\n";
103
104 fout.close();
105 }
106
107 void Indexer::index(URL& url, ofstream& fout, const string referer)
108 {
109 if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
110 url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
111 pages.end())
112 {
113 if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
114 url.getPort() : "") == checked.end())
115 {
116 robots(url);
117 }
118
119 if (!restricted(url))
120 {
121 if (http.handle(url, referer, true))
122 {
123 if (http.contentType().find("text/plain") == 0 ||
124 http.contentType().find("text/html") == 0)
125 {
126 http.clear();
127 if (!http.handle(url, referer)) exit(1);
128
129 cout << "Indexing " << url << "..." << flush;
130
131 if (processor.process(http, url))
132 {
133 Page page = processor.getPage();
134 fout << page << "\n";
135
136 cout << "done.\n";
137 }
138 else
139 {
140 cout << "canceled.\n";
141 }
142
143 pages.insert(url.getURL());
144 Set pageLinks = processor.getLinks();
145 processor.reset();
146
147 for (SetIterator link = pageLinks.begin(); link !=
148 pageLinks.end(); link++)
149 {
150 if (pages.find(*link) == pages.end())
151 {
152 links.push(URL(*link));
153 referers.push(url.getURL());
154 }
155 }
156 }
157 else
158 {
159 // unhandled content
160 }
161 }
162 else if (http.redirect() != "")
163 {
164 if (pages.find(http.redirect()) == pages.end())
165 {
166 links.push(URL(http.redirect()));
167 referers.push(url.getURL());
168 }
169 }
170
171 http.clear();
172 }
173 }
174
175 if (!links.empty())
176 {
177 URL next = links.front();
178 links.pop();
179
180 string referer = referers.front();
181 referers.pop();
182
183 if (debug) cerr << "next = " << next << "\n";
184
185 index(next, fout, referer);
186 }
187 }
188
189 bool Indexer::restricted(URL& url)
190 {
191 bool answer = false;
192
193 for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
194 itor++)
195 {
196 URL checker = *itor;
197
198 if (url.getAddress() == checker.getAddress() && url.getPort() ==
199 checker.getPort())
200 {
201 if (url.getPath().find(checker.getPath()) == 0)
202 {
203 answer = true;
204 break;
205 }
206 }
207 }
208
209 return answer;
210 }
211
212 void Indexer::robots(URL& url)
213 {
214 URL robots = url;
215 robots.setPath("/robots.txt");
216
217 if (http.handle(robots))
218 {
219 cout << "Checking " << robots << "..." << flush;
220
221 string line;
222
223 bool record = false, hasVersion = false, hasName = false, hasAll =
224 false;
225 robot state = none;
226 Set restrictionsVersion, restrictionsName, restrictionsAll;
227
228 while (http.good())
229 {
230 http.getline(line);
231
232 unsigned comment = line.find('#');
233 if (comment != string::npos) line.erase(comment);
234
235 if (line == "" && comment == string::npos) record = false;
236 if (line == "") continue;
237
238 unsigned colon = line.find(':');
239
240 string field = line.substr(0, colon);
241 string value = line.substr(colon + 1);
242
243 normalize(value);
244
245 if (field == "User-agent" && value == agent(true))
246 {
247 state = version;
248 record = true;
249 hasVersion = true;
250 }
251 else if (field == "User-agent" && value == agent(false))
252 {
253 state = name;
254 record = true;
255 hasName = true;
256 }
257 else if (field == "User-agent" && value == "*")
258 {
259 state = all;
260 record = true;
261 hasAll = true;
262 }
263 else if (field == "Disallow" && record && value == "")
264 {
265 // no restrictions
266 }
267 else if (field == "Disallow" && record)
268 {
269 URL restriction = robots;
270 restriction.setPath(value);
271
272 switch (state)
273 {
274 case version:
275 restrictionsVersion.insert(restriction.getURL());
276 break;
277 case name:
278 restrictionsName.insert(restriction.getURL());
279 break;
280 case all:
281 restrictionsAll.insert(restriction.getURL());
282 break;
283 default:
284 break;
285 }
286 }
287 }
288
289 if (hasVersion)
290 {
291 state = version;
292 }
293 else if (hasName)
294 {
295 state = name;
296 }
297 else if (hasAll)
298 {
299 state = all;
300 }
301 else
302 {
303 state = none;
304 }
305
306 SetIterator itor;
307 switch (state)
308 {
309 case version:
310 for (itor = restrictionsVersion.begin(); itor !=
311 restrictionsVersion.end(); itor++)
312 {
313 restrictions.insert(*itor);
314 }
315 break;
316 case name:
317 for (itor = restrictionsName.begin(); itor !=
318 restrictionsName.end(); itor++)
319 {
320 restrictions.insert(*itor);
321 }
322 break;
323 case all:
324 for (itor = restrictionsAll.begin(); itor !=
325 restrictionsAll.end(); itor++)
326 {
327 restrictions.insert(*itor);
328 }
329 break;
330 default:
331 break;
332 }
333
334 cout << "done.\n";
335 }
336
337 http.clear();
338
339 checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
340 url.getPort() : "");
341 }