ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 355
Committed: 2004-06-04T04:08:28-07:00 (21 years ago) by Douglas Thrift
File size: 8241 byte(s)
Log Message:
I missed some C++ifying!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
#include "Indexer.hpp"

#include <sstream>
52 douglas 1
53 douglas 37 #ifndef _WIN32
54     #include <unistd.h>
55 douglas 197 #else // _WIN32
56     inline int unlink(const char* filename) { return DeleteFile(filename); }
57 douglas 37 #endif // _WIN32
58    
59 douglas 1 void Indexer::index(string& begin)
60     {
61 Douglas Thrift 348 unsigned separator(indexFile.rfind(slash));
62     string dtd(separator != string::npos ? indexFile.substr(0, separator) +
63     slash + "index.dtd" : "index.dtd");
64 douglas 35 ifstream fin(dtd.c_str());
65    
66     if (!fin.is_open())
67     {
68     ofstream fout(dtd.c_str());
69    
70     fout << "<!ELEMENT index (page*)>\n"
71 douglas 199 << "<!ELEMENT page (address, port?, tls?, path, title?, descriptio"
72     << "n?, keywords?, text, heading*)>\n"
73 douglas 35 << "<!ELEMENT address (#PCDATA)>\n"
74     << "<!ELEMENT port (#PCDATA)>\n"
75 douglas 199 << "<!ELEMENT tls (#PCDATA)>\n"
76 douglas 35 << "<!ELEMENT path (#PCDATA)>\n"
77     << "<!ELEMENT size (#PCDATA)>\n"
78     << "<!ELEMENT title (#PCDATA)>\n"
79     << "<!ELEMENT description (#PCDATA)>\n"
80     << "<!ELEMENT text (#PCDATA)>\n"
81     << "<!ELEMENT heading (#PCDATA)>\n";
82    
83     fout.close();
84     }
85    
86     fin.close();
87    
88 Douglas Thrift 348 string lock(indexFile + ".lock");
89     ofstream fout(lock.c_str());
90 douglas 1
91 douglas 37 fout.close();
92     fout.open(indexFile.c_str());
93    
94 douglas 1 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
95     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
96     << "<index>\n";
97    
98     URL first(begin);
99    
100     index(first, fout);
101 Douglas Thrift 348
102     while (!links.empty())
103     {
104     URL next(links.front());
105     string referer(referers.front());
106 douglas 1
107 Douglas Thrift 348 links.pop();
108     referers.pop();
109    
110     if (debug) cerr << "next = " << next << "\n";
111    
112     index(next, fout, referer);
113     }
114    
115 douglas 1 fout << "</index>\n";
116    
117     fout.close();
118 douglas 37
119     unlink(lock.c_str());
120 douglas 1 }
121    
122 Douglas Thrift 348 void Indexer::index(URL& url, ofstream& fout, const string& referer)
123 douglas 1 {
124 douglas 225 if (domains.find(url.getAddress()) != domains.end() &&
125     pages.find(url.getURL()) == pages.end())
126 douglas 1 {
127 douglas 225 if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" +
128     url.getPort() : string(""))) == checked.end())
129 douglas 1 {
130     robots(url);
131     }
132    
133     if (!restricted(url))
134     {
135 douglas 25 if (http.handle(url, referer, true))
136 douglas 1 {
137 douglas 17 if (http.contentType().find("text/plain") == 0 ||
138     http.contentType().find("text/html") == 0)
139 douglas 1 {
140 douglas 17 http.clear();
141 Douglas Thrift 348
142 douglas 25 if (!http.handle(url, referer)) exit(1);
143 douglas 28
144 douglas 203 cout << "Indexing " << url << " ... " << flush;
145 douglas 1
146     if (processor.process(http, url))
147     {
148 Douglas Thrift 348 Page page(processor.getPage());
149    
150 douglas 1 fout << page << "\n";
151    
152     cout << "done.\n";
153     }
154     else
155     {
156     cout << "canceled.\n";
157     }
158    
159     pages.insert(url.getURL());
160 Douglas Thrift 348
161     Set pageLinks(processor.getLinks());
162    
163 douglas 1 processor.reset();
164    
165 Douglas Thrift 348 for (SetIterator link(pageLinks.begin()); link !=
166 douglas 1 pageLinks.end(); link++)
167     {
168     if (pages.find(*link) == pages.end())
169     {
170 Douglas Thrift 348 links.push(*link);
171 douglas 25 referers.push(url.getURL());
172 douglas 1 }
173     }
174     }
175 douglas 17 else
176     {
177     // unhandled content
178     }
179 douglas 1 }
180 Douglas Thrift 355 else if (!http.redirect().empty())
181 douglas 17 {
182     if (pages.find(http.redirect()) == pages.end())
183     {
184 Douglas Thrift 348 links.push(http.redirect());
185 douglas 25 referers.push(url.getURL());
186 douglas 17 }
187     }
188 douglas 1
189     http.clear();
190     }
191     }
192     }
193    
194     bool Indexer::restricted(URL& url)
195     {
196 Douglas Thrift 348 bool answer(false);
197 douglas 1
198 Douglas Thrift 348 for (SetIterator itor(restrictions.begin()); itor != restrictions.end();
199 douglas 1 itor++)
200     {
201 Douglas Thrift 348 URL checker(*itor);
202 douglas 1
203     if (url.getAddress() == checker.getAddress() && url.getPort() ==
204     checker.getPort())
205     {
206     if (url.getPath().find(checker.getPath()) == 0)
207     {
208     answer = true;
209 Douglas Thrift 348
210 douglas 1 break;
211     }
212     }
213     }
214    
215     return answer;
216     }
217    
218     void Indexer::robots(URL& url)
219     {
220 Douglas Thrift 348 URL robots(url);
221    
222 douglas 12 robots.setPath("/robots.txt");
223 douglas 1
224 douglas 14 if (http.handle(robots))
225     {
226 douglas 203 cout << "Checking " << robots << " ... " << flush;
227 douglas 1
228     string line;
229 Douglas Thrift 348 bool record(false), hasVersion(false), hasName(false), hasAll(false);
230     robot state(none);
231 douglas 1 Set restrictionsVersion, restrictionsName, restrictionsAll;
232    
233     while (http.good())
234     {
235     http.getline(line);
236    
237 Douglas Thrift 348 unsigned comment(line.find('#'));
238    
239 douglas 1 if (comment != string::npos) line.erase(comment);
240    
241 Douglas Thrift 355 if (line.empty() && comment == string::npos) record = false;
242     if (line.empty()) continue;
243 douglas 1
244 Douglas Thrift 348 unsigned colon(line.find(':'));
245     string field(line.substr(0, colon));
246     string value(line.substr(colon + 1));
247 douglas 1
248     normalize(value);
249    
250 douglas 12 if (field == "User-agent" && value == agent(true))
251 douglas 1 {
252     state = version;
253     record = true;
254     hasVersion = true;
255     }
256 douglas 12 else if (field == "User-agent" && value == agent(false))
257 douglas 1 {
258     state = name;
259     record = true;
260     hasName = true;
261     }
262     else if (field == "User-agent" && value == "*")
263     {
264     state = all;
265     record = true;
266     hasAll = true;
267     }
268 Douglas Thrift 355 else if (field == "Disallow" && record && value.empty())
269 douglas 1 {
270     // no restrictions
271     }
272     else if (field == "Disallow" && record)
273     {
274 Douglas Thrift 348 URL restriction(robots);
275    
276 douglas 1 restriction.setPath(value);
277    
278     switch (state)
279     {
280     case version:
281     restrictionsVersion.insert(restriction.getURL());
282     break;
283     case name:
284     restrictionsName.insert(restriction.getURL());
285     break;
286     case all:
287     restrictionsAll.insert(restriction.getURL());
288     break;
289     }
290     }
291     }
292    
293     if (hasVersion)
294     {
295     state = version;
296     }
297     else if (hasName)
298     {
299     state = name;
300     }
301     else if (hasAll)
302     {
303     state = all;
304     }
305     else
306     {
307     state = none;
308     }
309    
310     SetIterator itor;
311 Douglas Thrift 348
312 douglas 1 switch (state)
313     {
314     case version:
315     for (itor = restrictionsVersion.begin(); itor !=
316     restrictionsVersion.end(); itor++)
317     {
318     restrictions.insert(*itor);
319     }
320     break;
321     case name:
322     for (itor = restrictionsName.begin(); itor !=
323     restrictionsName.end(); itor++)
324     {
325     restrictions.insert(*itor);
326     }
327     break;
328     case all:
329 Douglas Thrift 348 for (itor = restrictionsAll.begin(); itor != restrictionsAll.end();
330     itor++)
331 douglas 1 {
332     restrictions.insert(*itor);
333     }
334     break;
335     }
336 douglas 14
337     cout << "done.\n";
338 douglas 1 }
339    
340     http.clear();
341    
342     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
343     url.getPort() : "");
344     }

Properties

Name Value
svn:eol-style native
svn:keywords Id