ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 199
Committed: 2003-07-15T00:22:06-07:00 (21 years, 11 months ago) by douglas
File size: 8575 byte(s)
Log Message:
Did more OpenSSL stuff, like the version output.

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 28 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49 douglas 199 // $Id: Indexer.cpp,v 1.13 2003/07/15 07:22:06 douglas Exp $
50 douglas 1
#include "Indexer.h"

#include <cctype>
#include <sstream>
52    
#ifndef _WIN32
#include <unistd.h>
#else // _WIN32
// Minimal stand-in for POSIX unlink() on Windows.
//
// DeleteFile() reports success with a non-zero value, whereas unlink()
// reports success with 0 and failure with -1, so the result is translated
// rather than passed straight through.
inline int unlink(const char* filename)
{
	return DeleteFile(filename) ? 0 : -1;
}
#endif // _WIN32
58    
59     Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
60     restrictions)
61 douglas 1 {
62     this->indexFile = indexFile;
63     this->domains = domains;
64     this->restrictions = restrictions;
65     }
66    
67     void Indexer::index(string& begin)
68     {
69 douglas 35 unsigned separator = indexFile.rfind(slash);
70     string dtd = separator != string::npos ? indexFile.substr(0, separator) +
71     slash + "index.dtd" : "index.dtd";
72    
73     ifstream fin(dtd.c_str());
74    
75     if (!fin.is_open())
76     {
77     ofstream fout(dtd.c_str());
78    
79     fout << "<!ELEMENT index (page*)>\n"
80 douglas 199 << "<!ELEMENT page (address, port?, tls?, path, title?, descriptio"
81     << "n?, keywords?, text, heading*)>\n"
82 douglas 35 << "<!ELEMENT address (#PCDATA)>\n"
83     << "<!ELEMENT port (#PCDATA)>\n"
84 douglas 199 << "<!ELEMENT tls (#PCDATA)>\n"
85 douglas 35 << "<!ELEMENT path (#PCDATA)>\n"
86     << "<!ELEMENT size (#PCDATA)>\n"
87     << "<!ELEMENT title (#PCDATA)>\n"
88     << "<!ELEMENT description (#PCDATA)>\n"
89     << "<!ELEMENT text (#PCDATA)>\n"
90     << "<!ELEMENT heading (#PCDATA)>\n";
91    
92     fout.close();
93     }
94    
95     fin.close();
96    
97 douglas 37 string lock = indexFile + ".lock";
98 douglas 1
99 douglas 37 ofstream fout(lock.c_str());
100     fout.close();
101     fout.open(indexFile.c_str());
102    
103 douglas 1 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
104     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
105     << "<index>\n";
106    
107     URL first(begin);
108    
109     index(first, fout);
110    
111     fout << "</index>\n";
112    
113     fout.close();
114 douglas 37
115     unlink(lock.c_str());
116 douglas 1 }
117    
118 douglas 25 void Indexer::index(URL& url, ofstream& fout, const string referer)
119 douglas 1 {
120     if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
121     url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
122     pages.end())
123     {
124     if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
125     url.getPort() : "") == checked.end())
126     {
127     robots(url);
128     }
129    
130     if (!restricted(url))
131     {
132 douglas 25 if (http.handle(url, referer, true))
133 douglas 1 {
134 douglas 17 if (http.contentType().find("text/plain") == 0 ||
135     http.contentType().find("text/html") == 0)
136 douglas 1 {
137 douglas 17 http.clear();
138 douglas 25 if (!http.handle(url, referer)) exit(1);
139 douglas 28
140 douglas 11 cout << "Indexing " << url << "..." << flush;
141 douglas 1
142     if (processor.process(http, url))
143     {
144     Page page = processor.getPage();
145     fout << page << "\n";
146    
147     cout << "done.\n";
148     }
149     else
150     {
151     cout << "canceled.\n";
152     }
153    
154     pages.insert(url.getURL());
155     Set pageLinks = processor.getLinks();
156     processor.reset();
157    
158     for (SetIterator link = pageLinks.begin(); link !=
159     pageLinks.end(); link++)
160     {
161     if (pages.find(*link) == pages.end())
162     {
163     links.push(URL(*link));
164 douglas 25 referers.push(url.getURL());
165 douglas 1 }
166     }
167     }
168 douglas 17 else
169     {
170     // unhandled content
171     }
172 douglas 1 }
173 douglas 17 else if (http.redirect() != "")
174     {
175     if (pages.find(http.redirect()) == pages.end())
176     {
177     links.push(URL(http.redirect()));
178 douglas 25 referers.push(url.getURL());
179 douglas 17 }
180     }
181 douglas 1
182     http.clear();
183     }
184     }
185    
186     if (!links.empty())
187     {
188     URL next = links.front();
189     links.pop();
190    
191 douglas 25 string referer = referers.front();
192     referers.pop();
193    
194 douglas 1 if (debug) cerr << "next = " << next << "\n";
195    
196 douglas 25 index(next, fout, referer);
197 douglas 1 }
198     }
199    
200     bool Indexer::restricted(URL& url)
201     {
202     bool answer = false;
203    
204     for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
205     itor++)
206     {
207     URL checker = *itor;
208    
209     if (url.getAddress() == checker.getAddress() && url.getPort() ==
210     checker.getPort())
211     {
212     if (url.getPath().find(checker.getPath()) == 0)
213     {
214     answer = true;
215     break;
216     }
217     }
218     }
219    
220     return answer;
221     }
222    
223     void Indexer::robots(URL& url)
224     {
225     URL robots = url;
226 douglas 12 robots.setPath("/robots.txt");
227 douglas 1
228 douglas 14 if (http.handle(robots))
229     {
230     cout << "Checking " << robots << "..." << flush;
231 douglas 1
232     string line;
233    
234     bool record = false, hasVersion = false, hasName = false, hasAll =
235     false;
236     robot state = none;
237     Set restrictionsVersion, restrictionsName, restrictionsAll;
238    
239     while (http.good())
240     {
241     http.getline(line);
242    
243     unsigned comment = line.find('#');
244     if (comment != string::npos) line.erase(comment);
245    
246     if (line == "" && comment == string::npos) record = false;
247     if (line == "") continue;
248    
249     unsigned colon = line.find(':');
250    
251     string field = line.substr(0, colon);
252     string value = line.substr(colon + 1);
253    
254     normalize(value);
255    
256 douglas 12 if (field == "User-agent" && value == agent(true))
257 douglas 1 {
258     state = version;
259     record = true;
260     hasVersion = true;
261     }
262 douglas 12 else if (field == "User-agent" && value == agent(false))
263 douglas 1 {
264     state = name;
265     record = true;
266     hasName = true;
267     }
268     else if (field == "User-agent" && value == "*")
269     {
270     state = all;
271     record = true;
272     hasAll = true;
273     }
274     else if (field == "Disallow" && record && value == "")
275     {
276     // no restrictions
277     }
278     else if (field == "Disallow" && record)
279     {
280     URL restriction = robots;
281     restriction.setPath(value);
282    
283     switch (state)
284     {
285     case version:
286     restrictionsVersion.insert(restriction.getURL());
287     break;
288     case name:
289     restrictionsName.insert(restriction.getURL());
290     break;
291     case all:
292     restrictionsAll.insert(restriction.getURL());
293     break;
294     default:
295     break;
296     }
297     }
298     }
299    
300     if (hasVersion)
301     {
302     state = version;
303     }
304     else if (hasName)
305     {
306     state = name;
307     }
308     else if (hasAll)
309     {
310     state = all;
311     }
312     else
313     {
314     state = none;
315     }
316    
317     SetIterator itor;
318     switch (state)
319     {
320     case version:
321     for (itor = restrictionsVersion.begin(); itor !=
322     restrictionsVersion.end(); itor++)
323     {
324     restrictions.insert(*itor);
325     }
326     break;
327     case name:
328     for (itor = restrictionsName.begin(); itor !=
329     restrictionsName.end(); itor++)
330     {
331     restrictions.insert(*itor);
332     }
333     break;
334     case all:
335     for (itor = restrictionsAll.begin(); itor !=
336     restrictionsAll.end(); itor++)
337     {
338     restrictions.insert(*itor);
339     }
340     break;
341     default:
342     break;
343     }
344 douglas 14
345     cout << "done.\n";
346 douglas 1 }
347    
348     http.clear();
349    
350     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
351     url.getPort() : "");
352     }