ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 334
Committed: 2004-04-05T16:37:41-07:00 (21 years, 2 months ago) by Douglas Thrift
File size: 8485 byte(s)
Log Message:
Ah, I just love Subversion!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4 douglas 312 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 douglas 1 * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49 Douglas Thrift 331 // $Id$
50 douglas 1
#include "Indexer.hpp"

#include <sstream>
52 douglas 1
#ifndef _WIN32
#include <unistd.h>
#else // _WIN32
// Windows stand-in for POSIX unlink(): deletes the named file.
//
// DeleteFile reports success with a nonzero value, whereas unlink() reports
// success with 0 and failure with -1, so the result is translated rather than
// passed through unchanged.
inline int unlink(const char* filename)
{
	return DeleteFile(filename) ? 0 : -1;
}
#endif // _WIN32
58    
59     Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
60     restrictions)
61 douglas 1 {
62     this->indexFile = indexFile;
63     this->domains = domains;
64     this->restrictions = restrictions;
65     }
66    
67     void Indexer::index(string& begin)
68     {
69 douglas 35 unsigned separator = indexFile.rfind(slash);
70     string dtd = separator != string::npos ? indexFile.substr(0, separator) +
71     slash + "index.dtd" : "index.dtd";
72    
73     ifstream fin(dtd.c_str());
74    
75     if (!fin.is_open())
76     {
77     ofstream fout(dtd.c_str());
78    
79     fout << "<!ELEMENT index (page*)>\n"
80 douglas 199 << "<!ELEMENT page (address, port?, tls?, path, title?, descriptio"
81     << "n?, keywords?, text, heading*)>\n"
82 douglas 35 << "<!ELEMENT address (#PCDATA)>\n"
83     << "<!ELEMENT port (#PCDATA)>\n"
84 douglas 199 << "<!ELEMENT tls (#PCDATA)>\n"
85 douglas 35 << "<!ELEMENT path (#PCDATA)>\n"
86     << "<!ELEMENT size (#PCDATA)>\n"
87     << "<!ELEMENT title (#PCDATA)>\n"
88     << "<!ELEMENT description (#PCDATA)>\n"
89     << "<!ELEMENT text (#PCDATA)>\n"
90     << "<!ELEMENT heading (#PCDATA)>\n";
91    
92     fout.close();
93     }
94    
95     fin.close();
96    
97 douglas 37 string lock = indexFile + ".lock";
98 douglas 1
99 douglas 37 ofstream fout(lock.c_str());
100     fout.close();
101     fout.open(indexFile.c_str());
102    
103 douglas 1 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
104     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
105     << "<index>\n";
106    
107     URL first(begin);
108    
109     index(first, fout);
110    
111     fout << "</index>\n";
112    
113     fout.close();
114 douglas 37
115     unlink(lock.c_str());
116 douglas 1 }
117    
118 douglas 25 void Indexer::index(URL& url, ofstream& fout, const string referer)
119 douglas 1 {
120 douglas 225 if (domains.find(url.getAddress()) != domains.end() &&
121     pages.find(url.getURL()) == pages.end())
122 douglas 1 {
123 douglas 225 if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" +
124     url.getPort() : string(""))) == checked.end())
125 douglas 1 {
126     robots(url);
127     }
128    
129     if (!restricted(url))
130     {
131 douglas 25 if (http.handle(url, referer, true))
132 douglas 1 {
133 douglas 17 if (http.contentType().find("text/plain") == 0 ||
134     http.contentType().find("text/html") == 0)
135 douglas 1 {
136 douglas 17 http.clear();
137 douglas 25 if (!http.handle(url, referer)) exit(1);
138 douglas 28
139 douglas 203 cout << "Indexing " << url << " ... " << flush;
140 douglas 1
141     if (processor.process(http, url))
142     {
143     Page page = processor.getPage();
144     fout << page << "\n";
145    
146     cout << "done.\n";
147     }
148     else
149     {
150     cout << "canceled.\n";
151     }
152    
153     pages.insert(url.getURL());
154     Set pageLinks = processor.getLinks();
155     processor.reset();
156    
157     for (SetIterator link = pageLinks.begin(); link !=
158     pageLinks.end(); link++)
159     {
160     if (pages.find(*link) == pages.end())
161     {
162     links.push(URL(*link));
163 douglas 25 referers.push(url.getURL());
164 douglas 1 }
165     }
166     }
167 douglas 17 else
168     {
169     // unhandled content
170     }
171 douglas 1 }
172 douglas 17 else if (http.redirect() != "")
173     {
174     if (pages.find(http.redirect()) == pages.end())
175     {
176     links.push(URL(http.redirect()));
177 douglas 25 referers.push(url.getURL());
178 douglas 17 }
179     }
180 douglas 1
181     http.clear();
182     }
183     }
184    
185     if (!links.empty())
186     {
187     URL next = links.front();
188     links.pop();
189    
190 douglas 25 string referer = referers.front();
191     referers.pop();
192    
193 douglas 1 if (debug) cerr << "next = " << next << "\n";
194    
195 douglas 25 index(next, fout, referer);
196 douglas 1 }
197     }
198    
199     bool Indexer::restricted(URL& url)
200     {
201     bool answer = false;
202    
203     for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
204     itor++)
205     {
206     URL checker = *itor;
207    
208     if (url.getAddress() == checker.getAddress() && url.getPort() ==
209     checker.getPort())
210     {
211     if (url.getPath().find(checker.getPath()) == 0)
212     {
213     answer = true;
214     break;
215     }
216     }
217     }
218    
219     return answer;
220     }
221    
222     void Indexer::robots(URL& url)
223     {
224     URL robots = url;
225 douglas 12 robots.setPath("/robots.txt");
226 douglas 1
227 douglas 14 if (http.handle(robots))
228     {
229 douglas 203 cout << "Checking " << robots << " ... " << flush;
230 douglas 1
231     string line;
232    
233     bool record = false, hasVersion = false, hasName = false, hasAll =
234     false;
235     robot state = none;
236     Set restrictionsVersion, restrictionsName, restrictionsAll;
237    
238     while (http.good())
239     {
240     http.getline(line);
241    
242     unsigned comment = line.find('#');
243     if (comment != string::npos) line.erase(comment);
244    
245     if (line == "" && comment == string::npos) record = false;
246     if (line == "") continue;
247    
248     unsigned colon = line.find(':');
249    
250     string field = line.substr(0, colon);
251     string value = line.substr(colon + 1);
252    
253     normalize(value);
254    
255 douglas 12 if (field == "User-agent" && value == agent(true))
256 douglas 1 {
257     state = version;
258     record = true;
259     hasVersion = true;
260     }
261 douglas 12 else if (field == "User-agent" && value == agent(false))
262 douglas 1 {
263     state = name;
264     record = true;
265     hasName = true;
266     }
267     else if (field == "User-agent" && value == "*")
268     {
269     state = all;
270     record = true;
271     hasAll = true;
272     }
273     else if (field == "Disallow" && record && value == "")
274     {
275     // no restrictions
276     }
277     else if (field == "Disallow" && record)
278     {
279     URL restriction = robots;
280     restriction.setPath(value);
281    
282     switch (state)
283     {
284     case version:
285     restrictionsVersion.insert(restriction.getURL());
286     break;
287     case name:
288     restrictionsName.insert(restriction.getURL());
289     break;
290     case all:
291     restrictionsAll.insert(restriction.getURL());
292     break;
293     default:
294     break;
295     }
296     }
297     }
298    
299     if (hasVersion)
300     {
301     state = version;
302     }
303     else if (hasName)
304     {
305     state = name;
306     }
307     else if (hasAll)
308     {
309     state = all;
310     }
311     else
312     {
313     state = none;
314     }
315    
316     SetIterator itor;
317     switch (state)
318     {
319     case version:
320     for (itor = restrictionsVersion.begin(); itor !=
321     restrictionsVersion.end(); itor++)
322     {
323     restrictions.insert(*itor);
324     }
325     break;
326     case name:
327     for (itor = restrictionsName.begin(); itor !=
328     restrictionsName.end(); itor++)
329     {
330     restrictions.insert(*itor);
331     }
332     break;
333     case all:
334     for (itor = restrictionsAll.begin(); itor !=
335     restrictionsAll.end(); itor++)
336     {
337     restrictions.insert(*itor);
338     }
339     break;
340     default:
341     break;
342     }
343 douglas 14
344     cout << "done.\n";
345 douglas 1 }
346    
347     http.clear();
348    
349     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
350     url.getPort() : "");
351     }

Properties

Name Value
svn:eol-style native
svn:keywords Id