ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 25
Committed: 2002-12-22T23:32:58-08:00 (22 years, 6 months ago) by douglas
File size: 7516 byte(s)
Log Message:
Added "referer" handling to Indexer and HttpHandler.handle().

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49     // Indexer.cpp
50    
#include "Indexer.h"

#include <cctype>
#include <sstream>
52    
53     Indexer::Indexer(string& indexFile, set<string>& domains,
54     set<string>& restrictions)
55     {
56     this->indexFile = indexFile;
57     this->domains = domains;
58     this->restrictions = restrictions;
59     }
60    
61     void Indexer::index(string& begin)
62     {
63     ofstream fout(indexFile.c_str());
64    
65     fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
66     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
67     << "<index>\n";
68    
69     URL first(begin);
70    
71     index(first, fout);
72    
73     fout << "</index>\n";
74    
75     fout.close();
76     }
77    
78 douglas 25 void Indexer::index(URL& url, ofstream& fout, const string referer)
79 douglas 1 {
80     if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
81     url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
82     pages.end())
83     {
84     if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
85     url.getPort() : "") == checked.end())
86     {
87     robots(url);
88     }
89    
90     if (!restricted(url))
91     {
92 douglas 25 if (http.handle(url, referer, true))
93 douglas 1 {
94 douglas 17 if (http.contentType().find("text/plain") == 0 ||
95     http.contentType().find("text/html") == 0)
96 douglas 1 {
97 douglas 17 http.clear();
98 douglas 25 if (!http.handle(url, referer)) exit(1);
99 douglas 17
100 douglas 11 cout << "Indexing " << url << "..." << flush;
101 douglas 1
102     if (processor.process(http, url))
103     {
104     Page page = processor.getPage();
105     fout << page << "\n";
106    
107     cout << "done.\n";
108     }
109     else
110     {
111     cout << "canceled.\n";
112     }
113    
114     pages.insert(url.getURL());
115     Set pageLinks = processor.getLinks();
116     processor.reset();
117    
118     for (SetIterator link = pageLinks.begin(); link !=
119     pageLinks.end(); link++)
120     {
121     if (pages.find(*link) == pages.end())
122     {
123     links.push(URL(*link));
124 douglas 25 referers.push(url.getURL());
125 douglas 1 }
126     }
127     }
128 douglas 17 else
129     {
130     // unhandled content
131     }
132 douglas 1 }
133 douglas 17 else if (http.redirect() != "")
134     {
135     if (pages.find(http.redirect()) == pages.end())
136     {
137     links.push(URL(http.redirect()));
138 douglas 25 referers.push(url.getURL());
139 douglas 17 }
140     }
141 douglas 1
142     http.clear();
143     }
144     }
145    
146     if (!links.empty())
147     {
148     URL next = links.front();
149     links.pop();
150    
151 douglas 25 string referer = referers.front();
152     referers.pop();
153    
154 douglas 1 if (debug) cerr << "next = " << next << "\n";
155    
156 douglas 25 index(next, fout, referer);
157 douglas 1 }
158     }
159    
160     bool Indexer::restricted(URL& url)
161     {
162     bool answer = false;
163    
164     for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
165     itor++)
166     {
167     URL checker = *itor;
168    
169     if (url.getAddress() == checker.getAddress() && url.getPort() ==
170     checker.getPort())
171     {
172     if (url.getPath().find(checker.getPath()) == 0)
173     {
174     answer = true;
175     break;
176     }
177     }
178     }
179    
180     return answer;
181     }
182    
183     void Indexer::robots(URL& url)
184     {
185     URL robots = url;
186 douglas 12 robots.setPath("/robots.txt");
187 douglas 1
188 douglas 14 if (http.handle(robots))
189     {
190     cout << "Checking " << robots << "..." << flush;
191 douglas 1
192     string line;
193    
194     bool record = false, hasVersion = false, hasName = false, hasAll =
195     false;
196     robot state = none;
197     Set restrictionsVersion, restrictionsName, restrictionsAll;
198    
199     while (http.good())
200     {
201     http.getline(line);
202    
203     unsigned comment = line.find('#');
204     if (comment != string::npos) line.erase(comment);
205    
206     if (line == "" && comment == string::npos) record = false;
207     if (line == "") continue;
208    
209     unsigned colon = line.find(':');
210    
211     string field = line.substr(0, colon);
212     string value = line.substr(colon + 1);
213    
214     normalize(value);
215    
216 douglas 12 if (field == "User-agent" && value == agent(true))
217 douglas 1 {
218     state = version;
219     record = true;
220     hasVersion = true;
221     }
222 douglas 12 else if (field == "User-agent" && value == agent(false))
223 douglas 1 {
224     state = name;
225     record = true;
226     hasName = true;
227     }
228     else if (field == "User-agent" && value == "*")
229     {
230     state = all;
231     record = true;
232     hasAll = true;
233     }
234     else if (field == "Disallow" && record && value == "")
235     {
236     // no restrictions
237     }
238     else if (field == "Disallow" && record)
239     {
240     URL restriction = robots;
241     restriction.setPath(value);
242    
243     switch (state)
244     {
245     case version:
246     restrictionsVersion.insert(restriction.getURL());
247     break;
248     case name:
249     restrictionsName.insert(restriction.getURL());
250     break;
251     case all:
252     restrictionsAll.insert(restriction.getURL());
253     break;
254     default:
255     break;
256     }
257     }
258     }
259    
260     if (hasVersion)
261     {
262     state = version;
263     }
264     else if (hasName)
265     {
266     state = name;
267     }
268     else if (hasAll)
269     {
270     state = all;
271     }
272     else
273     {
274     state = none;
275     }
276    
277     SetIterator itor;
278     switch (state)
279     {
280     case version:
281     for (itor = restrictionsVersion.begin(); itor !=
282     restrictionsVersion.end(); itor++)
283     {
284     restrictions.insert(*itor);
285     }
286     break;
287     case name:
288     for (itor = restrictionsName.begin(); itor !=
289     restrictionsName.end(); itor++)
290     {
291     restrictions.insert(*itor);
292     }
293     break;
294     case all:
295     for (itor = restrictionsAll.begin(); itor !=
296     restrictionsAll.end(); itor++)
297     {
298     restrictions.insert(*itor);
299     }
300     break;
301     default:
302     break;
303     }
304 douglas 14
305     cout << "done.\n";
306 douglas 1 }
307    
308     http.clear();
309    
310     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
311     url.getPort() : "");
312     }