ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 20
Committed: 2002-12-10T14:04:39-08:00 (22 years, 6 months ago) by douglas
File size: 7341 byte(s)
Log Message:
Implemented chunked encoding handling.
Rewrote HttpHandler.good() and public HttpHandler.getline() functions.
There is a bug somewhere where something isn't always checking
HttpHandler.good()!

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49     // Indexer.cpp
50    
#include "Indexer.h"

#include <cctype>
#include <sstream>
52    
53     Indexer::Indexer(string& indexFile, set<string>& domains,
54     set<string>& restrictions)
55     {
56     this->indexFile = indexFile;
57     this->domains = domains;
58     this->restrictions = restrictions;
59     }
60    
61     void Indexer::index(string& begin)
62     {
63     ofstream fout(indexFile.c_str());
64    
65     fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
66     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
67     << "<index>\n";
68    
69     URL first(begin);
70    
71     index(first, fout);
72    
73     fout << "</index>\n";
74    
75     fout.close();
76     }
77    
78     void Indexer::index(URL& url, ofstream& fout)
79     {
80     if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
81     url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
82     pages.end())
83     {
84     if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
85     url.getPort() : "") == checked.end())
86     {
87     robots(url);
88     }
89    
90     if (!restricted(url))
91     {
92 douglas 17 if (http.handle(url, true))
93 douglas 1 {
94 douglas 17 if (http.contentType().find("text/plain") == 0 ||
95     http.contentType().find("text/html") == 0)
96 douglas 1 {
97 douglas 17 http.clear();
98     if (!http.handle(url)) exit(1);
99    
100 douglas 11 cout << "Indexing " << url << "..." << flush;
101 douglas 1
102     if (processor.process(http, url))
103     {
104     Page page = processor.getPage();
105     fout << page << "\n";
106    
107     cout << "done.\n";
108     }
109     else
110     {
111     cout << "canceled.\n";
112     }
113    
114     pages.insert(url.getURL());
115     Set pageLinks = processor.getLinks();
116     processor.reset();
117    
118     for (SetIterator link = pageLinks.begin(); link !=
119     pageLinks.end(); link++)
120     {
121     if (pages.find(*link) == pages.end())
122     {
123     links.push(URL(*link));
124     }
125     }
126     }
127 douglas 17 else
128     {
129     // unhandled content
130     }
131 douglas 1 }
132 douglas 17 else if (http.redirect() != "")
133     {
134     if (pages.find(http.redirect()) == pages.end())
135     {
136     links.push(URL(http.redirect()));
137     }
138     }
139 douglas 1
140     http.clear();
141     }
142     }
143    
144     if (!links.empty())
145     {
146     URL next = links.front();
147     links.pop();
148    
149     if (debug) cerr << "next = " << next << "\n";
150    
151     index(next, fout);
152     }
153     }
154    
155     bool Indexer::restricted(URL& url)
156     {
157     bool answer = false;
158    
159     for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
160     itor++)
161     {
162     URL checker = *itor;
163    
164     if (url.getAddress() == checker.getAddress() && url.getPort() ==
165     checker.getPort())
166     {
167     if (url.getPath().find(checker.getPath()) == 0)
168     {
169     answer = true;
170     break;
171     }
172     }
173     }
174    
175     return answer;
176     }
177    
178     void Indexer::robots(URL& url)
179     {
180     URL robots = url;
181 douglas 12 robots.setPath("/robots.txt");
182 douglas 1
183 douglas 14 if (http.handle(robots))
184     {
185     cout << "Checking " << robots << "..." << flush;
186 douglas 1
187     string line;
188    
189     bool record = false, hasVersion = false, hasName = false, hasAll =
190     false;
191     robot state = none;
192     Set restrictionsVersion, restrictionsName, restrictionsAll;
193    
194     while (http.good())
195     {
196     http.getline(line);
197    
198     unsigned comment = line.find('#');
199     if (comment != string::npos) line.erase(comment);
200    
201     if (line == "" && comment == string::npos) record = false;
202     if (line == "") continue;
203    
204     unsigned colon = line.find(':');
205    
206     string field = line.substr(0, colon);
207     string value = line.substr(colon + 1);
208    
209     normalize(value);
210    
211 douglas 12 if (field == "User-agent" && value == agent(true))
212 douglas 1 {
213     state = version;
214     record = true;
215     hasVersion = true;
216     }
217 douglas 12 else if (field == "User-agent" && value == agent(false))
218 douglas 1 {
219     state = name;
220     record = true;
221     hasName = true;
222     }
223     else if (field == "User-agent" && value == "*")
224     {
225     state = all;
226     record = true;
227     hasAll = true;
228     }
229     else if (field == "Disallow" && record && value == "")
230     {
231     // no restrictions
232     }
233     else if (field == "Disallow" && record)
234     {
235     URL restriction = robots;
236     restriction.setPath(value);
237    
238     switch (state)
239     {
240     case version:
241     restrictionsVersion.insert(restriction.getURL());
242     break;
243     case name:
244     restrictionsName.insert(restriction.getURL());
245     break;
246     case all:
247     restrictionsAll.insert(restriction.getURL());
248     break;
249     default:
250     break;
251     }
252     }
253     }
254    
255     if (hasVersion)
256     {
257     state = version;
258     }
259     else if (hasName)
260     {
261     state = name;
262     }
263     else if (hasAll)
264     {
265     state = all;
266     }
267     else
268     {
269     state = none;
270     }
271    
272     SetIterator itor;
273     switch (state)
274     {
275     case version:
276     for (itor = restrictionsVersion.begin(); itor !=
277     restrictionsVersion.end(); itor++)
278     {
279     restrictions.insert(*itor);
280     }
281     break;
282     case name:
283     for (itor = restrictionsName.begin(); itor !=
284     restrictionsName.end(); itor++)
285     {
286     restrictions.insert(*itor);
287     }
288     break;
289     case all:
290     for (itor = restrictionsAll.begin(); itor !=
291     restrictionsAll.end(); itor++)
292     {
293     restrictions.insert(*itor);
294     }
295     break;
296     default:
297     break;
298     }
299 douglas 14
300     cout << "done.\n";
301 douglas 1 }
302    
303     http.clear();
304    
305     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
306     url.getPort() : "");
307     }