ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Indexer.cpp
Revision: 14
Committed: 2002-12-07T00:19:03-08:00 (22 years, 6 months ago) by douglas
File size: 7057 byte(s)
Log Message:
Renamed HttpHandler.connect() to HttpHandler.handle().
Implemented client header sending.
Changed indexing output positions in code.

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Indexer
46     //
47     // Douglas Thrift
48     //
49     // Indexer.cpp
50    
#include "Indexer.h"

#include <cctype>
#include <sstream>
52    
53     Indexer::Indexer(string& indexFile, set<string>& domains,
54     set<string>& restrictions)
55     {
56     this->indexFile = indexFile;
57     this->domains = domains;
58     this->restrictions = restrictions;
59     }
60    
61     void Indexer::index(string& begin)
62     {
63     ofstream fout(indexFile.c_str());
64    
65     fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
66     << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
67     << "<index>\n";
68    
69     URL first(begin);
70    
71     index(first, fout);
72    
73     fout << "</index>\n";
74    
75     fout.close();
76     }
77    
78     void Indexer::index(URL& url, ofstream& fout)
79     {
80     if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
81     url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
82     pages.end())
83     {
84     if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
85     url.getPort() : "") == checked.end())
86     {
87     robots(url);
88     }
89    
90     if (!restricted(url))
91     {
92 douglas 14 if (http.handle(url))
93 douglas 1 {
94     if (http.good())
95     {
96 douglas 11 cout << "Indexing " << url << "..." << flush;
97 douglas 1
98     if (processor.process(http, url))
99     {
100     Page page = processor.getPage();
101     fout << page << "\n";
102    
103     cout << "done.\n";
104     }
105     else
106     {
107     cout << "canceled.\n";
108     }
109    
110     pages.insert(url.getURL());
111     Set pageLinks = processor.getLinks();
112     processor.reset();
113    
114     for (SetIterator link = pageLinks.begin(); link !=
115     pageLinks.end(); link++)
116     {
117     if (pages.find(*link) == pages.end())
118     {
119     links.push(URL(*link));
120     }
121     }
122     }
123     }
124    
125     http.clear();
126     }
127     }
128    
129     if (!links.empty())
130     {
131     URL next = links.front();
132     links.pop();
133    
134     if (debug) cerr << "next = " << next << "\n";
135    
136     index(next, fout);
137     }
138     }
139    
140     bool Indexer::restricted(URL& url)
141     {
142     bool answer = false;
143    
144     for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
145     itor++)
146     {
147     URL checker = *itor;
148    
149     if (url.getAddress() == checker.getAddress() && url.getPort() ==
150     checker.getPort())
151     {
152     if (url.getPath().find(checker.getPath()) == 0)
153     {
154     answer = true;
155     break;
156     }
157     }
158     }
159    
160     return answer;
161     }
162    
163     void Indexer::robots(URL& url)
164     {
165     URL robots = url;
166 douglas 12 robots.setPath("/robots.txt");
167 douglas 1
168 douglas 14 if (http.handle(robots))
169     {
170     cout << "Checking " << robots << "..." << flush;
171 douglas 1
172     string line;
173     do http.getline(line); while (http.good() && line != "");
174    
175     bool record = false, hasVersion = false, hasName = false, hasAll =
176     false;
177     robot state = none;
178     Set restrictionsVersion, restrictionsName, restrictionsAll;
179    
180     while (http.good())
181     {
182     http.getline(line);
183    
184     unsigned comment = line.find('#');
185     if (comment != string::npos) line.erase(comment);
186    
187     if (line == "" && comment == string::npos) record = false;
188     if (line == "") continue;
189    
190     unsigned colon = line.find(':');
191    
192     string field = line.substr(0, colon);
193     string value = line.substr(colon + 1);
194    
195     normalize(value);
196    
197 douglas 12 if (field == "User-agent" && value == agent(true))
198 douglas 1 {
199     state = version;
200     record = true;
201     hasVersion = true;
202     }
203 douglas 12 else if (field == "User-agent" && value == agent(false))
204 douglas 1 {
205     state = name;
206     record = true;
207     hasName = true;
208     }
209     else if (field == "User-agent" && value == "*")
210     {
211     state = all;
212     record = true;
213     hasAll = true;
214     }
215     else if (field == "Disallow" && record && value == "")
216     {
217     // no restrictions
218     }
219     else if (field == "Disallow" && record)
220     {
221     URL restriction = robots;
222     restriction.setPath(value);
223    
224     switch (state)
225     {
226     case version:
227     restrictionsVersion.insert(restriction.getURL());
228     break;
229     case name:
230     restrictionsName.insert(restriction.getURL());
231     break;
232     case all:
233     restrictionsAll.insert(restriction.getURL());
234     break;
235     default:
236     break;
237     }
238     }
239     }
240    
241     if (hasVersion)
242     {
243     state = version;
244     }
245     else if (hasName)
246     {
247     state = name;
248     }
249     else if (hasAll)
250     {
251     state = all;
252     }
253     else
254     {
255     state = none;
256     }
257    
258     SetIterator itor;
259     switch (state)
260     {
261     case version:
262     for (itor = restrictionsVersion.begin(); itor !=
263     restrictionsVersion.end(); itor++)
264     {
265     restrictions.insert(*itor);
266     }
267     break;
268     case name:
269     for (itor = restrictionsName.begin(); itor !=
270     restrictionsName.end(); itor++)
271     {
272     restrictions.insert(*itor);
273     }
274     break;
275     case all:
276     for (itor = restrictionsAll.begin(); itor !=
277     restrictionsAll.end(); itor++)
278     {
279     restrictions.insert(*itor);
280     }
281     break;
282     default:
283     break;
284     }
285 douglas 14
286     cout << "done.\n";
287 douglas 1 }
288    
289     http.clear();
290    
291     checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
292     url.getPort() : "");
293     }