1 |
/* ============================================================================ |
2 |
* Douglas Thrift's Search Engine License |
3 |
* |
4 |
* Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved. |
5 |
* Redistribution and use in source and binary forms, with or without |
6 |
* modification, are permitted provided that the following conditions are met: |
7 |
* |
8 |
* 1. Redistributions of source code must retain the above copyright notice, |
9 |
* this list of conditions and the following disclaimer. |
10 |
* |
11 |
* 2. Redistributions in binary form must reproduce the above copyright notice, |
12 |
* this list of conditions and the following disclaimer in the documentation |
13 |
* and/or other materials provided with the distribution. |
14 |
* |
15 |
* 3. The end-user documentation included with the redistribution, if any, must |
16 |
* include the following acknowledgment: |
17 |
* |
18 |
* "This product includes software developed by Douglas Thrift |
19 |
* (http://computers.douglasthrift.net/searchengine/)." |
20 |
* |
21 |
* Alternately, this acknowledgment may appear in the software itself, if |
22 |
* and wherever such third-party acknowledgments normally appear. |
23 |
* |
24 |
* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not |
25 |
* be used to endorse or promote products derived from this software without |
26 |
* specific prior written permission. For written permission, please visit |
27 |
* http://www.douglasthrift.net/contact.cgi for contact information. |
28 |
* |
29 |
* 5. Products derived from this software may not be called "Douglas Thrift's |
30 |
* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their |
31 |
* name, without prior written permission. |
32 |
* |
33 |
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
34 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
35 |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
36 |
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
37 |
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
38 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, |
39 |
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
40 |
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
41 |
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
42 |
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 |
* ============================================================================ |
44 |
*/ |
45 |
// Douglas Thrift's Search Engine Indexer |
46 |
// |
47 |
// Douglas Thrift |
48 |
// |
49 |
// $Id: Indexer.h,v 1.6 2004/01/01 23:00:34 douglas Exp $ |
50 |
|
51 |
#ifndef _Indexer_h_ |
52 |
#define _Indexer_h_ |
53 |
|
54 |
#include "Search.h" |
55 |
#include "URL.h" |
56 |
#include "Page.h" |
57 |
#include "HttpHandler.h" |
58 |
#include "Processor.h" |
59 |
|
60 |
// Shorthand for the string sets used by Indexer, and for their iterator type.
typedef set<string> Set;
typedef set<string>::iterator SetIterator;
62 |
|
63 |
class Indexer |
64 |
{ |
65 |
private: |
66 |
enum robot { none, version, name, all }; |
67 |
HttpHandler http; |
68 |
Processor processor; |
69 |
Set pages; |
70 |
queue<URL> links; |
71 |
queue<string> referers; |
72 |
string indexFile; |
73 |
Set domains; |
74 |
Set restrictions; |
75 |
Set checked; |
76 |
void index(URL& url, ofstream& fout, const string referer = ""); |
77 |
bool restricted(URL& url); |
78 |
void robots(URL& url); |
79 |
public: |
80 |
Indexer(string& indexFile, set<string>& domains, set<string>& |
81 |
restrictions); |
82 |
~Indexer() {} |
83 |
void index(string& begin); |
84 |
}; |
85 |
|
86 |
#endif // _Indexer_h_ |