1 |
|
/* ============================================================================ |
2 |
|
* Douglas Thrift's Search Engine License |
3 |
|
* |
4 |
< |
* Copyright (C) 2002, Douglas Thrift. All Rights Reserved. |
4 |
> |
* Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. |
5 |
|
* Redistribution and use in source and binary forms, with or without |
6 |
|
* modification, are permitted provided that the following conditions are met: |
7 |
|
* |
46 |
|
// |
47 |
|
// Douglas Thrift |
48 |
|
// |
49 |
< |
// Indexer.cpp |
49 |
> |
// $Id: Indexer.cpp,v 1.12 2003/07/15 01:30:46 douglas Exp $ |
50 |
|
|
51 |
|
#include "Indexer.h" |
52 |
|
|
53 |
< |
Indexer::Indexer(string& indexFile, set<string>& domains, |
54 |
< |
set<string>& restrictions) |
53 |
> |
#ifndef _WIN32 |
54 |
> |
#include <unistd.h> |
55 |
> |
#else // _WIN32 |
56 |
> |
// Win32 stand-in for POSIX unlink(): removes the named file.
// DeleteFile() reports success with a nonzero value, whereas unlink() callers
// expect 0 on success and -1 on failure, so translate the result rather than
// returning it directly.
inline int unlink(const char* filename)
{
	return DeleteFile(filename) ? 0 : -1;
}
57 |
> |
#endif // _WIN32 |
58 |
> |
|
59 |
> |
Indexer::Indexer(string& indexFile, set<string>& domains, set<string>& |
60 |
> |
restrictions) |
61 |
|
{ |
62 |
|
this->indexFile = indexFile; |
63 |
|
this->domains = domains; |
66 |
|
|
67 |
|
void Indexer::index(string& begin) |
68 |
|
{ |
69 |
< |
ofstream fout(indexFile.c_str()); |
69 |
> |
unsigned separator = indexFile.rfind(slash); |
70 |
> |
string dtd = separator != string::npos ? indexFile.substr(0, separator) + |
71 |
> |
slash + "index.dtd" : "index.dtd"; |
72 |
> |
|
73 |
> |
ifstream fin(dtd.c_str()); |
74 |
> |
|
75 |
> |
if (!fin.is_open()) |
76 |
> |
{ |
77 |
> |
ofstream fout(dtd.c_str()); |
78 |
> |
|
79 |
> |
fout << "<!ELEMENT index (page*)>\n" |
80 |
> |
<< "<!ELEMENT page (address, port?, path, title?, description?, ke" |
81 |
> |
<< "ywords?, text,\n" |
82 |
> |
<< " heading*)\n" |
83 |
> |
<< ">\n" |
84 |
> |
<< "<!ELEMENT address (#PCDATA)>\n" |
85 |
> |
<< "<!ELEMENT port (#PCDATA)>\n" |
86 |
> |
<< "<!ELEMENT path (#PCDATA)>\n" |
87 |
> |
<< "<!ELEMENT size (#PCDATA)>\n" |
88 |
> |
<< "<!ELEMENT title (#PCDATA)>\n" |
89 |
> |
<< "<!ELEMENT description (#PCDATA)>\n" |
90 |
> |
<< "<!ELEMENT text (#PCDATA)>\n" |
91 |
> |
<< "<!ELEMENT heading (#PCDATA)>\n"; |
92 |
> |
|
93 |
> |
fout.close(); |
94 |
> |
} |
95 |
> |
|
96 |
> |
fin.close(); |
97 |
> |
|
98 |
> |
string lock = indexFile + ".lock"; |
99 |
> |
|
100 |
> |
ofstream fout(lock.c_str()); |
101 |
> |
fout.close(); |
102 |
> |
fout.open(indexFile.c_str()); |
103 |
|
|
104 |
|
fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>" |
105 |
|
<< "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n" |
112 |
|
fout << "</index>\n"; |
113 |
|
|
114 |
|
fout.close(); |
115 |
+ |
|
116 |
+ |
unlink(lock.c_str()); |
117 |
|
} |
118 |
|
|
119 |
< |
void Indexer::index(URL& url, ofstream& fout) |
119 |
> |
void Indexer::index(URL& url, ofstream& fout, const string referer) |
120 |
|
{ |
121 |
|
if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" + |
122 |
|
url.getPort() : "") != domains.end() && pages.find(url.getURL()) == |
130 |
|
|
131 |
|
if (!restricted(url)) |
132 |
|
{ |
133 |
< |
if (http.handle(url)) |
133 |
> |
if (http.handle(url, referer, true)) |
134 |
|
{ |
135 |
< |
if (http.good()) |
135 |
> |
if (http.contentType().find("text/plain") == 0 || |
136 |
> |
http.contentType().find("text/html") == 0) |
137 |
|
{ |
138 |
+ |
http.clear(); |
139 |
+ |
if (!http.handle(url, referer)) exit(1); |
140 |
+ |
|
141 |
|
cout << "Indexing " << url << "..." << flush; |
142 |
|
|
143 |
|
if (processor.process(http, url)) |
162 |
|
if (pages.find(*link) == pages.end()) |
163 |
|
{ |
164 |
|
links.push(URL(*link)); |
165 |
+ |
referers.push(url.getURL()); |
166 |
|
} |
167 |
|
} |
168 |
|
} |
169 |
+ |
else |
170 |
+ |
{ |
171 |
+ |
// unhandled content |
172 |
+ |
} |
173 |
+ |
} |
174 |
+ |
else if (http.redirect() != "") |
175 |
+ |
{ |
176 |
+ |
if (pages.find(http.redirect()) == pages.end()) |
177 |
+ |
{ |
178 |
+ |
links.push(URL(http.redirect())); |
179 |
+ |
referers.push(url.getURL()); |
180 |
+ |
} |
181 |
|
} |
182 |
|
|
183 |
|
http.clear(); |
189 |
|
URL next = links.front(); |
190 |
|
links.pop(); |
191 |
|
|
192 |
+ |
string referer = referers.front(); |
193 |
+ |
referers.pop(); |
194 |
+ |
|
195 |
|
if (debug) cerr << "next = " << next << "\n"; |
196 |
|
|
197 |
< |
index(next, fout); |
197 |
> |
index(next, fout, referer); |
198 |
|
} |
199 |
|
} |
200 |
|
|
231 |
|
cout << "Checking " << robots << "..." << flush; |
232 |
|
|
233 |
|
string line; |
173 |
– |
do http.getline(line); while (http.good() && line != ""); |
234 |
|
|
235 |
|
bool record = false, hasVersion = false, hasName = false, hasAll = |
236 |
|
false; |