trunk/Search/Indexer.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Indexer
//
// Douglas Thrift
//
// Indexer.cpp

#include "Indexer.h"

#include <cctype>
#include <sstream>

// Creates an indexer, taking copies of the output file name, the set of
// domains to index and the initial set of restrictions.
Indexer::Indexer(string& indexFile, set<string>& domains,
        set<string>& restrictions) :
        indexFile(indexFile),
        domains(domains),
        restrictions(restrictions)
{
}

// Builds the index, starting the crawl at the URL given by begin.
//
// Makes sure an "index.dtd" sits in the same directory as the index file
// (it is written from the embedded text below when it cannot be opened for
// reading), then truncates/creates the index file, writes the XML prologue,
// indexes everything reachable from begin and closes the <index> element.
// If the index file cannot be opened, an error is reported on cerr and
// nothing is crawled.
void Indexer::index(string& begin)
{
        // Must be string::size_type: storing the result in an unsigned
        // truncates string::npos where size_t is wider, so the "no separator"
        // case was never detected and the DTD path was built from the whole
        // file name.
        const string::size_type separator = indexFile.rfind(slash);
        string dtd = separator != string::npos ? indexFile.substr(0, separator) +
                slash + "index.dtd" : "index.dtd";

        ifstream fin(dtd.c_str());
        const bool dtdExists = fin.is_open();
        fin.close();

        if (!dtdExists)
        {
                // NOTE(review): the page content model lists "keywords" but no
                // <!ELEMENT keywords ...> is declared, while "size" is declared
                // and never referenced -- confirm against what Page writes.
                ofstream fout(dtd.c_str());

                fout << "<!ELEMENT index (page*)>\n"
                        << "<!ELEMENT page (address, port?, path, title?, description?, ke"
                        << "ywords?, text,\n"
                        << "    heading*)\n"
                        << ">\n"
                        << "<!ELEMENT address (#PCDATA)>\n"
                        << "<!ELEMENT port (#PCDATA)>\n"
                        << "<!ELEMENT path (#PCDATA)>\n"
                        << "<!ELEMENT size (#PCDATA)>\n"
                        << "<!ELEMENT title (#PCDATA)>\n"
                        << "<!ELEMENT description (#PCDATA)>\n"
                        << "<!ELEMENT text (#PCDATA)>\n"
                        << "<!ELEMENT heading (#PCDATA)>\n";

                fout.close();
        }

        ofstream fout(indexFile.c_str());

        // Crawling without anywhere to write the result is pointless; stop
        // here instead of fetching every page and silently discarding it.
        if (!fout.is_open())
        {
                cerr << "Unable to open " << indexFile << " for writing.\n";

                return;
        }

        fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
                << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
                << "<index>\n";

        URL first(begin);

        index(first, fout);

        fout << "</index>\n";

        fout.close();
}

// Indexes url and then every URL that gets queued while doing so.
//
// For each URL: it is processed only if its host ("address" or
// "address:port" for ports other than 80) is in domains and the URL is not
// already in pages. robots.txt is consulted once per host, restricted URLs
// are skipped, a first request (third argument true) is used to inspect the
// content type, and text/plain or text/html documents are fetched, processed
// and written to fout. Links found on a page, and redirect targets, are
// appended to the links/referers queues.
//
// The queue is drained with a loop rather than one recursive call per link,
// so stack usage no longer grows with the number of URLs visited.
//
// url     - first URL to index
// fout    - open stream the <page> entries are written to
// referer - referer sent with the request for url
void Indexer::index(URL& url, ofstream& fout, const string referer)
{
        URL current = url;
        string from = referer;

        for (;;)
        {
                // Host key used by domains and checked. getPort() is numeric (it
                // is compared with 80), so the former `":" + url.getPort()` was
                // pointer arithmetic on the literal, not concatenation; format
                // the port explicitly. Working on a copy also guarantees the
                // URL's own address is never modified.
                string host = current.getAddress();

                if (current.getPort() != 80)
                {
                        std::ostringstream port;
                        port << current.getPort();
                        host += ":" + port.str();
                }

                if (domains.find(host) != domains.end() &&
                        pages.find(current.getURL()) == pages.end())
                {
                        if (checked.find(host) == checked.end())
                        {
                                robots(current);
                        }

                        if (!restricted(current))
                        {
                                if (http.handle(current, from, true))
                                {
                                        if (http.contentType().find("text/plain") == 0 ||
                                                http.contentType().find("text/html") == 0)
                                        {
                                                // indexable content: fetch it for real
                                                http.clear();
                                                if (!http.handle(current, from)) exit(1);

                                                cout << "Indexing " << current << "..." << flush;

                                                if (processor.process(http, current))
                                                {
                                                        Page page = processor.getPage();
                                                        fout << page << "\n";

                                                        cout << "done.\n";
                                                }
                                                else
                                                {
                                                        cout << "canceled.\n";
                                                }

                                                pages.insert(current.getURL());
                                                Set pageLinks = processor.getLinks();
                                                processor.reset();

                                                // queue every link not indexed yet, remembering
                                                // the page it was found on as its referer
                                                for (SetIterator link = pageLinks.begin(); link !=
                                                        pageLinks.end(); ++link)
                                                {
                                                        if (pages.find(*link) == pages.end())
                                                        {
                                                                links.push(URL(*link));
                                                                referers.push(current.getURL());
                                                        }
                                                }
                                        }
                                        else
                                        {
                                                // unhandled content
                                        }
                                }
                                else if (http.redirect() != "")
                                {
                                        // follow the redirect later, like any other link
                                        if (pages.find(http.redirect()) == pages.end())
                                        {
                                                links.push(URL(http.redirect()));
                                                referers.push(current.getURL());
                                        }
                                }

                                http.clear();
                        }
                }

                if (links.empty()) break;

                // links and referers are pushed in pairs, so they stay in step
                current = links.front();
                links.pop();

                from = referers.front();
                referers.pop();

                if (debug) cerr << "next = " << current << "\n";
        }
}

// Reports whether url falls under one of the entries in restrictions: an
// entry matches when it has the same address and port as url and its path is
// a prefix of url's path.
bool Indexer::restricted(URL& url)
{
        for (SetIterator entry = restrictions.begin();
                entry != restrictions.end(); ++entry)
        {
                URL forbidden = *entry;

                const bool sameHost = url.getAddress() == forbidden.getAddress() &&
                        url.getPort() == forbidden.getPort();

                // find(...) == 0 means the restricted path starts url's path
                if (sameHost && url.getPath().find(forbidden.getPath()) == 0)
                {
                        return true;
                }
        }

        return false;
}

// Fetches /robots.txt from url's host and adds the Disallow rules that apply
// to this program to restrictions, then records the host in checked.
//
// Rules are collected separately for three kinds of User-agent record: one
// whose value equals agent(true), one whose value equals agent(false), and
// "*". Only the first kind that occurred in the file is applied, in that
// order of preference. A blank line (one that did not merely hold a comment)
// ends the current record. Field names are matched case-insensitively; lines
// without a ':' are ignored.
void Indexer::robots(URL& url)
{
        URL robots = url;
        robots.setPath("/robots.txt");

        if (http.handle(robots))
        {
                cout << "Checking " << robots << "..." << flush;

                string line;

                bool record = false, hasVersion = false, hasName = false, hasAll =
                        false;
                robot state = none;
                Set restrictionsVersion, restrictionsName, restrictionsAll;

                while (http.good())
                {
                        http.getline(line);

                        // string::size_type, not unsigned: a truncated npos never
                        // equals string::npos where size_t is wider, which made
                        // erase() throw out_of_range on every line without '#'.
                        const string::size_type comment = line.find('#');
                        if (comment != string::npos) line.erase(comment);

                        if (line == "" && comment == string::npos) record = false;
                        if (line == "") continue;

                        // Not a "field: value" line; without this check a bare
                        // "Disallow" would be taken as both field and value.
                        const string::size_type colon = line.find(':');
                        if (colon == string::npos) continue;

                        string field = line.substr(0, colon);
                        string value = line.substr(colon + 1);

                        // robots.txt field names are case-insensitive
                        for (string::size_type at = 0; at < field.size(); ++at)
                        {
                                field[at] = static_cast<char>(std::tolower(
                                        static_cast<unsigned char>(field[at])));
                        }

                        normalize(value);

                        if (field == "user-agent" && value == agent(true))
                        {
                                state = version;
                                record = true;
                                hasVersion = true;
                        }
                        else if (field == "user-agent" && value == agent(false))
                        {
                                state = name;
                                record = true;
                                hasName = true;
                        }
                        else if (field == "user-agent" && value == "*")
                        {
                                state = all;
                                record = true;
                                hasAll = true;
                        }
                        else if (field == "disallow" && record && value == "")
                        {
                                // no restrictions
                        }
                        else if (field == "disallow" && record)
                        {
                                URL restriction = robots;
                                restriction.setPath(value);

                                switch (state)
                                {
                                case version:
                                        restrictionsVersion.insert(restriction.getURL());
                                        break;
                                case name:
                                        restrictionsName.insert(restriction.getURL());
                                        break;
                                case all:
                                        restrictionsAll.insert(restriction.getURL());
                                        break;
                                default:
                                        break;
                                }
                        }
                }

                // apply only the first kind of record that was present
                Set* chosen = 0;

                if (hasVersion)
                {
                        chosen = &restrictionsVersion;
                }
                else if (hasName)
                {
                        chosen = &restrictionsName;
                }
                else if (hasAll)
                {
                        chosen = &restrictionsAll;
                }

                if (chosen != 0)
                {
                        restrictions.insert(chosen->begin(), chosen->end());
                }

                cout << "done.\n";
        }

        http.clear();

        // Remember the host as "address" or "address:port". getPort() is
        // numeric (it is compared with 80), so the former `":" + url.getPort()`
        // was pointer arithmetic on the literal; format the port explicitly.
        string host = url.getAddress();

        if (url.getPort() != 80)
        {
                std::ostringstream port;
                port << url.getPort();
                host += ":" + port.str();
        }

        checked.insert(host);
}
Revision:	35
Committed:	2003-01-16T17:27:03-08:00 (22 years, 5 months ago) by douglas
File size:	8266 byte(s)
Log Message:	Embedded DTD into the program and removed it from tree. Added usage() to bad argument handlers.
#	Content
1	/* ============================================================================
2	* Douglas Thrift's Search Engine License
3	*
4	* Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions are met:
7	*
8	* 1. Redistributions of source code must retain the above copyright notice,
9	* this list of conditions and the following disclaimer.
10	*
11	* 2. Redistributions in binary form must reproduce the above copyright notice,
12	* this list of conditions and the following disclaimer in the documentation
13	* and/or other materials provided with the distribution.
14	*
15	* 3. The end-user documentation included with the redistribution, if any, must
16	* include the following acknowledgment:
17	*
18	* "This product includes software developed by Douglas Thrift
19	* (http://computers.douglasthrift.net/searchengine/)."
20	*
21	* Alternately, this acknowledgment may appear in the software itself, if
22	* and wherever such third-party acknowledgments normally appear.
23	*
24	* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25	* be used to endorse or promote products derived from this software without
26	* specific prior written permission. For written permission, please visit
27	* http://www.douglasthrift.net/contact.cgi for contact information.
28	*
29	* 5. Products derived from this software may not be called "Douglas Thrift's
30	* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31	* name, without prior written permission.
32	*
33	* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35	* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39	* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42	* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43	* ============================================================================
44	*/
45	// Douglas Thrift's Search Engine Indexer
46	//
47	// Douglas Thrift
48	//
49	// Indexer.cpp
50
51	#include "Indexer.h"
52
53	Indexer::Indexer(string& indexFile, set<string>& domains,
54	set<string>& restrictions)
55	{
56	this->indexFile = indexFile;
57	this->domains = domains;
58	this->restrictions = restrictions;
59	}
60
61	void Indexer::index(string& begin)
62	{
63	unsigned separator = indexFile.rfind(slash);
64	string dtd = separator != string::npos ? indexFile.substr(0, separator) +
65	slash + "index.dtd" : "index.dtd";
66
67	ifstream fin(dtd.c_str());
68
69	if (!fin.is_open())
70	{
71	ofstream fout(dtd.c_str());
72
73	fout << "<!ELEMENT index (page*)>\n"
74	<< "<!ELEMENT page (address, port?, path, title?, description?, ke"
75	<< "ywords?, text,\n"
76	<< " heading*)\n"
77	<< ">\n"
78	<< "<!ELEMENT address (#PCDATA)>\n"
79	<< "<!ELEMENT port (#PCDATA)>\n"
80	<< "<!ELEMENT path (#PCDATA)>\n"
81	<< "<!ELEMENT size (#PCDATA)>\n"
82	<< "<!ELEMENT title (#PCDATA)>\n"
83	<< "<!ELEMENT description (#PCDATA)>\n"
84	<< "<!ELEMENT text (#PCDATA)>\n"
85	<< "<!ELEMENT heading (#PCDATA)>\n";
86
87	fout.close();
88	}
89
90	fin.close();
91
92	ofstream fout(indexFile.c_str());
93
94	fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
95	<< "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
96	<< "<index>\n";
97
98	URL first(begin);
99
100	index(first, fout);
101
102	fout << "</index>\n";
103
104	fout.close();
105	}
106
107	void Indexer::index(URL& url, ofstream& fout, const string referer)
108	{
109	if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
110	url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
111	pages.end())
112	{
113	if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
114	url.getPort() : "") == checked.end())
115	{
116	robots(url);
117	}
118
119	if (!restricted(url))
120	{
121	if (http.handle(url, referer, true))
122	{
123	if (http.contentType().find("text/plain") == 0 \|\|
124	http.contentType().find("text/html") == 0)
125	{
126	http.clear();
127	if (!http.handle(url, referer)) exit(1);
128
129	cout << "Indexing " << url << "..." << flush;
130
131	if (processor.process(http, url))
132	{
133	Page page = processor.getPage();
134	fout << page << "\n";
135
136	cout << "done.\n";
137	}
138	else
139	{
140	cout << "canceled.\n";
141	}
142
143	pages.insert(url.getURL());
144	Set pageLinks = processor.getLinks();
145	processor.reset();
146
147	for (SetIterator link = pageLinks.begin(); link !=
148	pageLinks.end(); link++)
149	{
150	if (pages.find(*link) == pages.end())
151	{
152	links.push(URL(*link));
153	referers.push(url.getURL());
154	}
155	}
156	}
157	else
158	{
159	// unhandled content
160	}
161	}
162	else if (http.redirect() != "")
163	{
164	if (pages.find(http.redirect()) == pages.end())
165	{
166	links.push(URL(http.redirect()));
167	referers.push(url.getURL());
168	}
169	}
170
171	http.clear();
172	}
173	}
174
175	if (!links.empty())
176	{
177	URL next = links.front();
178	links.pop();
179
180	string referer = referers.front();
181	referers.pop();
182
183	if (debug) cerr << "next = " << next << "\n";
184
185	index(next, fout, referer);
186	}
187	}
188
189	bool Indexer::restricted(URL& url)
190	{
191	bool answer = false;
192
193	for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
194	itor++)
195	{
196	URL checker = *itor;
197
198	if (url.getAddress() == checker.getAddress() && url.getPort() ==
199	checker.getPort())
200	{
201	if (url.getPath().find(checker.getPath()) == 0)
202	{
203	answer = true;
204	break;
205	}
206	}
207	}
208
209	return answer;
210	}
211
212	void Indexer::robots(URL& url)
213	{
214	URL robots = url;
215	robots.setPath("/robots.txt");
216
217	if (http.handle(robots))
218	{
219	cout << "Checking " << robots << "..." << flush;
220
221	string line;
222
223	bool record = false, hasVersion = false, hasName = false, hasAll =
224	false;
225	robot state = none;
226	Set restrictionsVersion, restrictionsName, restrictionsAll;
227
228	while (http.good())
229	{
230	http.getline(line);
231
232	unsigned comment = line.find('#');
233	if (comment != string::npos) line.erase(comment);
234
235	if (line == "" && comment == string::npos) record = false;
236	if (line == "") continue;
237
238	unsigned colon = line.find(':');
239
240	string field = line.substr(0, colon);
241	string value = line.substr(colon + 1);
242
243	normalize(value);
244
245	if (field == "User-agent" && value == agent(true))
246	{
247	state = version;
248	record = true;
249	hasVersion = true;
250	}
251	else if (field == "User-agent" && value == agent(false))
252	{
253	state = name;
254	record = true;
255	hasName = true;
256	}
257	else if (field == "User-agent" && value == "*")
258	{
259	state = all;
260	record = true;
261	hasAll = true;
262	}
263	else if (field == "Disallow" && record && value == "")
264	{
265	// no restrictions
266	}
267	else if (field == "Disallow" && record)
268	{
269	URL restriction = robots;
270	restriction.setPath(value);
271
272	switch (state)
273	{
274	case version:
275	restrictionsVersion.insert(restriction.getURL());
276	break;
277	case name:
278	restrictionsName.insert(restriction.getURL());
279	break;
280	case all:
281	restrictionsAll.insert(restriction.getURL());
282	break;
283	default:
284	break;
285	}
286	}
287	}
288
289	if (hasVersion)
290	{
291	state = version;
292	}
293	else if (hasName)
294	{
295	state = name;
296	}
297	else if (hasAll)
298	{
299	state = all;
300	}
301	else
302	{
303	state = none;
304	}
305
306	SetIterator itor;
307	switch (state)
308	{
309	case version:
310	for (itor = restrictionsVersion.begin(); itor !=
311	restrictionsVersion.end(); itor++)
312	{
313	restrictions.insert(*itor);
314	}
315	break;
316	case name:
317	for (itor = restrictionsName.begin(); itor !=
318	restrictionsName.end(); itor++)
319	{
320	restrictions.insert(*itor);
321	}
322	break;
323	case all:
324	for (itor = restrictionsAll.begin(); itor !=
325	restrictionsAll.end(); itor++)
326	{
327	restrictions.insert(*itor);
328	}
329	break;
330	default:
331	break;
332	}
333
334	cout << "done.\n";
335	}
336
337	http.clear();
338
339	checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
340	url.getPort() : "");
341	}