trunk/Search/Indexer.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Indexer
//
// Douglas Thrift
//
// $Id: Indexer.cpp,v 1.11 2003/07/11 07:54:46 douglas Exp $

#include "Indexer.h"

#include <sstream>

#ifndef _WIN32
#include <unistd.h>
#endif // _WIN32

// Builds an indexer that keeps its own copies of the output file name, the
// set of domains it may crawl and the initial set of restricted URLs.
Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
        restrictions) :
        indexFile(indexFile),
        domains(domains),
        restrictions(restrictions)
{
        // every member is copied from the like-named parameter above
}

// Crawls starting at begin and writes the XML index to indexFile.
//
// begin - address of the first page to visit; it is wrapped in a URL and
//         handed to index(URL&, ofstream&, ...)
//
// Steps, in order:
//   1. create index.dtd next to indexFile if it cannot be opened for reading
//   2. create "<indexFile>.lock", which exists for the duration of the crawl
//   3. write the XML prologue, crawl, write the closing tag
//   4. remove the lock file
void Indexer::index(string& begin)
{
        // Must be string::size_type: an unsigned that is narrower than size_t
        // can never compare equal to string::npos, so the "no separator" case
        // would be missed and substr() would return the whole file name.
        string::size_type separator = indexFile.rfind(slash);
        string dtd = separator != string::npos ? indexFile.substr(0, separator) +
                slash + "index.dtd" : "index.dtd";

        // opening for reading is only used as an "does it exist?" probe
        ifstream fin(dtd.c_str());

        if (!fin.is_open())
        {
                ofstream fout(dtd.c_str());

                // NOTE(review): the page content model names "keywords" but no
                // keywords element is declared, and "size" is declared but not
                // part of page -- confirm against what Page actually writes.
                fout << "<!ELEMENT index (page*)>\n"
                        << "<!ELEMENT page (address, port?, path, title?, description?, ke"
                        << "ywords?, text,\n"
                        << "    heading*)\n"
                        << ">\n"
                        << "<!ELEMENT address (#PCDATA)>\n"
                        << "<!ELEMENT port (#PCDATA)>\n"
                        << "<!ELEMENT path (#PCDATA)>\n"
                        << "<!ELEMENT size (#PCDATA)>\n"
                        << "<!ELEMENT title (#PCDATA)>\n"
                        << "<!ELEMENT description (#PCDATA)>\n"
                        << "<!ELEMENT text (#PCDATA)>\n"
                        << "<!ELEMENT heading (#PCDATA)>\n";

                fout.close();
        }

        fin.close();

        string lock = indexFile + ".lock";

        // creating and immediately closing the stream leaves an empty lock file
        ofstream fout(lock.c_str());
        fout.close();
        fout.open(indexFile.c_str());

        fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
                << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
                << "<index>\n";

        URL first(begin);

        index(first, fout);

        fout << "</index>\n";

        fout.close();

        unlink(lock.c_str());
}

// Visits url and then every URL waiting in the links queue, in the order they
// were queued, writing each successfully processed Page to fout.
//
// url     - first URL to visit
// fout    - open stream that receives each processed Page
// referer - referer handed to the HTTP handler for the first URL; queued URLs
//           use the referer stored alongside them in referers
//
// A URL is fetched only when its host key ("address" or "address:port" for a
// port other than 80) is in domains, its full URL is not yet in pages, and
// restricted() does not match it. robots() is consulted once per host key.
//
// The queue is drained with a loop rather than one recursive call per URL so
// that the stack depth does not grow with the number of pages crawled.
void Indexer::index(URL& url, ofstream& fout, const string referer)
{
        URL current = url;
        string currentReferer = referer;

        for (;;)
        {
                // Build the key explicitly. Writing ":" + getPort() with an
                // integer port is pointer arithmetic on the literal, not string
                // concatenation.
                string host = current.getAddress();

                if (current.getPort() != 80)
                {
                        std::ostringstream port;
                        port << current.getPort();
                        host += ":" + port.str();
                }

                if (domains.find(host) != domains.end() &&
                        pages.find(current.getURL()) == pages.end())
                {
                        if (checked.find(host) == checked.end())
                        {
                                robots(current);
                        }

                        if (!restricted(current))
                        {
                                // first request only inspects the content type
                                // (presumably a HEAD-style probe -- confirm in
                                // the HTTP handler)
                                if (http.handle(current, currentReferer, true))
                                {
                                        if (http.contentType().find("text/plain") == 0 ||
                                                http.contentType().find("text/html") == 0)
                                        {
                                                http.clear();
                                                if (!http.handle(current, currentReferer)) exit(1);

                                                cout << "Indexing " << current << "..." << flush;

                                                if (processor.process(http, current))
                                                {
                                                        Page page = processor.getPage();
                                                        fout << page << "\n";

                                                        cout << "done.\n";
                                                }
                                                else
                                                {
                                                        cout << "canceled.\n";
                                                }

                                                pages.insert(current.getURL());
                                                Set pageLinks = processor.getLinks();
                                                processor.reset();

                                                // queue every link not indexed yet, remembering
                                                // this page as its referer
                                                for (SetIterator link = pageLinks.begin(); link !=
                                                        pageLinks.end(); ++link)
                                                {
                                                        if (pages.find(*link) == pages.end())
                                                        {
                                                                links.push(URL(*link));
                                                                referers.push(current.getURL());
                                                        }
                                                }
                                        }
                                        else
                                        {
                                                // unhandled content
                                        }
                                }
                                else if (http.redirect() != "")
                                {
                                        // follow the redirect later, as if it were a link
                                        if (pages.find(http.redirect()) == pages.end())
                                        {
                                                links.push(URL(http.redirect()));
                                                referers.push(current.getURL());
                                        }
                                }

                                http.clear();
                        }
                }

                if (links.empty())
                {
                        break;
                }

                // links and referers are pushed and popped in lock step
                current = links.front();
                links.pop();

                currentReferer = referers.front();
                referers.pop();

                if (debug) cerr << "next = " << current << "\n";
        }
}

// Reports whether url is covered by one of the stored restrictions: same
// address, same port, and the restriction's path is a prefix of url's path.
bool Indexer::restricted(URL& url)
{
        SetIterator rule = restrictions.begin();

        while (rule != restrictions.end())
        {
                URL blocked = *rule;

                const bool sameHost = url.getAddress() == blocked.getAddress() &&
                        url.getPort() == blocked.getPort();

                // find() == 0 means the restricted path starts url's path
                if (sameHost && url.getPath().find(blocked.getPath()) == 0)
                {
                        return true;
                }

                ++rule;
        }

        return false;
}

// Fetches /robots.txt from url's host and adds the Disallow paths that apply
// to this crawler to restrictions, then records the host key in checked.
//
// url - any URL on the host; only its path is replaced for the request
//
// Disallow lines are collected separately for three kinds of User-agent
// value: agent(true), agent(false) and "*". Only one group is merged into
// restrictions, chosen in that order of preference. A blank line ends the
// current record; a line that held only a comment does not.
void Indexer::robots(URL& url)
{
        URL robotsUrl = url;
        robotsUrl.setPath("/robots.txt");

        if (http.handle(robotsUrl))
        {
                cout << "Checking " << robotsUrl << "..." << flush;

                string line;

                bool record = false, hasVersion = false, hasName = false, hasAll =
                        false;
                robot state = none;
                Set restrictionsVersion, restrictionsName, restrictionsAll;

                while (http.good())
                {
                        http.getline(line);

                        // string::size_type, not unsigned: a narrower type never
                        // equals string::npos, which would make erase() run with
                        // an out-of-range position on every comment-free line.
                        string::size_type comment = line.find('#');
                        if (comment != string::npos) line.erase(comment);

                        if (line == "" && comment == string::npos) record = false;
                        if (line == "") continue;

                        string::size_type colon = line.find(':');

                        // not a "field: value" line; nothing to interpret
                        if (colon == string::npos) continue;

                        string field = line.substr(0, colon);
                        string value = line.substr(colon + 1);

                        normalize(value);

                        if (field == "User-agent" && value == agent(true))
                        {
                                state = version;
                                record = true;
                                hasVersion = true;
                        }
                        else if (field == "User-agent" && value == agent(false))
                        {
                                state = name;
                                record = true;
                                hasName = true;
                        }
                        else if (field == "User-agent" && value == "*")
                        {
                                state = all;
                                record = true;
                                hasAll = true;
                        }
                        else if (field == "Disallow" && record && value == "")
                        {
                                // no restrictions
                        }
                        else if (field == "Disallow" && record)
                        {
                                URL restriction = robotsUrl;
                                restriction.setPath(value);

                                switch (state)
                                {
                                case version:
                                        restrictionsVersion.insert(restriction.getURL());
                                        break;
                                case name:
                                        restrictionsName.insert(restriction.getURL());
                                        break;
                                case all:
                                        restrictionsAll.insert(restriction.getURL());
                                        break;
                                default:
                                        break;
                                }
                        }
                }

                // pick the single group that applies: agent(true) match first,
                // then agent(false), then the "*" wildcard
                Set* chosen = 0;

                if (hasVersion)
                {
                        chosen = &restrictionsVersion;
                }
                else if (hasName)
                {
                        chosen = &restrictionsName;
                }
                else if (hasAll)
                {
                        chosen = &restrictionsAll;
                }

                if (chosen != 0)
                {
                        restrictions.insert(chosen->begin(), chosen->end());
                }

                cout << "done.\n";
        }

        http.clear();

        // Same key format index() looks up: "address", or "address:port" when
        // the port is not 80. Built explicitly because ":" + getPort() with an
        // integer port is pointer arithmetic on the literal.
        string host = url.getAddress();

        if (url.getPort() != 80)
        {
                std::ostringstream port;
                port << url.getPort();
                host += ":" + port.str();
        }

        checked.insert(host);
}
Revision:	194
Committed:	2003-07-11T00:54:47-07:00 (21 years, 11 months ago) by douglas
File size:	8467 byte(s)
Log Message:	Added Id tags to a bunch of files. $Id$!
#	Content
1	/* ============================================================================
2	* Douglas Thrift's Search Engine License
3	*
4	* Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions are met:
7	*
8	* 1. Redistributions of source code must retain the above copyright notice,
9	* this list of conditions and the following disclaimer.
10	*
11	* 2. Redistributions in binary form must reproduce the above copyright notice,
12	* this list of conditions and the following disclaimer in the documentation
13	* and/or other materials provided with the distribution.
14	*
15	* 3. The end-user documentation included with the redistribution, if any, must
16	* include the following acknowledgment:
17	*
18	* "This product includes software developed by Douglas Thrift
19	* (http://computers.douglasthrift.net/searchengine/)."
20	*
21	* Alternately, this acknowledgment may appear in the software itself, if
22	* and wherever such third-party acknowledgments normally appear.
23	*
24	* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25	* be used to endorse or promote products derived from this software without
26	* specific prior written permission. For written permission, please visit
27	* http://www.douglasthrift.net/contact.cgi for contact information.
28	*
29	* 5. Products derived from this software may not be called "Douglas Thrift's
30	* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31	* name, without prior written permission.
32	*
33	* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35	* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39	* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42	* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43	* ============================================================================
44	*/
45	// Douglas Thrift's Search Engine Indexer
46	//
47	// Douglas Thrift
48	//
49	// $Id: Indexer.cpp,v 1.11 2003/07/11 07:54:46 douglas Exp $
50
51	#include "Indexer.h"
52
53	#ifndef _WIN32
54	#include <unistd.h>
55	#endif // _WIN32
56
57	Indexer::Indexer(string& indexFile, set<string>& domains, set<string>&
58	restrictions)
59	{
60	this->indexFile = indexFile;
61	this->domains = domains;
62	this->restrictions = restrictions;
63	}
64
65	void Indexer::index(string& begin)
66	{
67	unsigned separator = indexFile.rfind(slash);
68	string dtd = separator != string::npos ? indexFile.substr(0, separator) +
69	slash + "index.dtd" : "index.dtd";
70
71	ifstream fin(dtd.c_str());
72
73	if (!fin.is_open())
74	{
75	ofstream fout(dtd.c_str());
76
77	fout << "<!ELEMENT index (page*)>\n"
78	<< "<!ELEMENT page (address, port?, path, title?, description?, ke"
79	<< "ywords?, text,\n"
80	<< " heading*)\n"
81	<< ">\n"
82	<< "<!ELEMENT address (#PCDATA)>\n"
83	<< "<!ELEMENT port (#PCDATA)>\n"
84	<< "<!ELEMENT path (#PCDATA)>\n"
85	<< "<!ELEMENT size (#PCDATA)>\n"
86	<< "<!ELEMENT title (#PCDATA)>\n"
87	<< "<!ELEMENT description (#PCDATA)>\n"
88	<< "<!ELEMENT text (#PCDATA)>\n"
89	<< "<!ELEMENT heading (#PCDATA)>\n";
90
91	fout.close();
92	}
93
94	fin.close();
95
96	string lock = indexFile + ".lock";
97
98	ofstream fout(lock.c_str());
99	fout.close();
100	fout.open(indexFile.c_str());
101
102	fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
103	<< "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
104	<< "<index>\n";
105
106	URL first(begin);
107
108	index(first, fout);
109
110	fout << "</index>\n";
111
112	fout.close();
113
114	unlink(lock.c_str());
115	}
116
117	void Indexer::index(URL& url, ofstream& fout, const string referer)
118	{
119	if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" +
120	url.getPort() : "") != domains.end() && pages.find(url.getURL()) ==
121	pages.end())
122	{
123	if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" +
124	url.getPort() : "") == checked.end())
125	{
126	robots(url);
127	}
128
129	if (!restricted(url))
130	{
131	if (http.handle(url, referer, true))
132	{
133	if (http.contentType().find("text/plain") == 0 ||
134	http.contentType().find("text/html") == 0)
135	{
136	http.clear();
137	if (!http.handle(url, referer)) exit(1);
138
139	cout << "Indexing " << url << "..." << flush;
140
141	if (processor.process(http, url))
142	{
143	Page page = processor.getPage();
144	fout << page << "\n";
145
146	cout << "done.\n";
147	}
148	else
149	{
150	cout << "canceled.\n";
151	}
152
153	pages.insert(url.getURL());
154	Set pageLinks = processor.getLinks();
155	processor.reset();
156
157	for (SetIterator link = pageLinks.begin(); link !=
158	pageLinks.end(); link++)
159	{
160	if (pages.find(*link) == pages.end())
161	{
162	links.push(URL(*link));
163	referers.push(url.getURL());
164	}
165	}
166	}
167	else
168	{
169	// unhandled content
170	}
171	}
172	else if (http.redirect() != "")
173	{
174	if (pages.find(http.redirect()) == pages.end())
175	{
176	links.push(URL(http.redirect()));
177	referers.push(url.getURL());
178	}
179	}
180
181	http.clear();
182	}
183	}
184
185	if (!links.empty())
186	{
187	URL next = links.front();
188	links.pop();
189
190	string referer = referers.front();
191	referers.pop();
192
193	if (debug) cerr << "next = " << next << "\n";
194
195	index(next, fout, referer);
196	}
197	}
198
199	bool Indexer::restricted(URL& url)
200	{
201	bool answer = false;
202
203	for (SetIterator itor = restrictions.begin(); itor != restrictions.end();
204	itor++)
205	{
206	URL checker = *itor;
207
208	if (url.getAddress() == checker.getAddress() && url.getPort() ==
209	checker.getPort())
210	{
211	if (url.getPath().find(checker.getPath()) == 0)
212	{
213	answer = true;
214	break;
215	}
216	}
217	}
218
219	return answer;
220	}
221
222	void Indexer::robots(URL& url)
223	{
224	URL robots = url;
225	robots.setPath("/robots.txt");
226
227	if (http.handle(robots))
228	{
229	cout << "Checking " << robots << "..." << flush;
230
231	string line;
232
233	bool record = false, hasVersion = false, hasName = false, hasAll =
234	false;
235	robot state = none;
236	Set restrictionsVersion, restrictionsName, restrictionsAll;
237
238	while (http.good())
239	{
240	http.getline(line);
241
242	unsigned comment = line.find('#');
243	if (comment != string::npos) line.erase(comment);
244
245	if (line == "" && comment == string::npos) record = false;
246	if (line == "") continue;
247
248	unsigned colon = line.find(':');
249
250	string field = line.substr(0, colon);
251	string value = line.substr(colon + 1);
252
253	normalize(value);
254
255	if (field == "User-agent" && value == agent(true))
256	{
257	state = version;
258	record = true;
259	hasVersion = true;
260	}
261	else if (field == "User-agent" && value == agent(false))
262	{
263	state = name;
264	record = true;
265	hasName = true;
266	}
267	else if (field == "User-agent" && value == "*")
268	{
269	state = all;
270	record = true;
271	hasAll = true;
272	}
273	else if (field == "Disallow" && record && value == "")
274	{
275	// no restrictions
276	}
277	else if (field == "Disallow" && record)
278	{
279	URL restriction = robots;
280	restriction.setPath(value);
281
282	switch (state)
283	{
284	case version:
285	restrictionsVersion.insert(restriction.getURL());
286	break;
287	case name:
288	restrictionsName.insert(restriction.getURL());
289	break;
290	case all:
291	restrictionsAll.insert(restriction.getURL());
292	break;
293	default:
294	break;
295	}
296	}
297	}
298
299	if (hasVersion)
300	{
301	state = version;
302	}
303	else if (hasName)
304	{
305	state = name;
306	}
307	else if (hasAll)
308	{
309	state = all;
310	}
311	else
312	{
313	state = none;
314	}
315
316	SetIterator itor;
317	switch (state)
318	{
319	case version:
320	for (itor = restrictionsVersion.begin(); itor !=
321	restrictionsVersion.end(); itor++)
322	{
323	restrictions.insert(*itor);
324	}
325	break;
326	case name:
327	for (itor = restrictionsName.begin(); itor !=
328	restrictionsName.end(); itor++)
329	{
330	restrictions.insert(*itor);
331	}
332	break;
333	case all:
334	for (itor = restrictionsAll.begin(); itor !=
335	restrictionsAll.end(); itor++)
336	{
337	restrictions.insert(*itor);
338	}
339	break;
340	default:
341	break;
342	}
343
344	cout << "done.\n";
345	}
346
347	http.clear();
348
349	checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
350	url.getPort() : "");
351	}