trunk/Search/Processor.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Processor
//
// Douglas Thrift
//
// Processor.cpp

#include "Processor.h"

// Builds a processor that starts out owning one freshly allocated Page.
Processor::Processor()
{
        this->page = new Page();
}

// Frees the Page this processor allocated.
Processor::~Processor()
{
        delete this->page;
}

// Reads one response from http and stores what was found in the owned Page:
// the URL, title, description, text and headings.
//
// html() is consulted first.  When it reports HTML, the body is handed to the
// tag-scanning overload of process(); the collected strings then have the
// five supported character entities decoded and are passed through
// normalize().  Otherwise every remaining line is taken verbatim as text.
//
// Returns false (leaving the Page untouched) when the tag-scanning overload
// returns false; returns true in every other case.
bool Processor::process(HttpHandler& http, URL& url)
{
        string title, description, text;
        vector<string> headings;

        if (html(http))
        {
                if (!process(http, url, title, description, text, headings))
                        return false;

                // "&amp;" is handled last in each group -- presumably so that
                // the '&' it yields cannot start another entity that would
                // then be decoded a second time.
                entities(title, "&nbsp;", ' ');
                entities(title, "&lt;", '<');
                entities(title, "&gt;", '>');
                entities(title, "&quot;", '\"');
                entities(title, "&amp;", '&');

                entities(description, "&nbsp;", ' ');
                entities(description, "&lt;", '<');
                entities(description, "&gt;", '>');
                entities(description, "&quot;", '\"');
                entities(description, "&amp;", '&');

                entities(text, "&nbsp;", ' ');
                entities(text, "&lt;", '<');
                entities(text, "&gt;", '>');
                entities(text, "&quot;", '\"');
                entities(text, "&amp;", '&');

                // size_type keeps the index the same signedness and width as
                // headings.size().
                for (vector<string>::size_type index = 0; index <
                        headings.size(); index++)
                {
                        entities(headings[index], "&nbsp;", ' ');
                        entities(headings[index], "&lt;", '<');
                        entities(headings[index], "&gt;", '>');
                        entities(headings[index], "&quot;", '\"');
                        entities(headings[index], "&amp;", '&');
                }

                normalize(title);
                normalize(description);
                normalize(text);
                for (vector<string>::size_type index0 = 0; index0 <
                        headings.size(); index0++)
                {
                        normalize(headings[index0]);
                }
        }
        else
        {
                // A size above zero means html() already took it from a
                // Content-Length header; otherwise it is summed up here.
                bool knowSize = page->getSize() > 0;

                string line;
                while (http.good())
                {
                        http.getline(line);

                        text += line + "\n";

                        // + 1 accounts for the line terminator
                        if (!knowSize) page->setSize(page->getSize() +
                                line.length() + 1);
                }

                normalize(text);
        }

        page->setURL(url);
        page->setTitle(title);
        page->setDescription(description);
        page->setText(text);
        page->setHeadings(headings);

        return true;
}

void Processor::reset()
{
        links.clear();
        delete page;
        page = new Page();
}

bool Processor::process(HttpHandler& http, URL& url, string& title, string&
        description, string& text, vector<string>& headings)
{
        bool inHtml = false, inHead = false, inTitle = false, inBody = false,
                inHeading = false, inComment = false, knowSize = page->getSize() > 0,
                follow = true, answer = true;
        unsigned startComment = 0, finishComment = 0;
        string line;
        while (http.good())
        {
                http.getline(line);
                string heading;

                unsigned begin = 0;
                while (begin < line.length())
                {
                        unsigned open = line.find('<', begin);
                        unsigned close = line.find('>', begin);

                        string next;
                        while (close == string::npos)
                        {
                                http.getline(next);
                                line += '\n' + next;
                                close = line.find('>', begin);
                        }

                        // strangely this is necessary sometimes
                        if (open == string::npos) open = line.find('<', begin);

                        string between = line.substr(begin, open - begin);
                        string tag = getTag(line, open, close);
                        string lowerTag(tag.length(), ' ');

                        for (unsigned index = 0; index < tag.length(); index++)
                        {
                                lowerTag[index] = tolower(tag[index]);
                        }

                        if (inHtml && !inComment)
                        {
                                if (inHead && inTitle)
                                {
                                        title += between + "\n";
                                }

                                if (inBody)
                                {
                                        text += between + "\n";
                                }

                                if (inBody && inHeading)
                                {
                                        heading += between + "\n";
                                }
                                if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
                                        == 0) || (lowerTag.find("meta   ") == 0)) && inHead)
                                {
                                        if (lowerTag.find("name=robots") != string::npos ||
                                                lowerTag.find("name=\"robots\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("content=\"") + 9;
                                                unsigned finish = lowerTag.find('\"', start);

                                                string robots = lowerTag.substr(start, finish - start);

                                                if ((robots.find("noindex") != string::npos &&
                                                        robots.find("nofollow") != string::npos) ||
                                                        robots.find("none") != string::npos)
                                                {
                                                        answer = false;
                                                        follow = false;
                                                        links.clear();

                                                        return answer;
                                                }
                                                else if (robots.find("noindex") != string::npos)
                                                {
                                                        answer = false;
                                                }
                                                else if (robots.find("nofollow") != string::npos)
                                                {
                                                        follow = false;
                                                        links.clear();
                                                }
                                        }
                                        else if (lowerTag.find("name=description") != string::npos
                                                || lowerTag.find("name=\"description\"") !=
                                                string::npos)
                                        {
                                                unsigned start = lowerTag.find("content=\"") + 9;
                                                unsigned finish = lowerTag.find('\"', start);

                                                description = tag.substr(start, finish - start);
                                        }
                                }

                                if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
                                        || (lowerTag.find("a    ") == 0)) && inBody && follow)
                                {
                                        if (lowerTag.find("href=\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("href=\"") + 6;
                                                unsigned finish = lowerTag.find('\"', start);

                                                string link = getLink(tag.substr(start, finish -
                                                        start), url);

                                                if (link != "bad link") links.insert(link);
                                        }
                                        else if (lowerTag.find("href=") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("href=") + 5;
                                                unsigned finish = lowerTag.find(' ', start);

                                                if (finish < close)
                                                {
                                                        string link = getLink(tag.substr(start, finish -
                                                                start), url);

                                                        if (link != "bad link") links.insert(link);
                                                }
                                                else
                                                {
                                                        string link = getLink(tag.substr(start, close -
                                                                start), url);

                                                        if (link != "bad link") links.insert(link);
                                                }
                                        }
                                }

                                if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
                                        0) || (lowerTag.find("img       ")) && inBody)
                                {
                                        if (lowerTag.find("alt=\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("alt=\"") + 5;
                                                unsigned finish = lowerTag.find('\"', start);

                                                text += tag.substr(start, finish - start) + ' ';
                                                if (inHeading) heading += tag.substr(start, finish -
                                                        start) + ' ';
                                        }
                                        else if (lowerTag.find("alt=") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("alt=") + 4;
                                                unsigned finish = lowerTag.find(' ', start);

                                                if (finish < close)
                                                {
                                                        text += tag.substr(start, finish - start) + ' ';
                                                        if (inHeading) heading += tag.substr(start, finish
                                                                - start) + ' ';
                                                }
                                                else
                                                {
                                                        text += tag.substr(start, close - start) + ' ';
                                                        if (inHeading) heading += tag.substr(start, close -
                                                                start) + ' ';
                                                }
                                        }
                                }
                        }

                        if (lowerTag.find("html") == 0) inHtml = true;
                        if (lowerTag.find("/html") == 0) inHtml = false;

                        if (lowerTag.find("head") == 0) inHead = true;
                        if (lowerTag.find("/head") == 0) inHead = false;

                        if (lowerTag.find("title") == 0) inTitle = true;
                        if (lowerTag.find("/title") == 0) inTitle = false;

                        if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
                                inBody = true;
                        if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
                                inBody = false;

                        if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
                                lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
                                lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
                                inHeading = true;
                        if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
                                lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
                                lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
                        {
                                if (heading != "") headings.push_back(heading);
                                inHeading = false;
                        }

                        if (lowerTag.find("!--") == 0)
                        {
                                startComment = open;
                                inComment = true;
                        }
                        if (line.find("-->", begin) >= startComment && line.find("-->",
                                begin) != string::npos)
                        {
                                finishComment = line.find("-->", begin) + 3;
                                inComment = false;
                        }

                        if (close == string::npos)
                        {
                                begin = close;
                        }
                        else
                        {
                                begin = close + 1;
                        }
                }

                startComment = 0;
                finishComment = 0;

                if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
        }

        return answer;
}

// Consumes the response headers from http and reports whether the
// Content-Type header announces "text/html".  A Content-Length header, when
// present, is stored as the page size.
//
// Header names and the media type are matched without regard to case.
// Reading stops at the first header line whose field name comes back empty
// or when http is no longer good.
bool Processor::html(HttpHandler& http)
{
        bool answer = false;

        // The first line is read and discarded -- presumably the status line.
        string line;
        http.getline(line);

        while (http.good())
        {
                // field name up to the first space, then the rest of the line
                string field;
                http.getline(field, ' ');
                if (field == "") break;
                http.getline(line);

                // HTTP field names are case-insensitive
                for (string::size_type index = 0; index < field.length();
                        index++)
                {
                        field[index] = static_cast<char>(tolower(
                                static_cast<unsigned char>(field[index])));
                }

                if (field == "content-type:")
                {
                        string type(line);
                        for (string::size_type index = 0; index <
                                type.length(); index++)
                        {
                                type[index] = static_cast<char>(tolower(
                                        static_cast<unsigned char>(
                                        type[index])));
                        }

                        if (type.find("text/html") != string::npos)
                        {
                                answer = true;
                        }
                }
                else if (field == "content-length:")
                {
                        // always decimal: base 0 would read a value with a
                        // leading zero as octal
                        page->setSize(strtoul(line.c_str(), 0, 10));
                }
        }

        return answer;
}

// Returns the part of line that lies strictly between index open (the '<')
// and index close (the '>').
string Processor::getTag(const string& line, unsigned open, unsigned close)
{
        const unsigned first = open + 1;
        const unsigned count = close - first;

        return line.substr(first, count);
}

// Turns the href value link, found on the page at url, into an absolute
// "http://" address.  Returns the literal string "bad link" when the link is
// empty or uses any scheme other than plain http.
//
// Handled forms, after any "#fragment" has been removed:
//   http://host/path    returned unchanged
//   //host/path         prefixed with "http:"
//   /path               appended to the scheme and host of url
//   path, ./path,       resolved against the directory of url's path, with
//   ../path             every leading "../" moving one directory up
string Processor::getLink(string link, URL& url)
{
        string hyperlink = "bad link";

        const string::size_type pound = link.find('#');
        if (pound != string::npos) link.erase(pound);

        // a blank link is useless
        if (link == "") return hyperlink;

        // A link carries a scheme when a ':' comes before the first '/' or
        // '?'.  Looking only there keeps "page?to=http://x" relative and
        // recognises schemes written without "//" such as "javascript:".
        const string::size_type mark = link.find_first_of(":/?");
        const bool hasScheme = mark != string::npos && link[mark] == ':';

        if (hasScheme)
        {
                // Only http is crawled.  Everything else stays a bad link,
                // mailto: in particular -- we are not evil spammers!
                if (link.compare(0, 7, "http://") == 0) hyperlink = link;
        }
        else if (link.compare(0, 2, "//") == 0)
        {
                hyperlink = "http:" + link;
        }
        else if (link[0] == '/')
        {
                hyperlink = url.getURL();

                // 7 == length of "http://": the first '/' after it ends the
                // host part.  Without one the whole URL is the host part.
                const string::size_type path = hyperlink.find('/', 7);
                if (path != string::npos) hyperlink.erase(path);

                hyperlink += link;
        }
        else
        {
                hyperlink = url.getURL();
                string path = url.getPath();

                if (path == "")
                {
                        // no path at all: resolve against the root
                        path = "/";
                }
                else
                {
                        // strip the path (and whatever follows it) off the
                        // URL, leaving scheme and host
                        const string::size_type cutoff = hyperlink.rfind(path);
                        if (cutoff != string::npos) hyperlink.erase(cutoff);

                        // keep the directory part, including its final '/'
                        const string::size_type slash = path.rfind('/');
                        if (slash == string::npos) path = "";
                        else path.erase(slash + 1);
                }

                while (link.compare(0, 3, "../") == 0)
                {
                        // path is empty or ends in '/'; drop its last
                        // directory unless it is already the root
                        if (path.length() > 1)
                        {
                                const string::size_type up = path.rfind('/',
                                        path.length() - 2);

                                path.erase(up == string::npos ? 0 : up + 1);
                        }

                        link.erase(0, 3);
                }
                while (link.compare(0, 2, "./") == 0)
                {
                        link.erase(0, 2);
                }

                hyperlink += path + link;
        }

        return hyperlink;
}
Revision:	15
Committed:	2002-12-09T09:46:18-08:00 (22 years, 6 months ago) by douglas
File size:	11976 byte(s)
Log Message:	Figured out and fixed fred problems.
#	Content
1	/* ============================================================================
2	* Douglas Thrift's Search Engine License
3	*
4	* Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions are met:
7	*
8	* 1. Redistributions of source code must retain the above copyright notice,
9	* this list of conditions and the following disclaimer.
10	*
11	* 2. Redistributions in binary form must reproduce the above copyright notice,
12	* this list of conditions and the following disclaimer in the documentation
13	* and/or other materials provided with the distribution.
14	*
15	* 3. The end-user documentation included with the redistribution, if any, must
16	* include the following acknowledgment:
17	*
18	* "This product includes software developed by Douglas Thrift
19	* (http://computers.douglasthrift.net/searchengine/)."
20	*
21	* Alternately, this acknowledgment may appear in the software itself, if
22	* and wherever such third-party acknowledgments normally appear.
23	*
24	* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25	* be used to endorse or promote products derived from this software without
26	* specific prior written permission. For written permission, please visit
27	* http://www.douglasthrift.net/contact.cgi for contact information.
28	*
29	* 5. Products derived from this software may not be called "Douglas Thrift's
30	* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31	* name, without prior written permission.
32	*
33	* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35	* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39	* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42	* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43	* ============================================================================
44	*/
45	// Douglas Thrift's Search Engine Processor
46	//
47	// Douglas Thrift
48	//
49	// Processor.cpp
50
51	#include "Processor.h"
52
53	Processor::Processor()
54	{
55	page = new Page();
56	}
57
58	Processor::~Processor()
59	{
60	delete page;
61	}
62
63	bool Processor::process(HttpHandler& http, URL& url)
64	{
65	string title, description, text;
66	vector<string> headings;
67
68	if (html(http))
69	{
70	if (!process(http, url, title, description, text, headings)) return
71	false;
72
73	entities(title, "&nbsp;", ' ');
74	entities(title, "&lt;", '<');
75	entities(title, "&gt;", '>');
76	entities(title, "&quot;", '\"');
77	entities(title, "&amp;", '&');
78
79	entities(description, "&nbsp;", ' ');
80	entities(description, "&lt;", '<');
81	entities(description, "&gt;", '>');
82	entities(description, "&quot;", '\"');
83	entities(description, "&amp;", '&');
84
85	entities(text, "&nbsp;", ' ');
86	entities(text, "&lt;", '<');
87	entities(text, "&gt;", '>');
88	entities(text, "&quot;", '\"');
89	entities(text, "&amp;", '&');
90
91	for (int index = 0; index < headings.size(); index++)
92	{
93	entities(headings[index], "&nbsp;", ' ');
94	entities(headings[index], "&lt;", '<');
95	entities(headings[index], "&gt;", '>');
96	entities(headings[index], "&quot;", '\"');
97	entities(headings[index], "&amp;", '&');
98	}
99
100	normalize(title);
101	normalize(description);
102	normalize(text);
103	for (int index0 = 0; index0 < headings.size(); index0++)
104	{
105	normalize(headings[index0]);
106	}
107	}
108	else
109	{
110	bool knowSize = page->getSize() > 0;
111
112	string line;
113	while (http.good())
114	{
115	http.getline(line);
116
117	text += line + "\n";
118
119	if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
120	}
121
122	normalize(text);
123	}
124
125	page->setURL(url);
126	page->setTitle(title);
127	page->setDescription(description);
128	page->setText(text);
129	page->setHeadings(headings);
130
131	return true;
132	}
133
134	void Processor::reset()
135	{
136	links.clear();
137	delete page;
138	page = new Page();
139	}
140
141	bool Processor::process(HttpHandler& http, URL& url, string& title, string&
142	description, string& text, vector<string>& headings)
143	{
144	bool inHtml = false, inHead = false, inTitle = false, inBody = false,
145	inHeading = false, inComment = false, knowSize = page->getSize() > 0,
146	follow = true, answer = true;
147	unsigned startComment = 0, finishComment = 0;
148	string line;
149	while (http.good())
150	{
151	http.getline(line);
152	string heading;
153
154	unsigned begin = 0;
155	while (begin < line.length())
156	{
157	unsigned open = line.find('<', begin);
158	unsigned close = line.find('>', begin);
159
160	string next;
161	while (close == string::npos)
162	{
163	http.getline(next);
164	line += '\n' + next;
165	close = line.find('>', begin);
166	}
167
168	// strangely this is necessary sometimes
169	if (open == string::npos) open = line.find('<', begin);
170
171	string between = line.substr(begin, open - begin);
172	string tag = getTag(line, open, close);
173	string lowerTag(tag.length(), ' ');
174
175	for (unsigned index = 0; index < tag.length(); index++)
176	{
177	lowerTag[index] = tolower(tag[index]);
178	}
179
180	if (inHtml && !inComment)
181	{
182	if (inHead && inTitle)
183	{
184	title += between + "\n";
185	}
186
187	if (inBody)
188	{
189	text += between + "\n";
190	}
191
192	if (inBody && inHeading)
193	{
194	heading += between + "\n";
195	}
196	if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
197	== 0) || (lowerTag.find("meta ") == 0)) && inHead)
198	{
199	if (lowerTag.find("name=robots") != string::npos ||
200	lowerTag.find("name=\"robots\"") != string::npos)
201	{
202	unsigned start = lowerTag.find("content=\"") + 9;
203	unsigned finish = lowerTag.find('\"', start);
204
205	string robots = lowerTag.substr(start, finish - start);
206
207	if ((robots.find("noindex") != string::npos &&
208	robots.find("nofollow") != string::npos) ||
209	robots.find("none") != string::npos)
210	{
211	answer = false;
212	follow = false;
213	links.clear();
214
215	return answer;
216	}
217	else if (robots.find("noindex") != string::npos)
218	{
219	answer = false;
220	}
221	else if (robots.find("nofollow") != string::npos)
222	{
223	follow = false;
224	links.clear();
225	}
226	}
227	else if (lowerTag.find("name=description") != string::npos
228	|| lowerTag.find("name=\"description\"") !=
229	string::npos)
230	{
231	unsigned start = lowerTag.find("content=\"") + 9;
232	unsigned finish = lowerTag.find('\"', start);
233
234	description = tag.substr(start, finish - start);
235	}
236	}
237
238	if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
239	|| (lowerTag.find("a ") == 0)) && inBody && follow)
240	{
241	if (lowerTag.find("href=\"") != string::npos)
242	{
243	unsigned start = lowerTag.find("href=\"") + 6;
244	unsigned finish = lowerTag.find('\"', start);
245
246	string link = getLink(tag.substr(start, finish -
247	start), url);
248
249	if (link != "bad link") links.insert(link);
250	}
251	else if (lowerTag.find("href=") != string::npos)
252	{
253	unsigned start = lowerTag.find("href=") + 5;
254	unsigned finish = lowerTag.find(' ', start);
255
256	if (finish < close)
257	{
258	string link = getLink(tag.substr(start, finish -
259	start), url);
260
261	if (link != "bad link") links.insert(link);
262	}
263	else
264	{
265	string link = getLink(tag.substr(start, close -
266	start), url);
267
268	if (link != "bad link") links.insert(link);
269	}
270	}
271	}
272
273	if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
274	0) || (lowerTag.find("img ")) && inBody)
275	{
276	if (lowerTag.find("alt=\"") != string::npos)
277	{
278	unsigned start = lowerTag.find("alt=\"") + 5;
279	unsigned finish = lowerTag.find('\"', start);
280
281	text += tag.substr(start, finish - start) + ' ';
282	if (inHeading) heading += tag.substr(start, finish -
283	start) + ' ';
284	}
285	else if (lowerTag.find("alt=") != string::npos)
286	{
287	unsigned start = lowerTag.find("alt=") + 4;
288	unsigned finish = lowerTag.find(' ', start);
289
290	if (finish < close)
291	{
292	text += tag.substr(start, finish - start) + ' ';
293	if (inHeading) heading += tag.substr(start, finish
294	- start) + ' ';
295	}
296	else
297	{
298	text += tag.substr(start, close - start) + ' ';
299	if (inHeading) heading += tag.substr(start, close -
300	start) + ' ';
301	}
302	}
303	}
304	}
305
306	if (lowerTag.find("html") == 0) inHtml = true;
307	if (lowerTag.find("/html") == 0) inHtml = false;
308
309	if (lowerTag.find("head") == 0) inHead = true;
310	if (lowerTag.find("/head") == 0) inHead = false;
311
312	if (lowerTag.find("title") == 0) inTitle = true;
313	if (lowerTag.find("/title") == 0) inTitle = false;
314
315	if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
316	inBody = true;
317	if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
318	inBody = false;
319
320	if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
321	lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
322	lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
323	inHeading = true;
324	if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
325	lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
326	lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
327	{
328	if (heading != "") headings.push_back(heading);
329	inHeading = false;
330	}
331
332	if (lowerTag.find("!--") == 0)
333	{
334	startComment = open;
335	inComment = true;
336	}
337	if (line.find("-->", begin) >= startComment && line.find("-->",
338	begin) != string::npos)
339	{
340	finishComment = line.find("-->", begin) + 3;
341	inComment = false;
342	}
343
344	if (close == string::npos)
345	{
346	begin = close;
347	}
348	else
349	{
350	begin = close + 1;
351	}
352	}
353
354	startComment = 0;
355	finishComment = 0;
356
357	if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
358	}
359
360	return answer;
361	}
362
363	bool Processor::html(HttpHandler& http)
364	{
365	bool answer = false;
366
367	string line;
368	http.getline(line);
369
370	while (http.good())
371	{
372	string field;
373	http.getline(field, ' ');
374	if (field == "") break;
375	http.getline(line);
376
377	if (field == "Content-Type:" || field == "Content-type:")
378	{
379	if (line.find("text/html") != string::npos)
380	{
381	answer = true;
382	}
383	}
384
385	if (field == "Content-Length:" || field == "Content-length:")
386	{
387	page->setSize(strtoul(line.c_str(), 0, 0));
388	}
389	}
390
391	return answer;
392	}
393
394	string Processor::getTag(const string& line, unsigned open, unsigned close)
395	{
396	string tag = line.substr(open + 1, close - open - 1);
397
398	return tag;
399	}
400
401	string Processor::getLink(string link, URL& url)
402	{
403	string hyperlink = "bad link";
404
405	if (link.find('#') != string::npos)
406	{
407	unsigned pound = link.find('#');
408	link.erase(pound);
409	}
410
411	if (link.find("://") != string::npos)
412	{
413	if (link.find("http://") == 0) hyperlink = link;
414	}
415	else if (link.find("mailto:") == 0)
416	{
417	// do nothing we are not evil spammers!
418	}
419	else if (link.find("//") == 0)
420	{
421	hyperlink = "http:" + link;
422	}
423	else if (link.find('/') == 0)
424	{
425	hyperlink = url.getURL();
426
427	unsigned path = hyperlink.find('/', 7);
428	hyperlink.erase(path);
429
430	hyperlink += link;
431	}
432	else if (link == "")
433	{
434	// a blank link is useless
435	}
436	else
437	{
438	hyperlink = url.getURL();
439	string path = url.getPath();
440
441	unsigned cutoff = hyperlink.rfind(path);
442	hyperlink.erase(cutoff);
443
444	unsigned dir = path.rfind('/') + 1;
445	path.erase(dir);
446
447	while (link.find("../") == 0)
448	{
449	unsigned dot = path.rfind('/') - 1;
450	unsigned up = path.rfind('/', dot) + 1;
451
452	path.erase(up);
453	link.erase(0, 3);
454	}
455	while (link.find("./") == 0)
456	{
457	link.erase(0, 2);
458	}
459
460	hyperlink += path + link;
461	}
462
463	return hyperlink;
464	}