trunk/Search/Processor.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine Processor
//
// Douglas Thrift
//
// Processor.cpp

#include "Processor.h"

// Allocates the Page object that the process() methods later fill in.
Processor::Processor()
{
        this->page = new Page();
}

// Frees the Page currently held in `page`.
Processor::~Processor()
{
        delete this->page;
}

// Reads one HTTP response from `http` and stores the extracted data in `page`.
//
// html() consumes the response headers first.  When it reports an HTML body,
// the six-argument process() overload parses it; afterwards the title,
// description, text and every heading have their character entities decoded
// and are then passed through normalize().  Any other body is read line by
// line and kept verbatim as the page text (normalized once at the end).
//
// Returns false, without storing the URL, title, description, text or
// headings in `page`, when the HTML overload returns false; returns true
// otherwise.
bool Processor::process(HttpHandler& http, URL& url)
{
        string title, description, text;
        vector<string> headings;

        if (!html(http))
        {
                // Not HTML: accumulate the raw body, counting bytes ourselves
                // when no size has been recorded for the page yet.
                const bool sizeKnown = page->getSize() > 0;

                string line;
                while (http.good())
                {
                        http.getline(line);

                        text += line;
                        text += "\n";

                        if (!sizeKnown)
                        {
                                page->setSize(page->getSize() + line.length() + 1);
                        }
                }

                normalize(text);
        }
        else
        {
                if (!process(http, url, title, description, text, headings))
                {
                        return false;
                }

                // Every extracted string gets the same treatment, in this order:
                // title, description, text, then each heading.
                vector<string*> fragments;
                fragments.push_back(&title);
                fragments.push_back(&description);
                fragments.push_back(&text);
                for (vector<string>::iterator heading = headings.begin();
                        heading != headings.end(); ++heading)
                {
                        fragments.push_back(&(*heading));
                }

                // "&amp;" is decoded last so that an escaped entity such as
                // "&amp;lt;" ends up as the literal text "&lt;".
                for (vector<string*>::iterator each = fragments.begin();
                        each != fragments.end(); ++each)
                {
                        string& fragment = **each;

                        entities(fragment, "&nbsp;", ' ');
                        entities(fragment, "&lt;", '<');
                        entities(fragment, "&gt;", '>');
                        entities(fragment, "&quot;", '\"');
                        entities(fragment, "&amp;", '&');
                }

                for (vector<string*>::iterator each = fragments.begin();
                        each != fragments.end(); ++each)
                {
                        normalize(**each);
                }
        }

        page->setURL(url);
        page->setTitle(title);
        page->setDescription(description);
        page->setText(text);
        page->setHeadings(headings);

        return true;
}

void Processor::reset()
{
        links.clear();
        delete page;
        page = new Page();
}

bool Processor::process(HttpHandler& http, URL& url, string& title, string&
        description, string& text, vector<string>& headings)
{
        bool inHtml = false, inHead = false, inTitle = false, inBody = false,
                inHeading = false, inComment = false, knowSize = page->getSize() > 0,
                follow = true, answer = true;
        unsigned startComment = 0, finishComment = 0;
        string line;
        while (http.good())
        {
                http.getline(line);
                string heading;

                unsigned begin = 0;
                while (begin < line.length())
                {
                        unsigned open = line.find('<', begin);
                        unsigned close = line.find('>', begin);

                        string next;
                        while (close == string::npos)
                        {
                                http.getline(next);
                                line += '\n' + next;
                                close = line.find('>', begin);
                        }

                        // strangely this is necessary sometimes
                        if (open == string::npos) open = line.find('<', begin);

                        string between = line.substr(begin, open - begin);
                        string tag = getTag(line, open, close);
                        string lowerTag(tag.length(), ' ');

                        for (unsigned index = 0; index < tag.length(); index++)
                        {
                                lowerTag[index] = tolower(tag[index]);
                        }

                        if (inHtml && !inComment)
                        {
                                if (inHead && inTitle)
                                {
                                        title += between + "\n";
                                }

                                if (inBody)
                                {
                                        text += between + "\n";
                                }

                                if (inBody && inHeading)
                                {
                                        heading += between + "\n";
                                }
                                if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n")
                                        == 0) || (lowerTag.find("meta   ") == 0)) && inHead)
                                {
                                        if (lowerTag.find("name=robots") != string::npos ||
                                                lowerTag.find("name=\"robots\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("content=\"") + 9;
                                                unsigned finish = lowerTag.find('\"', start);

                                                string robots = lowerTag.substr(start, finish - start);

                                                if ((robots.find("noindex") != string::npos &&
                                                        robots.find("nofollow") != string::npos) ||
                                                        robots.find("none") != string::npos)
                                                {
                                                        answer = false;
                                                        follow = false;
                                                        links.clear();

                                                        return answer;
                                                }
                                                else if (robots.find("noindex") != string::npos)
                                                {
                                                        answer = false;
                                                }
                                                else if (robots.find("nofollow") != string::npos)
                                                {
                                                        follow = false;
                                                        links.clear();
                                                }
                                        }
                                        else if (lowerTag.find("name=description") != string::npos
                                                || lowerTag.find("name=\"description\"") !=
                                                string::npos)
                                        {
                                                unsigned start = lowerTag.find("content=\"") + 9;
                                                unsigned finish = lowerTag.find('\"', start);

                                                description = tag.substr(start, finish - start);
                                        }
                                }

                                if (((lowerTag.find("a ") == 0) || (lowerTag.find("a\n") == 0)
                                        || (lowerTag.find("a    ") == 0)) && inBody && follow)
                                {
                                        if (lowerTag.find("href=\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("href=\"") + 6;
                                                unsigned finish = lowerTag.find('\"', start);

                                                string link = getLink(tag.substr(start, finish -
                                                        start), url);

                                                if (link != "bad link") links.insert(link);
                                        }
                                        else if (lowerTag.find("href=") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("href=") + 5;
                                                unsigned finish = lowerTag.find(' ', start);

                                                if (finish < close)
                                                {
                                                        string link = getLink(tag.substr(start, finish -
                                                                start), url);

                                                        if (link != "bad link") links.insert(link);
                                                }
                                                else
                                                {
                                                        string link = getLink(tag.substr(start, close -
                                                                start), url);

                                                        if (link != "bad link") links.insert(link);
                                                }
                                        }
                                }

                                if ((lowerTag.find("img ") == 0) || (lowerTag.find("img\n") ==
                                        0) || (lowerTag.find("img       ")) && inBody)
                                {
                                        if (lowerTag.find("alt=\"") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("alt=\"") + 5;
                                                unsigned finish = lowerTag.find('\"', start);

                                                text += tag.substr(start, finish - start) + ' ';
                                                if (inHeading) heading += tag.substr(start, finish -
                                                        start) + ' ';
                                        }
                                        else if (lowerTag.find("alt=") != string::npos)
                                        {
                                                unsigned start = lowerTag.find("alt=") + 4;
                                                unsigned finish = lowerTag.find(' ', start);

                                                if (finish < close)
                                                {
                                                        text += tag.substr(start, finish - start) + ' ';
                                                        if (inHeading) heading += tag.substr(start, finish
                                                                - start) + ' ';
                                                }
                                                else
                                                {
                                                        text += tag.substr(start, close - start) + ' ';
                                                        if (inHeading) heading += tag.substr(start, close -
                                                                start) + ' ';
                                                }
                                        }
                                }
                        }

                        if (lowerTag.find("html") == 0) inHtml = true;
                        if (lowerTag.find("/html") == 0) inHtml = false;

                        if (lowerTag.find("head") == 0) inHead = true;
                        if (lowerTag.find("/head") == 0) inHead = false;

                        if (lowerTag.find("title") == 0) inTitle = true;
                        if (lowerTag.find("/title") == 0) inTitle = false;

                        if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0)
                                inBody = true;
                        if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0)
                                inBody = false;

                        if (lowerTag.find("h1") == 0 || lowerTag.find("h2") == 0 ||
                                lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 ||
                                lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0)
                                inHeading = true;
                        if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 ||
                                lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 ||
                                lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0)
                        {
                                if (heading != "") headings.push_back(heading);
                                inHeading = false;
                        }

                        if (lowerTag.find("!--") == 0)
                        {
                                startComment = open;
                                inComment = true;
                        }
                        if (line.find("-->", begin) >= startComment && line.find("-->",
                                begin) != string::npos)
                        {
                                finishComment = line.find("-->", begin) + 3;
                                inComment = false;
                        }

                        if (close == string::npos)
                        {
                                begin = close;
                        }
                        else
                        {
                                begin = close + 1;
                        }
                }

                startComment = 0;
                finishComment = 0;

                if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
        }

        return answer;
}

bool Processor::html(HttpHandler& http)
{
        bool answer = false;

        string line;
        http.getline(line);

        while (http.good())
        {
                string field;
                http.getline(field, ' ');
                if (field == "") break;
                http.getline(line);

                if (field == "Content-Type:" || field == "Content-type:")
                {
                        if (line.find("text/html") != string::npos)
                        {
                                answer = true;
                        }
                }

                if (field == "Content-Length:" || field == "Content-length:")
                {
                        page->setSize(strtoul(line.c_str(), 0, 0));
                }
        }

        return answer;
}

// Returns the characters of `line` strictly between index `open` (the '<')
// and index `close` (the '>').
string Processor::getTag(const string& line, unsigned open, unsigned close)
{
        const unsigned first = open + 1;
        const unsigned length = close - first;

        return string(line, first, length);
}

string Processor::getLink(string link, URL& url)
{
        string hyperlink = "bad link";

        if (link.find('#') != string::npos)
        {
                unsigned pound = link.find('#');
                link.erase(pound);
        }

        if (link.find("://") != string::npos)
        {
                if (link.find("http://") == 0) hyperlink = link;
        }
        else if (link.find("mailto:") == 0)
        {
                // do nothing we are not evil spammers!
        }
        else if (link.find("//") == 0)
        {
                hyperlink = "http:" + link;
        }
        else if (link.find('/') == 0)
        {
                hyperlink = url.getURL();

                unsigned path = hyperlink.find('/', 7);
                hyperlink.erase(path);

                hyperlink += link;
        }
        else if (link == "")
        {
                // a blank link is useless
        }
        else
        {
                hyperlink = url.getURL();
                string path = url.getPath();

                unsigned cutoff = hyperlink.rfind(path);
                hyperlink.erase(cutoff);

                unsigned dir = path.rfind('/') + 1;
                path.erase(dir);

                while (link.find("../") == 0)
                {
                        unsigned dot = path.rfind('/') - 1;
                        unsigned up = path.rfind('/', dot) + 1;

                        path.erase(up);
                        link.erase(0, 3);
                }
                while (link.find("./") == 0)
                {
                        link.erase(0, 2);
                }

                hyperlink += path + link;
        }

        return hyperlink;
}
Revision:	15
Committed:	2002-12-09T09:46:18-08:00 (22 years, 6 months ago) by douglas
File size:	11976 byte(s)
Log Message:	Figured out and fixed fred problems.
#	User	Rev	Content
1	douglas	1	/* ============================================================================
2			* Douglas Thrift's Search Engine License
3			*
4			* Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5			* Redistribution and use in source and binary forms, with or without
6			* modification, are permitted provided that the following conditions are met:
7			*
8			* 1. Redistributions of source code must retain the above copyright notice,
9			* this list of conditions and the following disclaimer.
10			*
11			* 2. Redistributions in binary form must reproduce the above copyright notice,
12			* this list of conditions and the following disclaimer in the documentation
13			* and/or other materials provided with the distribution.
14			*
15			* 3. The end-user documentation included with the redistribution, if any, must
16			* include the following acknowledgment:
17			*
18			* "This product includes software developed by Douglas Thrift
19			* (http://computers.douglasthrift.net/searchengine/)."
20			*
21			* Alternately, this acknowledgment may appear in the software itself, if
22			* and wherever such third-party acknowledgments normally appear.
23			*
24			* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25			* be used to endorse or promote products derived from this software without
26			* specific prior written permission. For written permission, please visit
27			* http://www.douglasthrift.net/contact.cgi for contact information.
28			*
29			* 5. Products derived from this software may not be called "Douglas Thrift's
30			* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31			* name, without prior written permission.
32			*
33			* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34			* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35			* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36			* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37			* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38			* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39			* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40			* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41			* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42			* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43			* ============================================================================
44			*/
45			// Douglas Thrift's Search Engine Processor
46			//
47			// Douglas Thrift
48			//
49			// Processor.cpp
50
51			#include "Processor.h"
52
53			Processor::Processor()
54			{
55			page = new Page();
56			}
57
58			Processor::~Processor()
59			{
60			delete page;
61			}
62
63			bool Processor::process(HttpHandler& http, URL& url)
64			{
65			string title, description, text;
66			vector<string> headings;
67
68			if (html(http))
69			{
70			if (!process(http, url, title, description, text, headings)) return
71			false;
72
73			entities(title, " ", ' ');
74			entities(title, "<", '<');
75			entities(title, ">", '>');
76			entities(title, """, '\"');
77			entities(title, "&", '&');
78
79			entities(description, " ", ' ');
80			entities(description, "<", '<');
81			entities(description, ">", '>');
82			entities(description, """, '\"');
83			entities(description, "&", '&');
84
85			entities(text, " ", ' ');
86			entities(text, "<", '<');
87			entities(text, ">", '>');
88			entities(text, """, '\"');
89			entities(text, "&", '&');
90
91			for (int index = 0; index < headings.size(); index++)
92			{
93			entities(headings[index], " ", ' ');
94			entities(headings[index], "<", '<');
95			entities(headings[index], ">", '>');
96			entities(headings[index], """, '\"');
97			entities(headings[index], "&", '&');
98			}
99
100			normalize(title);
101			normalize(description);
102			normalize(text);
103			for (int index0 = 0; index0 < headings.size(); index0++)
104			{
105			normalize(headings[index0]);
106			}
107			}
108			else
109			{
110			bool knowSize = page->getSize() > 0;
111
112			string line;
113			while (http.good())
114			{
115			http.getline(line);
116
117			text += line + "\n";
118
119			if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
120			}
121
122			normalize(text);
123			}
124
125			page->setURL(url);
126			page->setTitle(title);
127			page->setDescription(description);
128			page->setText(text);
129			page->setHeadings(headings);
130
131			return true;
132			}
133
134			void Processor::reset()
135			{
136			links.clear();
137			delete page;
138			page = new Page();
139			}
140
141			bool Processor::process(HttpHandler& http, URL& url, string& title, string&
142			description, string& text, vector<string>& headings)
143			{
144			bool inHtml = false, inHead = false, inTitle = false, inBody = false,
145			inHeading = false, inComment = false, knowSize = page->getSize() > 0,
146			follow = true, answer = true;
147			unsigned startComment = 0, finishComment = 0;
148			string line;
149			while (http.good())
150			{
151			http.getline(line);
152			string heading;
153
154			unsigned begin = 0;
155			while (begin < line.length())
156			{
157			unsigned open = line.find('<', begin);
158			unsigned close = line.find('>', begin);
159
160			string next;
161			while (close == string::npos)
162			{
163			http.getline(next);
164			line += '\n' + next;
165			close = line.find('>', begin);
166			}
167
168			// strangely this is necessary sometimes
169			if (open == string::npos) open = line.find('<', begin);
170
171			string between = line.substr(begin, open - begin);
172			string tag = getTag(line, open, close);
173			string lowerTag(tag.length(), ' ');
174
175			for (unsigned index = 0; index < tag.length(); index++)
176			{
177			lowerTag[index] = tolower(tag[index]);
178			}
179
180			if (inHtml && !inComment)
181			{
182			if (inHead && inTitle)
183			{
184			title += between + "\n";
185			}
186
187			if (inBody)
188			{
189			text += between + "\n";
190			}
191
192			if (inBody && inHeading)
193			{
194			heading += between + "\n";
195			}
196			if (((lowerTag.find("meta ") == 0) \|\| (lowerTag.find("meta\n")
197			== 0) \|\| (lowerTag.find("meta ") == 0)) && inHead)
198			{
199			if (lowerTag.find("name=robots") != string::npos \|\|
200			lowerTag.find("name=\"robots\"") != string::npos)
201			{
202			unsigned start = lowerTag.find("content=\"") + 9;
203			unsigned finish = lowerTag.find('\"', start);
204
205			string robots = lowerTag.substr(start, finish - start);
206
207			if ((robots.find("noindex") != string::npos &&
208			robots.find("nofollow") != string::npos) \|\|
209			robots.find("none") != string::npos)
210			{
211			answer = false;
212			follow = false;
213			links.clear();
214
215			return answer;
216			}
217			else if (robots.find("noindex") != string::npos)
218			{
219			answer = false;
220			}
221			else if (robots.find("nofollow") != string::npos)
222			{
223			follow = false;
224			links.clear();
225			}
226			}
227			else if (lowerTag.find("name=description") != string::npos
228			\|\| lowerTag.find("name=\"description\"") !=
229			string::npos)
230			{
231			unsigned start = lowerTag.find("content=\"") + 9;
232			unsigned finish = lowerTag.find('\"', start);
233
234			description = tag.substr(start, finish - start);
235			}
236			}
237
238			if (((lowerTag.find("a ") == 0) \|\| (lowerTag.find("a\n") == 0)
239			\|\| (lowerTag.find("a ") == 0)) && inBody && follow)
240			{
241			if (lowerTag.find("href=\"") != string::npos)
242			{
243			unsigned start = lowerTag.find("href=\"") + 6;
244			unsigned finish = lowerTag.find('\"', start);
245
246	douglas	15	string link = getLink(tag.substr(start, finish -
247			start), url);
248	douglas	1
249			if (link != "bad link") links.insert(link);
250			}
251			else if (lowerTag.find("href=") != string::npos)
252			{
253			unsigned start = lowerTag.find("href=") + 5;
254			unsigned finish = lowerTag.find(' ', start);
255
256			if (finish < close)
257			{
258	douglas	15	string link = getLink(tag.substr(start, finish -
259			start), url);
260	douglas	1
261			if (link != "bad link") links.insert(link);
262			}
263			else
264			{
265	douglas	15	string link = getLink(tag.substr(start, close -
266			start), url);
267	douglas	1
268			if (link != "bad link") links.insert(link);
269			}
270			}
271			}
272
273			if ((lowerTag.find("img ") == 0) \|\| (lowerTag.find("img\n") ==
274			0) \|\| (lowerTag.find("img ")) && inBody)
275			{
276			if (lowerTag.find("alt=\"") != string::npos)
277			{
278			unsigned start = lowerTag.find("alt=\"") + 5;
279			unsigned finish = lowerTag.find('\"', start);
280
281			text += tag.substr(start, finish - start) + ' ';
282			if (inHeading) heading += tag.substr(start, finish -
283			start) + ' ';
284			}
285			else if (lowerTag.find("alt=") != string::npos)
286			{
287			unsigned start = lowerTag.find("alt=") + 4;
288			unsigned finish = lowerTag.find(' ', start);
289
290			if (finish < close)
291			{
292			text += tag.substr(start, finish - start) + ' ';
293			if (inHeading) heading += tag.substr(start, finish
294			- start) + ' ';
295			}
296			else
297			{
298			text += tag.substr(start, close - start) + ' ';
299			if (inHeading) heading += tag.substr(start, close -
300			start) + ' ';
301			}
302			}
303			}
304			}
305
306			if (lowerTag.find("html") == 0) inHtml = true;
307			if (lowerTag.find("/html") == 0) inHtml = false;
308
309			if (lowerTag.find("head") == 0) inHead = true;
310			if (lowerTag.find("/head") == 0) inHead = false;
311
312			if (lowerTag.find("title") == 0) inTitle = true;
313			if (lowerTag.find("/title") == 0) inTitle = false;
314
315			if (lowerTag.find("body") == 0 \|\| lowerTag.find("noframes") == 0)
316			inBody = true;
317			if (lowerTag.find("/body") == 0 \|\| lowerTag.find("/noframes") == 0)
318			inBody = false;
319
320			if (lowerTag.find("h1") == 0 \|\| lowerTag.find("h2") == 0 \|\|
321			lowerTag.find("h3") == 0 \|\| lowerTag.find("h4") == 0 \|\|
322			lowerTag.find("h5") == 0 \|\| lowerTag.find("h6") == 0)
323			inHeading = true;
324			if (lowerTag.find("/h1") == 0 \|\| lowerTag.find("/h2") == 0 \|\|
325			lowerTag.find("/h3") == 0 \|\| lowerTag.find("/h4") == 0 \|\|
326			lowerTag.find("/h5") == 0 \|\| lowerTag.find("/h6") == 0)
327			{
328			if (heading != "") headings.push_back(heading);
329			inHeading = false;
330			}
331
332			if (lowerTag.find("!--") == 0)
333			{
334			startComment = open;
335			inComment = true;
336			}
337			if (line.find("-->", begin) >= startComment && line.find("-->",
338			begin) != string::npos)
339			{
340			finishComment = line.find("-->", begin) + 3;
341			inComment = false;
342			}
343
344			if (close == string::npos)
345			{
346			begin = close;
347			}
348			else
349			{
350			begin = close + 1;
351			}
352			}
353
354			startComment = 0;
355			finishComment = 0;
356
357			if (!knowSize) page->setSize(page->getSize() + line.length() + 1);
358			}
359
360			return answer;
361			}
362
363			bool Processor::html(HttpHandler& http)
364			{
365			bool answer = false;
366
367			string line;
368			http.getline(line);
369
370			while (http.good())
371			{
372			string field;
373			http.getline(field, ' ');
374			if (field == "") break;
375			http.getline(line);
376
377			if (field == "Content-Type:" \|\| field == "Content-type:")
378			{
379			if (line.find("text/html") != string::npos)
380			{
381			answer = true;
382			}
383			}
384
385			if (field == "Content-Length:" \|\| field == "Content-length:")
386			{
387			page->setSize(strtoul(line.c_str(), 0, 0));
388			}
389			}
390
391			return answer;
392			}
393
394	douglas	15	string Processor::getTag(const string& line, unsigned open, unsigned close)
395	douglas	1	{
396			string tag = line.substr(open + 1, close - open - 1);
397
398			return tag;
399			}
400
401	douglas	15	string Processor::getLink(string link, URL& url)
402	douglas	1	{
403			string hyperlink = "bad link";
404
405			if (link.find('#') != string::npos)
406			{
407			unsigned pound = link.find('#');
408			link.erase(pound);
409			}
410
411			if (link.find("://") != string::npos)
412			{
413			if (link.find("http://") == 0) hyperlink = link;
414			}
415			else if (link.find("mailto:") == 0)
416			{
417			// do nothing we are not evil spammers!
418			}
419			else if (link.find("//") == 0)
420			{
421			hyperlink = "http:" + link;
422			}
423			else if (link.find('/') == 0)
424			{
425			hyperlink = url.getURL();
426
427			unsigned path = hyperlink.find('/', 7);
428			hyperlink.erase(path);
429
430			hyperlink += link;
431			}
432			else if (link == "")
433			{
434			// a blank link is useless
435			}
436			else
437			{
438			hyperlink = url.getURL();
439			string path = url.getPath();
440
441			unsigned cutoff = hyperlink.rfind(path);
442			hyperlink.erase(cutoff);
443
444			unsigned dir = path.rfind('/') + 1;
445			path.erase(dir);
446
447			while (link.find("../") == 0)
448			{
449			unsigned dot = path.rfind('/') - 1;
450			unsigned up = path.rfind('/', dot) + 1;
451
452			path.erase(up);
453			link.erase(0, 3);
454			}
455			while (link.find("./") == 0)
456			{
457			link.erase(0, 2);
458			}
459
460			hyperlink += path + link;
461			}
462
463			return hyperlink;
464			}