trunk/Search/URL.cpp

/* ============================================================================
 * Douglas Thrift's Search Engine License
 *
 * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any, must
 *    include the following acknowledgment:
 *
 *       "This product includes software developed by Douglas Thrift
 *       (http://computers.douglasthrift.net/searchengine/)."
 *
 *    Alternately, this acknowledgment may appear in the software itself, if
 *    and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.  For written permission, please visit
 *    http://www.douglasthrift.net/contact.cgi for contact information.
 *
 * 5. Products derived from this software may not be called "Douglas Thrift's
 *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
 *    name, without prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ============================================================================
 */
// Douglas Thrift's Search Engine URL
//
// Douglas Thrift
//
// URL.cpp

#include "URL.h"

// Builds a URL from a complete "http://..." string. All parsing, including
// the handling of a malformed string, is delegated to setURL(const string&).
URL::URL(const string& url)
{
        this->setURL(url);
}

// Builds a URL directly from its three components. The values are stored
// exactly as given; nothing is validated or normalized here.
URL::URL(const string& address, unsigned port, const string& path) :
        address(address),
        port(port),
        path(path)
{
}

// Assembles the textual form of this URL: "http://" followed by the address,
// then ":port" only when the port is not 80, then the path.
//
// Returns the assembled URL as a new string.
string URL::getURL()
{
        string url = "http://" + address;

        if (port != 80)
        {
                // Three decimal digits per byte is a safe upper bound for any
                // unsigned value, so this buffer cannot overflow. Living on the
                // stack, it also cannot leak if one of the appends below throws.
                char cport[sizeof(unsigned) * 3 + 1];

                sprintf(cport, "%u", port);

                url += ':';
                url += cport;
        }

        url += path;

        return url;
}

void URL::setURL(const URL& url)
{
        this->address = url.address;
        this->port = url.port;
        this->path = url.path;
}

void URL::setURL(const string& url)
{
        if (url.find("http://") || url.length() <= 7)
        {
                cerr << program << ": Malformed URL: " << url << "\n";
                exit(1);
        }

        int begin = 7;
        int colon = url.find(':', begin);
        int end = url.find('/', begin);

        if (colon != string::npos && colon < end)
        {
                address = url.substr(begin, colon - begin);
                port = strtoul(url.substr(colon + 1, end - colon - 1).c_str(), 0, 0);
        }
        else
        {
                address = url.substr(begin, end - begin);
                port = 80;
        }

        if (end == string::npos)
        {
                path = "/";
        }
        else
        {
                path = url.substr(end);
        }
}

// Replaces the stored address with the given one; the value is not checked.
void URL::setAddress(const string& address)
{
        this->address.assign(address);
}

// Replaces the stored port number with the given one; the value is not
// checked.
void URL::setPort(unsigned port)
{
        URL::port = port;
}

void URL::setPath(const string& path)
{
        if (path.find('/') != 0)
        {
                this->path = "/" + path;
        }
        else
        {
                this->path = path;
        }
}

// Writes the textual form of a URL, as produced by getURL(), to a stream
// and returns that stream so that insertions can be chained.
ostream& operator<<(ostream& os, URL& data)
{
        return os << data.getURL();
}

// Turns a hyperlink found on a page into an absolute "http://" URL.
//
// link  the raw link text; anything from the first '#' on is discarded
// url   the URL of the page the link was found on, used as the base for
//       links that are not already absolute
//
// Returns the absolute URL, or an empty string when the link is not to be
// followed: it carries a scheme other than http://, it is a mailto: or news:
// link, or nothing is left of it once the fragment has been removed.
string getLink(string link, URL& url)
{
        string hyperlink = "";

        // a fragment only names a place inside a document, so it is dropped
        string::size_type pound = link.find('#');

        if (pound != string::npos)
        {
                link.erase(pound);
        }

        if (link.find("://") != string::npos)
        {
                // of the links that name a scheme, only http:// is kept
                if (link.find("http://") == 0) hyperlink = link;
        }
        else if (link.find("mailto:") == 0)
        {
                // do nothing we are not evil spammers!
        }
        else if (link.find("news:") == 0)
        {
                // do nothing this isn't Google Groups
        }
        else if (link.find("//") == 0)
        {
                // scheme-relative link: only the scheme has to be supplied
                hyperlink = "http:" + link;
        }
        else if (link.find('/') == 0)
        {
                // host-relative link: keep "http://host[:port]" of the base
                hyperlink = url.getURL();

                // 7 skips over "http://"; the guard matters because erase()
                // throws when handed npos, which happens if the base has no
                // '/' after its host
                string::size_type path = hyperlink.find('/', 7);

                if (path != string::npos)
                {
                        hyperlink.erase(path);
                }

                hyperlink += link;
        }
        else if (link == "")
        {
                // a blank link is useless
        }
        else
        {
                // document-relative link: resolve it against the directory of
                // the base document
                hyperlink = url.getURL();
                // NOTE(review): getPath() is declared outside this file; it is
                // presumed to return the same path that getURL() appends
                string path = url.getPath();

                // cut the base back to "http://host[:port]"
                string::size_type cutoff = hyperlink.rfind(path);

                if (cutoff != string::npos)
                {
                        hyperlink.erase(cutoff);
                }

                // a '/' inside the base's query string must not be mistaken
                // for a directory separator
                string::size_type query = path.find('?');

                if (query != string::npos)
                {
                        path.erase(query);
                }

                // drop the document name, keeping the directory and its
                // trailing '/'
                string::size_type dir = path.rfind('/');

                path.erase(dir == string::npos ? 0 : dir + 1);

                // consume leading "./" and "../" segments in whatever order
                // they come, so that "./../x" is resolved as well
                for (;;)
                {
                        if (link.find("../") == 0)
                        {
                                // path is empty or ends in '/'; step over that
                                // final '/' and cut after the one before it.
                                // A path of just "/" is already the top and is
                                // left alone.
                                if (path.length() > 1)
                                {
                                        string::size_type up = path.rfind('/', path.length() - 2);

                                        path.erase(up == string::npos ? 0 : up + 1);
                                }

                                link.erase(0, 3);
                        }
                        else if (link.find("./") == 0)
                        {
                                link.erase(0, 2);
                        }
                        else
                        {
                                break;
                        }
                }

                hyperlink += path + link;
        }

        return hyperlink;
}
Revision:	18
Committed:	2002-12-09T21:40:12-08:00 (22 years, 6 months ago) by douglas
File size:	5182 byte(s)
Log Message:	Implemented more HttpHandler stuff. Added news: protocol to those ignored by getLink().
#	Content
1	/* ============================================================================
2	* Douglas Thrift's Search Engine License
3	*
4	* Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions are met:
7	*
8	* 1. Redistributions of source code must retain the above copyright notice,
9	* this list of conditions and the following disclaimer.
10	*
11	* 2. Redistributions in binary form must reproduce the above copyright notice,
12	* this list of conditions and the following disclaimer in the documentation
13	* and/or other materials provided with the distribution.
14	*
15	* 3. The end-user documentation included with the redistribution, if any, must
16	* include the following acknowledgment:
17	*
18	* "This product includes software developed by Douglas Thrift
19	* (http://computers.douglasthrift.net/searchengine/)."
20	*
21	* Alternately, this acknowledgment may appear in the software itself, if
22	* and wherever such third-party acknowledgments normally appear.
23	*
24	* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25	* be used to endorse or promote products derived from this software without
26	* specific prior written permission. For written permission, please visit
27	* http://www.douglasthrift.net/contact.cgi for contact information.
28	*
29	* 5. Products derived from this software may not be called "Douglas Thrift's
30	* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31	* name, without prior written permission.
32	*
33	* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35	* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39	* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42	* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43	* ============================================================================
44	*/
45	// Douglas Thrift's Search Engine URL
46	//
47	// Douglas Thrift
48	//
49	// URL.cpp
50
51	#include "URL.h"
52
53	URL::URL(const string& url)
54	{
55	setURL(url);
56	}
57
58	URL::URL(const string& address, unsigned port, const string& path)
59	{
60	this->address = address;
61	this->port = port;
62	this->path = path;
63	}
64
65	string URL::getURL()
66	{
67	string url = "http://" + address;
68
69	if (port != 80)
70	{
71	char* cport = new char[1024];
72
73	sprintf(cport, "%u", port);
74
75	url += string(":") + cport;
76
77	delete [] cport;
78	}
79
80	url += path;
81
82	return url;
83	}
84
85	void URL::setURL(const URL& url)
86	{
87	this->address = url.address;
88	this->port = url.port;
89	this->path = url.path;
90	}
91
92	void URL::setURL(const string& url)
93	{
94	if (url.find("http://") || url.length() <= 7)
95	{
96	cerr << program << ": Malformed URL: " << url << "\n";
97	exit(1);
98	}
99
100	int begin = 7;
101	int colon = url.find(':', begin);
102	int end = url.find('/', begin);
103
104	if (colon != string::npos && colon < end)
105	{
106	address = url.substr(begin, colon - begin);
107	port = strtoul(url.substr(colon + 1, end - colon - 1).c_str(), 0, 0);
108	}
109	else
110	{
111	address = url.substr(begin, end - begin);
112	port = 80;
113	}
114
115	if (end == string::npos)
116	{
117	path = "/";
118	}
119	else
120	{
121	path = url.substr(end);
122	}
123	}
124
125	void URL::setAddress(const string& address)
126	{
127	this->address = address;
128	}
129
130	void URL::setPort(unsigned port)
131	{
132	this->port = port;
133	}
134
135	void URL::setPath(const string& path)
136	{
137	if (path.find('/') != 0)
138	{
139	this->path = "/" + path;
140	}
141	else
142	{
143	this->path = path;
144	}
145	}
146
147	ostream& operator<<(ostream& os, URL& data)
148	{
149	os << data.getURL();
150
151	return os;
152	}
153
154	string getLink(string link, URL& url)
155	{
156	string hyperlink = "";
157
158	if (link.find('#') != string::npos)
159	{
160	unsigned pound = link.find('#');
161	link.erase(pound);
162	}
163
164	if (link.find("://") != string::npos)
165	{
166	if (link.find("http://") == 0) hyperlink = link;
167	}
168	else if (link.find("mailto:") == 0)
169	{
170	// do nothing we are not evil spammers!
171	}
172	else if (link.find("news:") == 0)
173	{
174	// do nothing this isn't Google Groups
175	}
176	else if (link.find("//") == 0)
177	{
178	hyperlink = "http:" + link;
179	}
180	else if (link.find('/') == 0)
181	{
182	hyperlink = url.getURL();
183
184	unsigned path = hyperlink.find('/', 7);
185	hyperlink.erase(path);
186
187	hyperlink += link;
188	}
189	else if (link == "")
190	{
191	// a blank link is useless
192	}
193	else
194	{
195	hyperlink = url.getURL();
196	string path = url.getPath();
197
198	unsigned cutoff = hyperlink.rfind(path);
199	hyperlink.erase(cutoff);
200
201	unsigned dir = path.rfind('/') + 1;
202	path.erase(dir);
203
204	while (link.find("../") == 0)
205	{
206	unsigned dot = path.rfind('/') - 1;
207	unsigned up = path.rfind('/', dot) + 1;
208
209	path.erase(up);
210	link.erase(0, 3);
211	}
212	while (link.find("./") == 0)
213	{
214	link.erase(0, 2);
215	}
216
217	hyperlink += path + link;
218	}
219
220	return hyperlink;
221	}