1 |
/* ============================================================================ |
2 |
* Douglas Thrift's Search Engine License |
3 |
* |
4 |
* Copyright (C) 2002, Douglas Thrift. All Rights Reserved. |
5 |
* Redistribution and use in source and binary forms, with or without |
6 |
* modification, are permitted provided that the following conditions are met: |
7 |
* |
8 |
* 1. Redistributions of source code must retain the above copyright notice, |
9 |
* this list of conditions and the following disclaimer. |
10 |
* |
11 |
* 2. Redistributions in binary form must reproduce the above copyright notice, |
12 |
* this list of conditions and the following disclaimer in the documentation |
13 |
* and/or other materials provided with the distribution. |
14 |
* |
15 |
* 3. The end-user documentation included with the redistribution, if any, must |
16 |
* include the following acknowledgment: |
17 |
* |
18 |
* "This product includes software developed by Douglas Thrift |
19 |
* (http://computers.douglasthrift.net/searchengine/)." |
20 |
* |
21 |
* Alternately, this acknowledgment may appear in the software itself, if |
22 |
* and wherever such third-party acknowledgments normally appear. |
23 |
* |
24 |
* 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not |
25 |
* be used to endorse or promote products derived from this software without |
26 |
* specific prior written permission. For written permission, please visit |
27 |
* http://www.douglasthrift.net/contact.cgi for contact information. |
28 |
* |
29 |
* 5. Products derived from this software may not be called "Douglas Thrift's |
30 |
* Search Engine", nor may "Douglas Thrift's Search Engine" appear in their |
31 |
* name, without prior written permission. |
32 |
* |
33 |
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
34 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
35 |
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
36 |
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
37 |
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
38 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, |
39 |
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
40 |
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
41 |
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
42 |
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
43 |
* ============================================================================ |
44 |
*/ |
45 |
// Douglas Thrift's Search Engine URL |
46 |
// |
47 |
// Douglas Thrift |
48 |
// |
49 |
// URL.cpp |
50 |
|
51 |
#include "URL.h" |
52 |
|
53 |
// Construct a URL from its textual "http://..." form. All parsing and
// validation is delegated to setURL(const string&).
URL::URL(const string& url)
{
	this->setURL(url);
}
57 |
|
58 |
// Construct a URL from already separated components. The three values are
// stored exactly as given.
URL::URL(const string& address, unsigned port, const string& path)
{
	setAddress(address);
	setPort(port);

	// Assigned directly rather than through setPath(), which would prepend a
	// '/' to a path that does not start with one.
	this->path = path;
}
64 |
|
65 |
// Build the textual form of this URL: "http://" followed by the address,
// then ":<port>" when the port is not 80, then the path.
string URL::getURL()
{
	string url = "http://" + address;

	if (port != 80)
	{
		// A fixed stack buffer replaces the former 1024-byte heap allocation:
		// it cannot leak if one of the string appends below throws, and 32
		// bytes is ample for the decimal digits of an unsigned value plus the
		// terminating NUL.
		char cport[32];

		sprintf(cport, "%u", port);

		url += ':';
		url += cport;
	}

	url += path;

	return url;
}
84 |
|
85 |
void URL::setURL(const URL& url) |
86 |
{ |
87 |
this->address = url.address; |
88 |
this->port = url.port; |
89 |
this->path = url.path; |
90 |
} |
91 |
|
92 |
void URL::setURL(const string& url) |
93 |
{ |
94 |
if (url.find("http://") || url.length() <= 7) |
95 |
{ |
96 |
cerr << program << ": Malformed URL: " << url << "\n"; |
97 |
exit(1); |
98 |
} |
99 |
|
100 |
int begin = 7; |
101 |
int colon = url.find(':', begin); |
102 |
int end = url.find('/', begin); |
103 |
|
104 |
if (colon != string::npos && colon < end) |
105 |
{ |
106 |
address = url.substr(begin, colon - begin); |
107 |
port = strtoul(url.substr(colon + 1, end - colon - 1).c_str(), 0, 0); |
108 |
} |
109 |
else |
110 |
{ |
111 |
address = url.substr(begin, end - begin); |
112 |
port = 80; |
113 |
} |
114 |
|
115 |
if (end == string::npos) |
116 |
{ |
117 |
path = "/"; |
118 |
} |
119 |
else |
120 |
{ |
121 |
path = url.substr(end); |
122 |
} |
123 |
} |
124 |
|
125 |
// Replace the address component of this URL with the given string.
void URL::setAddress(const string& address)
{
	// Qualified to select the member; the parameter shadows it.
	URL::address = address;
}
129 |
|
130 |
// Replace the port component of this URL with the given number.
void URL::setPort(unsigned port)
{
	// Qualified to select the member; the parameter shadows it.
	URL::port = port;
}
134 |
|
135 |
void URL::setPath(const string& path) |
136 |
{ |
137 |
if (path.find('/') != 0) |
138 |
{ |
139 |
this->path = "/" + path; |
140 |
} |
141 |
else |
142 |
{ |
143 |
this->path = path; |
144 |
} |
145 |
} |
146 |
|
147 |
// Stream insertion for URL: writes the string produced by getURL() to the
// stream and returns the stream so that insertions can be chained.
ostream& operator<<(ostream& os, URL& data)
{
	const string text = data.getURL();

	return os << text;
}
153 |
|
154 |
// Turn a link found in the page at `url` into an absolute http URL.
//
// Any "#fragment" suffix is removed first. The result is the empty string
// for links that are not to be followed: blank links, mailto: and news:
// links, and links containing "://" that do not start with "http://".
// "//host/..." links get "http:" put in front, "/path" links are joined to
// the scheme, host and port of `url`, and everything else is treated as
// relative to the directory of `url`'s path, with leading "./" and "../"
// segments resolved.
string getLink(string link, URL& url)
{
	string hyperlink = "";

	// Drop the fragment; it does not name a different document.
	const string::size_type pound = link.find('#');

	if (pound != string::npos)
	{
		link.erase(pound);
	}

	// Prefix tests use compare() so that only the first few characters are
	// examined instead of searching the whole link.
	if (link.find("://") != string::npos)
	{
		if (link.compare(0, 7, "http://") == 0) hyperlink = link;
	}
	else if (link.compare(0, 7, "mailto:") == 0)
	{
		// do nothing we are not evil spammers!
	}
	else if (link.compare(0, 5, "news:") == 0)
	{
		// do nothing this isn't Google Groups
	}
	else if (link.compare(0, 2, "//") == 0)
	{
		hyperlink = "http:" + link;
	}
	else if (link.compare(0, 1, "/") == 0)
	{
		hyperlink = url.getURL();

		// Keep only "http://host[:port]". When the base URL has no path
		// there is no '/' after the scheme and nothing needs removing;
		// erase(npos) would throw.
		const string::size_type root = hyperlink.find('/', 7);

		if (root != string::npos)
		{
			hyperlink.erase(root);
		}

		hyperlink += link;
	}
	else if (link == "")
	{
		// a blank link is useless
	}
	else
	{
		hyperlink = url.getURL();
		string path = url.getPath();

		// Remove the path from the end of the base URL, leaving
		// "http://host[:port]".
		const string::size_type cutoff = hyperlink.rfind(path);

		if (cutoff != string::npos)
		{
			hyperlink.erase(cutoff);
		}

		// Reduce the path to its directory: everything up to and including
		// the last '/', or nothing at all when it contains no '/'.
		const string::size_type dir = path.rfind('/');
		path.erase(dir == string::npos ? 0 : dir + 1);

		// Resolve leading "./" and "../" segments in whatever order they
		// appear, so that "./../x" is handled the same way as "../x".
		for (;;)
		{
			if (link.compare(0, 3, "../") == 0)
			{
				// Remove the last directory component; "/" and the empty
				// path are already at the top and stay as they are.
				if (path.length() > 1)
				{
					const string::size_type up = path.rfind('/', path.length() - 2);
					path.erase(up == string::npos ? 0 : up + 1);
				}

				link.erase(0, 3);
			}
			else if (link.compare(0, 2, "./") == 0)
			{
				link.erase(0, 2);
			}
			else
			{
				break;
			}
		}

		hyperlink += path + link;
	}

	return hyperlink;
}