1 |
|
/* ============================================================================ |
2 |
|
* Douglas Thrift's Search Engine License |
3 |
|
* |
4 |
< |
* Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved. |
4 |
> |
* Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved. |
5 |
|
* Redistribution and use in source and binary forms, with or without |
6 |
|
* modification, are permitted provided that the following conditions are met: |
7 |
|
* |
136 |
|
{ |
137 |
|
http.getline(line); |
138 |
|
|
139 |
< |
unsigned begin(0), startComment(0); |
139 |
> |
size_t begin(0), startComment(0); |
140 |
|
|
141 |
|
while (begin < line.length()) |
142 |
|
{ |
143 |
< |
unsigned open(line.find('<', begin)), close(line.find('>', begin)); |
143 |
> |
size_t open(line.find('<', begin)), close(line.find('>', begin)); |
144 |
|
string next; |
145 |
|
|
146 |
|
while (close == string::npos && http.good()) |
179 |
|
== 0) || (lowerTag.find("meta ") == 0)) && inHead) |
180 |
|
{ |
181 |
|
if (lowerTag.find("name=robots") != string::npos || |
182 |
< |
lowerTag.find("name=\"robots\"") != string::npos) |
182 |
> |
lowerTag.find("name=\"robots\"") != string::npos || |
183 |
> |
lowerTag.find("name='robots'") != string::npos) |
184 |
|
{ |
185 |
< |
unsigned start(lowerTag.find("content=\"") + 9), |
185 |
> |
size_t start(lowerTag.find("content=\"") + 9), |
186 |
|
finish(lowerTag.find('\"', start)); |
187 |
|
string robots(lowerTag.substr(start, finish - start)); |
188 |
|
|
209 |
|
} |
210 |
|
} |
211 |
|
else if (lowerTag.find("name=description") != string::npos |
212 |
< |
|| lowerTag.find("name=\"description\"") != |
213 |
< |
string::npos) |
212 |
> |
|| lowerTag.find("name=\"description\"") != string::npos |
213 |
> |
|| lowerTag.find("name='description'") != string::npos) |
214 |
|
{ |
215 |
< |
unsigned start(lowerTag.find("content=\"") + 9), |
215 |
> |
size_t start(lowerTag.find("content=\"") + 9), |
216 |
|
finish(lowerTag.find('\"', start)); |
217 |
|
|
218 |
|
description = tag.substr(start, finish - start); |
224 |
|
{ |
225 |
|
if (lowerTag.find("href=\"") != string::npos) |
226 |
|
{ |
227 |
< |
unsigned start(lowerTag.find("href=\"") + 6), |
227 |
> |
size_t start(lowerTag.find("href=\"") + 6), |
228 |
|
finish(lowerTag.find('\"', start)); |
229 |
+ |
string link(getLink(tag.substr(start, finish - |
230 |
+ |
start), url)); |
231 |
|
|
232 |
< |
string link = getLink(tag.substr(start, finish - |
233 |
< |
start), url); |
232 |
> |
if (!link.empty()) links.insert(link); |
233 |
> |
} |
234 |
> |
else if (lowerTag.find("href='") != string::npos) |
235 |
> |
{ |
236 |
> |
size_t start(lowerTag.find("href='") + 6), |
237 |
> |
finish(lowerTag.find('\'', start)); |
238 |
> |
string link(getLink(tag.substr(start, finish - |
239 |
> |
start), url)); |
240 |
|
|
241 |
< |
if (link != "") links.insert(link); |
241 |
> |
if (!link.empty()) links.insert(link); |
242 |
|
} |
243 |
|
else if (lowerTag.find("href=") != string::npos) |
244 |
|
{ |
245 |
< |
unsigned start(lowerTag.find("href=") + 5), |
245 |
> |
size_t start(lowerTag.find("href=") + 5), |
246 |
|
finish(lowerTag.find(' ', start)); |
247 |
|
|
248 |
|
if (finish < close) |
249 |
|
{ |
250 |
< |
string link = getLink(tag.substr(start, finish - |
251 |
< |
start), url); |
250 |
> |
string link(getLink(tag.substr(start, finish - |
251 |
> |
start), url)); |
252 |
|
|
253 |
< |
if (link != "") links.insert(link); |
253 |
> |
if (!link.empty()) links.insert(link); |
254 |
|
} |
255 |
|
else |
256 |
|
{ |
257 |
< |
string link = getLink(tag.substr(start, close - |
258 |
< |
start), url); |
257 |
> |
string link(getLink(tag.substr(start, close - |
258 |
> |
start), url)); |
259 |
|
|
260 |
< |
if (link != "") links.insert(link); |
260 |
> |
if (!link.empty()) links.insert(link); |
261 |
|
} |
262 |
|
} |
263 |
|
} |
267 |
|
{ |
268 |
|
if (lowerTag.find("alt=\"") != string::npos) |
269 |
|
{ |
270 |
< |
unsigned start(lowerTag.find("alt=\"") + 5), |
270 |
> |
size_t start(lowerTag.find("alt=\"") + 5), |
271 |
|
finish(lowerTag.find('\"', start)); |
272 |
|
|
273 |
|
text += tag.substr(start, finish - start) + ' '; |
277 |
|
} |
278 |
|
else if (lowerTag.find("alt=") != string::npos) |
279 |
|
{ |
280 |
< |
unsigned start(lowerTag.find("alt=") + 4), |
280 |
> |
size_t start(lowerTag.find("alt=") + 4), |
281 |
|
finish(lowerTag.find(' ', start)); |
282 |
|
|
283 |
|
if (finish < close) |
313 |
|
lowerTag.find("h3") == 0 || lowerTag.find("h4") == 0 || |
314 |
|
lowerTag.find("h5") == 0 || lowerTag.find("h6") == 0) |
315 |
|
{ |
316 |
< |
heading = ""; |
316 |
> |
heading.erase(); |
317 |
> |
|
318 |
|
inHeading = true; |
319 |
|
} |
320 |
|
|
322 |
|
lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 || |
323 |
|
lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0) |
324 |
|
{ |
325 |
< |
if (heading != "") headings.push_back(heading); |
325 |
> |
if (!heading.empty()) headings.push_back(heading); |
326 |
|
|
327 |
|
inHeading = false; |
328 |
|
} |
336 |
|
if (line.find("-->", begin) >= startComment && line.find("-->", |
337 |
|
begin) != string::npos) |
338 |
|
{ |
339 |
< |
close = line.find("-->", begin) + 3; |
339 |
> |
close = line.find("-->", begin) + 2; |
340 |
|
inComment = false; |
341 |
|
} |
342 |
|
|
354 |
|
return answer; |
355 |
|
} |
356 |
|
|
357 |
< |
string Processor::getTag(const string& line, unsigned open, unsigned close) |
357 |
> |
// Extracts the text lying strictly between two positions of `line`.
//
// line  - the text to slice.
// open  - index of the character just before the wanted text.
// close - index of the character just after the wanted text.
//
// Returns the characters at indices open + 1 through close - 1. Both
// offsets are unsigned, so the arithmetic wraps rather than going negative;
// a length that runs past the end of `line` is clamped by substr, and substr
// throws std::out_of_range if the starting index exceeds line.size().
// NOTE(review): presumably open/close are the positions of '<' and '>' for a
// single tag -- confirm against the caller.
string Processor::getTag(const string& line, size_t open, size_t close)
{
	const size_t first(open + 1);
	const size_t length(close - open - 1);

	return line.substr(first, length);
}