46 |
|
// |
47 |
|
// Douglas Thrift |
48 |
|
// |
49 |
< |
// $Id: Processor.cpp,v 1.10 2004/02/21 02:58:09 douglas Exp $ |
49 |
> |
// $Id: Processor.cpp,v 1.11 2004/03/09 06:21:46 douglas Exp $ |
50 |
|
|
51 |
|
#include "Processor.h" |
52 |
|
|
75 |
|
entities(title, "&gt;", '>'); |
76 |
|
entities(title, "&quot;", '\"'); |
77 |
|
entities(title, "&amp;", '&'); |
78 |
– |
|
78 |
|
entities(description, "&nbsp;", ' '); |
79 |
|
entities(description, "&lt;", '<'); |
80 |
|
entities(description, "&gt;", '>'); |
81 |
|
entities(description, "&quot;", '\"'); |
82 |
|
entities(description, "&amp;", '&'); |
84 |
– |
|
83 |
|
entities(text, "&nbsp;", ' '); |
84 |
|
entities(text, "&lt;", '<'); |
85 |
|
entities(text, "&gt;", '>'); |
98 |
|
normalize(title); |
99 |
|
normalize(description); |
100 |
|
normalize(text); |
101 |
+ |
|
102 |
|
for (int index0 = 0; index0 < headings.size(); index0++) |
103 |
|
{ |
104 |
|
normalize(headings[index0]); |
107 |
|
else |
108 |
|
{ |
109 |
|
string line; |
110 |
+ |
|
111 |
|
while (http.good()) |
112 |
|
{ |
113 |
|
http.getline(line); |
131 |
|
void Processor::reset() |
132 |
|
{ |
133 |
|
links.clear(); |
134 |
+ |
|
135 |
|
delete page; |
136 |
+ |
|
137 |
|
page = new Page(); |
138 |
|
} |
139 |
|
|
142 |
|
{ |
143 |
|
bool inHtml = false, inHead = false, inTitle = false, inBody = false, |
144 |
|
inHeading = false, inComment = false, follow = true, answer = true; |
143 |
– |
unsigned startComment = 0, finishComment = 0; |
145 |
|
string line, heading; |
146 |
|
|
147 |
|
while (http.good()) |
148 |
|
{ |
149 |
|
http.getline(line); |
150 |
|
|
151 |
< |
unsigned begin = 0; |
151 |
> |
unsigned begin = 0, startComment = 0; |
152 |
> |
|
153 |
|
while (begin < line.length()) |
154 |
|
{ |
155 |
|
unsigned open = line.find('<', begin); |
156 |
|
unsigned close = line.find('>', begin); |
155 |
– |
|
157 |
|
string next; |
158 |
+ |
|
159 |
|
while (close == string::npos && http.good()) |
160 |
|
{ |
161 |
|
http.getline(next); |
191 |
|
{ |
192 |
|
heading += between + "\n"; |
193 |
|
} |
194 |
+ |
|
195 |
|
if (((lowerTag.find("meta ") == 0) || (lowerTag.find("meta\n") |
196 |
|
== 0) || (lowerTag.find("meta ") == 0)) && inHead) |
197 |
|
{ |
304 |
|
|
305 |
|
if (lowerTag.find("html") == 0) inHtml = true; |
306 |
|
if (lowerTag.find("/html") == 0) inHtml = false; |
304 |
– |
|
307 |
|
if (lowerTag.find("head") == 0) inHead = true; |
308 |
|
if (lowerTag.find("/head") == 0) inHead = false; |
307 |
– |
|
309 |
|
if (lowerTag.find("title") == 0) inTitle = true; |
310 |
|
if (lowerTag.find("/title") == 0) inTitle = false; |
310 |
– |
|
311 |
|
if (lowerTag.find("body") == 0 || lowerTag.find("noframes") == 0) |
312 |
|
inBody = true; |
313 |
|
if (lowerTag.find("/body") == 0 || lowerTag.find("/noframes") == 0) |
320 |
|
heading = ""; |
321 |
|
inHeading = true; |
322 |
|
} |
323 |
+ |
|
324 |
|
if (lowerTag.find("/h1") == 0 || lowerTag.find("/h2") == 0 || |
325 |
|
lowerTag.find("/h3") == 0 || lowerTag.find("/h4") == 0 || |
326 |
|
lowerTag.find("/h5") == 0 || lowerTag.find("/h6") == 0) |
327 |
|
{ |
328 |
|
if (heading != "") headings.push_back(heading); |
329 |
+ |
|
330 |
|
inHeading = false; |
331 |
|
} |
332 |
|
|
335 |
|
startComment = open; |
336 |
|
inComment = true; |
337 |
|
} |
338 |
+ |
|
339 |
|
if (line.find("-->", begin) >= startComment && line.find("-->", |
340 |
|
begin) != string::npos) |
341 |
|
{ |
342 |
< |
finishComment = line.find("-->", begin) + 3; |
342 |
> |
close = line.find("-->", begin) + 3; |
343 |
|
inComment = false; |
344 |
|
} |
345 |
|
|
352 |
|
begin = close + 1; |
353 |
|
} |
354 |
|
} |
352 |
– |
|
353 |
– |
startComment = 0; |
354 |
– |
finishComment = 0; |
355 |
|
} |
356 |
|
|
357 |
|
return answer; |