1 |
|
/* ============================================================================ |
2 |
|
* Douglas Thrift's Search Engine License |
3 |
|
* |
4 |
< |
* Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. |
4 |
> |
* Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved. |
5 |
|
* Redistribution and use in source and binary forms, with or without |
6 |
|
* modification, are permitted provided that the following conditions are met: |
7 |
|
* |
46 |
|
// |
47 |
|
// Douglas Thrift |
48 |
|
// |
49 |
< |
// $Id: Indexer.cpp,v 1.12 2003/07/15 01:30:46 douglas Exp $ |
49 |
> |
// $Id: Indexer.cpp,v 1.16 2004/01/01 23:00:34 douglas Exp $ |
50 |
|
|
51 |
|
#include "Indexer.h" |
52 |
|
|
77 |
|
ofstream fout(dtd.c_str()); |
78 |
|
|
79 |
|
fout << "<!ELEMENT index (page*)>\n" |
80 |
< |
<< "<!ELEMENT page (address, port?, path, title?, description?, ke" |
81 |
< |
<< "ywords?, text,\n" |
82 |
< |
<< " heading*)\n" |
83 |
< |
<< ">\n" |
80 |
> |
<< "<!ELEMENT page (address, port?, tls?, path, title?, descriptio" |
81 |
> |
<< "n?, keywords?, text, heading*)>\n" |
82 |
|
<< "<!ELEMENT address (#PCDATA)>\n" |
83 |
|
<< "<!ELEMENT port (#PCDATA)>\n" |
84 |
+ |
<< "<!ELEMENT tls (#PCDATA)>\n" |
85 |
|
<< "<!ELEMENT path (#PCDATA)>\n" |
86 |
|
<< "<!ELEMENT size (#PCDATA)>\n" |
87 |
|
<< "<!ELEMENT title (#PCDATA)>\n" |
117 |
|
|
118 |
|
void Indexer::index(URL& url, ofstream& fout, const string referer) |
119 |
|
{ |
120 |
< |
if (domains.find(url.getAddress() += url.getPort() != 80 ? ":" + |
121 |
< |
url.getPort() : "") != domains.end() && pages.find(url.getURL()) == |
123 |
< |
pages.end()) |
120 |
> |
if (domains.find(url.getAddress()) != domains.end() && |
121 |
> |
pages.find(url.getURL()) == pages.end()) |
122 |
|
{ |
123 |
< |
if (checked.find(url.getAddress() += url.getPort() != 80 ? ":" + |
124 |
< |
url.getPort() : "") == checked.end()) |
123 |
> |
if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" + |
124 |
> |
url.getPort() : string(""))) == checked.end()) |
125 |
|
{ |
126 |
|
robots(url); |
127 |
|
} |
136 |
|
http.clear(); |
137 |
|
if (!http.handle(url, referer)) exit(1); |
138 |
|
|
139 |
< |
cout << "Indexing " << url << "..." << flush; |
139 |
> |
cout << "Indexing " << url << " ... " << flush; |
140 |
|
|
141 |
|
if (processor.process(http, url)) |
142 |
|
{ |
226 |
|
|
227 |
|
if (http.handle(robots)) |
228 |
|
{ |
229 |
< |
cout << "Checking " << robots << "..." << flush; |
229 |
> |
cout << "Checking " << robots << " ... " << flush; |
230 |
|
|
231 |
|
string line; |
232 |
|
|