1 |
|
/* ============================================================================ |
2 |
|
* Douglas Thrift's Search Engine License |
3 |
|
* |
4 |
< |
* Copyright (C) 2002, Douglas Thrift. All Rights Reserved. |
4 |
> |
* Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved. |
5 |
|
* Redistribution and use in source and binary forms, with or without |
6 |
|
* modification, are permitted provided that the following conditions are met: |
7 |
|
* |
46 |
|
// |
47 |
|
// Douglas Thrift |
48 |
|
// |
49 |
< |
// Indexer.cpp |
49 |
> |
// $Id: Indexer.cpp,v 1.12 2003/07/15 01:30:46 douglas Exp $ |
50 |
|
|
51 |
|
#include "Indexer.h" |
52 |
|
|
53 |
< |
Indexer::Indexer(string& indexFile, set<string>& domains, |
54 |
< |
set<string>& restrictions) |
53 |
> |
#ifndef _WIN32 |
54 |
> |
#include <unistd.h> |
55 |
> |
#else // _WIN32 |
56 |
> |
// Windows has no POSIX unlink(); emulate it with the Win32 DeleteFile call.
// DeleteFile reports success with a nonzero value, whereas unlink() reports
// success with 0 and failure with -1, so the result is translated here
// instead of being passed straight through (which inverted the status).
inline int unlink(const char* filename)
{
	return DeleteFile(filename) ? 0 : -1;
}
57 |
> |
#endif // _WIN32 |
58 |
> |
|
59 |
> |
Indexer::Indexer(string& indexFile, set<string>& domains, set<string>& |
60 |
> |
restrictions) |
61 |
|
{ |
62 |
|
this->indexFile = indexFile; |
63 |
|
this->domains = domains; |
66 |
|
|
67 |
|
void Indexer::index(string& begin) |
68 |
|
{ |
69 |
< |
ofstream fout(indexFile.c_str()); |
69 |
> |
unsigned separator = indexFile.rfind(slash); |
70 |
> |
string dtd = separator != string::npos ? indexFile.substr(0, separator) + |
71 |
> |
slash + "index.dtd" : "index.dtd"; |
72 |
> |
|
73 |
> |
ifstream fin(dtd.c_str()); |
74 |
> |
|
75 |
> |
if (!fin.is_open()) |
76 |
> |
{ |
77 |
> |
ofstream fout(dtd.c_str()); |
78 |
> |
|
79 |
> |
fout << "<!ELEMENT index (page*)>\n" |
80 |
> |
<< "<!ELEMENT page (address, port?, path, title?, description?, ke" |
81 |
> |
<< "ywords?, text,\n" |
82 |
> |
<< " heading*)\n" |
83 |
> |
<< ">\n" |
84 |
> |
<< "<!ELEMENT address (#PCDATA)>\n" |
85 |
> |
<< "<!ELEMENT port (#PCDATA)>\n" |
86 |
> |
<< "<!ELEMENT path (#PCDATA)>\n" |
87 |
> |
<< "<!ELEMENT size (#PCDATA)>\n" |
88 |
> |
<< "<!ELEMENT title (#PCDATA)>\n" |
89 |
> |
<< "<!ELEMENT description (#PCDATA)>\n" |
90 |
> |
<< "<!ELEMENT text (#PCDATA)>\n" |
91 |
> |
<< "<!ELEMENT heading (#PCDATA)>\n"; |
92 |
> |
|
93 |
> |
fout.close(); |
94 |
> |
} |
95 |
> |
|
96 |
> |
fin.close(); |
97 |
> |
|
98 |
> |
string lock = indexFile + ".lock"; |
99 |
> |
|
100 |
> |
ofstream fout(lock.c_str()); |
101 |
> |
fout.close(); |
102 |
> |
fout.open(indexFile.c_str()); |
103 |
|
|
104 |
|
fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>" |
105 |
|
<< "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n" |
112 |
|
fout << "</index>\n"; |
113 |
|
|
114 |
|
fout.close(); |
115 |
+ |
|
116 |
+ |
unlink(lock.c_str()); |
117 |
|
} |
118 |
|
|
119 |
|
void Indexer::index(URL& url, ofstream& fout, const string referer) |
137 |
|
{ |
138 |
|
http.clear(); |
139 |
|
if (!http.handle(url, referer)) exit(1); |
140 |
< |
|
140 |
> |
|
141 |
|
cout << "Indexing " << url << "..." << flush; |
142 |
|
|
143 |
|
if (processor.process(http, url)) |