ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Searcher.cpp
Revision: 37
Committed: 2003-01-16T22:24:01-08:00 (22 years, 5 months ago) by douglas
File size: 8062 byte(s)
Log Message:
Added index file locking during indexing and handling during searching.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Searcher
46 //
47 // Douglas Thrift
48 //
49 // Searcher.cpp
50
51 #include "Searcher.h"
52
53 Searcher::Searcher(string& queryString)
54 {
55 setCommon();
56 setAnd(false);
57 setOr(false);
58
59 this->queryString = queryString;
60
61 setQuery();
62 }
63
64 void Searcher::search(vector<string> indices)
65 {
66 start = clock();
67
68 const string XMLTYPE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" stan"
69 + string("dalone=\"no\"?>");
70 const string DOCTYPE = "<!DOCTYPE index SYSTEM \"index.dtd\">";
71
72 if (query.size() > 0)
73 {
74 for (int index = 0; index < indices.size(); index++)
75 {
76 string lock = indices[index] + ".lock";
77
78 ifstream fin(lock.c_str());
79
80 if (fin.is_open())
81 {
82 cerr << program << ": Index file is locked for indexing: "
83 << indices[index] << "\n";
84 fin.close();
85
86 continue;
87 }
88
89 fin.clear();
90 fin.open(indices[index].c_str());
91
92 if (!fin.is_open())
93 {
94 cerr << program << ": Could not open index file: "
95 << indices[index] << "\n";
96 fin.clear();
97
98 continue;
99 }
100
101 string line;
102
103 getline(fin, line);
104 if (line != XMLTYPE)
105 {
106 cerr << program << ": Invalid XML version declaration: "
107 << indices[index] << "\n";
108 fin.close();
109 fin.clear();
110
111 continue;
112 }
113
114 getline(fin, line);
115 if (line != DOCTYPE)
116 {
117 cerr << program << ": Invalid XML doctype: " << indices[index]
118 << "\n";
119 fin.close();
120 fin.clear();
121 continue;
122 }
123
124 getline(fin, line);
125 if (line != "<index>")
126 {
127 cerr << program << ": Invalid XML root element: "
128 << indices[index] << "\n";
129 fin.close();
130 fin.clear();
131 continue;
132 }
133
134 while (fin.good())
135 {
136 Page page;
137
138 fin >> page;
139
140 if (!page.empty()) search(page);
141 }
142
143 fin.close();
144 fin.clear();
145 }
146 }
147
148 finish = clock();
149 }
150
151 void Searcher::search(Page& page)
152 {
153 Ranker ranker(page);
154 ranker.rank(query);
155
156 if (ranker != 0)
157 {
158 ranker.setSample();
159 pages.insert(ranker);
160 }
161 }
162
163 void Searcher::setQuery()
164 {
165 string queryStringNormal = queryString;
166 normalize(queryStringNormal);
167
168 if (debug) cerr << "queryString = " << queryString << "\n"
169 << "queryStringNormal = " << queryStringNormal << "\n";
170
171 unsigned begin = 0;
172 bool capitalOr = false, quotes = false;
173 while (begin < queryStringNormal.length())
174 {
175 string word;
176
177 unsigned beginQuote = queryStringNormal.find('\"', begin);
178 unsigned endQuote = beginQuote != string::npos ?
179 queryStringNormal.find('\"', beginQuote + 1) : string::npos;
180 unsigned space = queryStringNormal.find(' ', begin);
181
182 if (beginQuote == begin)
183 {
184 quotes = true;
185 word = queryStringNormal.substr(beginQuote + 1, endQuote -
186 beginQuote - 1);
187 }
188 else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin, 8
189 ) == "intitle:")
190 {
191 quotes = true;
192 word = "intitle:" + queryStringNormal.substr(beginQuote + 1,
193 endQuote - beginQuote - 1);
194 beginQuote = begin;
195 }
196 else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin, 6
197 ) == "inurl:")
198 {
199 quotes = true;
200 word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote
201 - beginQuote - 1);
202 beginQuote = begin;
203 }
204 else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin, 7
205 ) == "intext:")
206 {
207 quotes = true;
208 word = "intext:" + queryStringNormal.substr(beginQuote + 1,
209 endQuote - beginQuote - 1);
210 beginQuote = begin;
211 }
212 else
213 {
214 quotes = false;
215 word = queryStringNormal.substr(begin, space - begin);
216 }
217
218 if (word != "")
219 {
220 string lowerWord(word.length(), ' ');
221
222 for (unsigned index = 0; index < word.length(); index++)
223 {
224 lowerWord[index] = tolower(word[index]);
225 }
226
227 if (debug) cerr << "word = " << word << "\n"
228 << "lowerWord = " << lowerWord << "\n";
229
230 if (word == "OR" && !quotes)
231 {
232 capitalOr = true;
233 query[query.size() - 1] += " OR";
234 }
235 else if (lowerWord == "or" && !quotes)
236 {
237 setOr(true);
238 }
239 else if (lowerWord == "and" && !quotes)
240 {
241 setAnd(true);
242 }
243 else if (common.find(lowerWord) != common.end() && !quotes)
244 {
245 commonUsed.push_back(word);
246 }
247 else
248 {
249 if (capitalOr) capitalOr = false;
250 if (query.size() < 10)
251 {
252 query.push_back(lowerWord);
253 }
254 else
255 {
256 setIgnore(lowerWord);
257
258 if (debug) cerr << "ignore = " << ignore << "\n";
259
260 break;
261 }
262 }
263 }
264
265 if (endQuote == string::npos && space == string::npos)
266 {
267 begin = string::npos;
268 }
269 else if (beginQuote == begin && endQuote != string::npos)
270 {
271 begin = endQuote + 1;
272 }
273 else
274 {
275 begin = space + 1;
276 }
277 }
278
279 if (debug)
280 {
281 cerr << "query = {\n";
282
283 for (unsigned index = 0; index < query.size(); index++)
284 {
285 cerr << " [" << index << "] = " << query[index] << "\n";
286 }
287
288 cerr << "}\n";
289 }
290 }
291
292 void Searcher::setCommon()
293 {
294 common.insert("&");
295 common.insert("a");
296 common.insert("about");
297 common.insert("an");
298 // "and" has its own special message
299 // common.insert("and");
300 common.insert("are");
301 common.insert("as");
302 common.insert("at");
303 common.insert("be");
304 common.insert("by");
305 common.insert("com");
306 common.insert("from");
307 common.insert("how");
308 common.insert("i");
309 common.insert("in");
310 common.insert("is");
311 common.insert("it");
312 common.insert("of");
313 common.insert("on");
314 // "or" has its own special message
315 // common.insert("or");
316 common.insert("that");
317 common.insert("the");
318 common.insert("this");
319 common.insert("to");
320 common.insert("was");
321 common.insert("what");
322 common.insert("when");
323 common.insert("where");
324 common.insert("which");
325 common.insert("who");
326 common.insert("why");
327 common.insert("will");
328 common.insert("with");
329 }