ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Searcher.cpp
Revision: 348
Committed: 2004-05-26T17:44:17-07:00 (21 years ago) by Douglas Thrift
File size: 7907 byte(s)
Log Message:
Lots of C++ifying!

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Searcher
46 //
47 // Douglas Thrift
48 //
49 // $Id$
50
51 #include "Searcher.hpp"
52
53 Searcher::Searcher(string& queryString) : and_(false), or_(false),
54 queryString(queryString)
55 {
56 setCommon();
57 setQuery();
58 }
59
60 void Searcher::search(vector<string> indices)
61 {
62 start = clock();
63
64 const string XMLTYPE("<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standa"
65 + string("lone=\"no\"?>")), DOCTYPE("<!DOCTYPE index SYSTEM \"index.dt"
66 + string("d\">"));
67
68 if (query.size() > 0)
69 {
70 for (size_t index(0); index < indices.size(); index++)
71 {
72 string lock(indices[index] + ".lock");
73 ifstream fin(lock.c_str());
74
75 if (fin.is_open())
76 {
77 cerr << program << ": Index file locked for indexing: "
78 << indices[index] << "\n";
79
80 fin.close();
81
82 continue;
83 }
84
85 fin.clear();
86 fin.open(indices[index].c_str());
87
88 if (!fin.is_open())
89 {
90 cerr << program << ": Could not open index file: "
91 << indices[index] << "\n";
92
93 fin.clear();
94
95 continue;
96 }
97
98 string line;
99
100 getline(fin, line);
101
102 if (line != XMLTYPE)
103 {
104 cerr << program << ": Invalid XML version declaration: "
105 << indices[index] << "\n";
106
107 fin.close();
108 fin.clear();
109
110 continue;
111 }
112
113 getline(fin, line);
114
115 if (line != DOCTYPE)
116 {
117 cerr << program << ": Invalid XML doctype: " << indices[index]
118 << "\n";
119
120 fin.close();
121 fin.clear();
122
123 continue;
124 }
125
126 getline(fin, line);
127
128 if (line != "<index>")
129 {
130 cerr << program << ": Invalid XML root element: "
131 << indices[index] << "\n";
132
133 fin.close();
134 fin.clear();
135
136 continue;
137 }
138
139 while (fin.good())
140 {
141 Page page;
142
143 fin >> page;
144
145 if (!page.empty()) search(page);
146 }
147
148 fin.close();
149 fin.clear();
150 }
151 }
152
153 finish = clock();
154 }
155
156 void Searcher::search(Page& page)
157 {
158 Ranker ranker(page);
159
160 ranker.rank(query);
161
162 if (ranker != 0)
163 {
164 ranker.setSample();
165 pages.insert(ranker);
166 }
167 }
168
169 void Searcher::setQuery()
170 {
171 string queryStringNormal(queryString);
172
173 normalize(queryStringNormal);
174
175 if (debug) cerr << "queryString = " << queryString << "\n"
176 << "queryStringNormal = " << queryStringNormal << "\n";
177
178 unsigned begin(0);
179 bool capitalOr(false), quotes(false);
180
181 while (begin < queryStringNormal.length())
182 {
183 string word;
184 unsigned beginQuote(queryStringNormal.find('\"', begin)),
185 endQuote(beginQuote != string::npos ? queryStringNormal.find('\"',
186 beginQuote + 1) : string::npos), space(queryStringNormal.find(' ',
187 begin));
188
189 if (beginQuote == begin)
190 {
191 quotes = true;
192 word = queryStringNormal.substr(beginQuote + 1, endQuote -
193 beginQuote - 1);
194 }
195 else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin,
196 8) == "intitle:")
197 {
198 quotes = true;
199 word = "intitle:" + queryStringNormal.substr(beginQuote + 1,
200 endQuote - beginQuote - 1);
201 beginQuote = begin;
202 }
203 else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin,
204 6) == "inurl:")
205 {
206 quotes = true;
207 word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote
208 - beginQuote - 1);
209 beginQuote = begin;
210 }
211 else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin,
212 7) == "intext:")
213 {
214 quotes = true;
215 word = "intext:" + queryStringNormal.substr(beginQuote + 1,
216 endQuote - beginQuote - 1);
217 beginQuote = begin;
218 }
219 else
220 {
221 quotes = false;
222 word = queryStringNormal.substr(begin, space - begin);
223 }
224
225 if (word != "")
226 {
227 string lowerWord(tolower(word));
228
229 if (debug) cerr << "word = " << word << "\n"
230 << "lowerWord = " << lowerWord << "\n";
231
232 if (word == "OR" && !quotes)
233 {
234 capitalOr = true;
235 query[query.size() - 1] += " OR";
236 }
237 else if (lowerWord == "or" && !quotes)
238 {
239 setOr(true);
240 }
241 else if (lowerWord == "and" && !quotes)
242 {
243 setAnd(true);
244 }
245 else if (common.find(lowerWord) != common.end() && !quotes)
246 {
247 commonUsed.push_back(word);
248 }
249 else
250 {
251 if (capitalOr) capitalOr = false;
252
253 if (query.size() < 10)
254 {
255 query.push_back(lowerWord);
256 }
257 else
258 {
259 setIgnore(lowerWord);
260
261 if (debug) cerr << "ignore = " << ignore << "\n";
262
263 break;
264 }
265 }
266 }
267
268 if (endQuote == string::npos && space == string::npos)
269 {
270 begin = string::npos;
271 }
272 else if (beginQuote == begin && endQuote != string::npos)
273 {
274 begin = endQuote + 1;
275 }
276 else
277 {
278 begin = space + 1;
279 }
280 }
281
282 if (debug)
283 {
284 cerr << "query = {\n";
285
286 for (unsigned index(0); index < query.size(); index++)
287 {
288 cerr << " [" << index << "] = " << query[index] << "\n";
289 }
290
291 cerr << "}\n";
292 }
293 }
294
295 void Searcher::setCommon()
296 {
297 common.insert("&");
298 common.insert("a");
299 common.insert("about");
300 common.insert("an");
301 // "and" has its own special message
302 // common.insert("and");
303 common.insert("are");
304 common.insert("as");
305 common.insert("at");
306 common.insert("be");
307 common.insert("by");
308 common.insert("com");
309 common.insert("from");
310 common.insert("how");
311 common.insert("i");
312 common.insert("in");
313 common.insert("is");
314 common.insert("it");
315 common.insert("of");
316 common.insert("on");
317 // "or" has its own special message
318 // common.insert("or");
319 common.insert("that");
320 common.insert("the");
321 common.insert("this");
322 common.insert("to");
323 common.insert("was");
324 common.insert("what");
325 common.insert("when");
326 common.insert("where");
327 common.insert("which");
328 common.insert("who");
329 common.insert("why");
330 common.insert("will");
331 common.insert("with");
332 }

Properties

Name Value
svn:eol-style native
svn:keywords Id