ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Searcher.cpp
Revision: 312
Committed: 2004-01-01T15:00:34-08:00 (21 years, 5 months ago) by douglas
File size: 7987 byte(s)
Log Message:
Updated copyright years.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Searcher
46 //
47 // Douglas Thrift
48 //
49 // $Id: Searcher.cpp,v 1.7 2004/01/01 23:00:34 douglas Exp $
50
51 #include "Searcher.h"
52
53 Searcher::Searcher(string& queryString)
54 {
55 setCommon();
56 setAnd(false);
57 setOr(false);
58
59 this->queryString = queryString;
60
61 setQuery();
62 }
63
64 void Searcher::search(vector<string> indices)
65 {
66 start = clock();
67
68 const string XMLTYPE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" stan"
69 + string("dalone=\"no\"?>");
70 const string DOCTYPE = "<!DOCTYPE index SYSTEM \"index.dtd\">";
71
72 if (query.size() > 0)
73 {
74 for (int index = 0; index < indices.size(); index++)
75 {
76 string lock = indices[index] + ".lock";
77
78 ifstream fin(lock.c_str());
79
80 if (fin.is_open())
81 {
82 cerr << program << ": Index file locked for indexing: "
83 << indices[index] << "\n";
84 fin.close();
85
86 continue;
87 }
88
89 fin.clear();
90 fin.open(indices[index].c_str());
91
92 if (!fin.is_open())
93 {
94 cerr << program << ": Could not open index file: "
95 << indices[index] << "\n";
96 fin.clear();
97
98 continue;
99 }
100
101 string line;
102
103 getline(fin, line);
104 if (line != XMLTYPE)
105 {
106 cerr << program << ": Invalid XML version declaration: "
107 << indices[index] << "\n";
108 fin.close();
109 fin.clear();
110
111 continue;
112 }
113
114 getline(fin, line);
115 if (line != DOCTYPE)
116 {
117 cerr << program << ": Invalid XML doctype: " << indices[index]
118 << "\n";
119 fin.close();
120 fin.clear();
121 continue;
122 }
123
124 getline(fin, line);
125 if (line != "<index>")
126 {
127 cerr << program << ": Invalid XML root element: "
128 << indices[index] << "\n";
129 fin.close();
130 fin.clear();
131 continue;
132 }
133
134 while (fin.good())
135 {
136 Page page;
137
138 fin >> page;
139
140 if (!page.empty()) search(page);
141 }
142
143 fin.close();
144 fin.clear();
145 }
146 }
147
148 finish = clock();
149 }
150
151 void Searcher::search(Page& page)
152 {
153 Ranker ranker(page);
154 ranker.rank(query);
155
156 if (ranker != 0)
157 {
158 ranker.setSample();
159 pages.insert(ranker);
160 }
161 }
162
163 void Searcher::setQuery()
164 {
165 string queryStringNormal = queryString;
166 normalize(queryStringNormal);
167
168 if (debug) cerr << "queryString = " << queryString << "\n"
169 << "queryStringNormal = " << queryStringNormal << "\n";
170
171 unsigned begin = 0;
172 bool capitalOr = false, quotes = false;
173 while (begin < queryStringNormal.length())
174 {
175 string word;
176
177 unsigned beginQuote = queryStringNormal.find('\"', begin);
178 unsigned endQuote = beginQuote != string::npos ?
179 queryStringNormal.find('\"', beginQuote + 1) : string::npos;
180 unsigned space = queryStringNormal.find(' ', begin);
181
182 if (beginQuote == begin)
183 {
184 quotes = true;
185 word = queryStringNormal.substr(beginQuote + 1, endQuote -
186 beginQuote - 1);
187 }
188 else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin, 8
189 ) == "intitle:")
190 {
191 quotes = true;
192 word = "intitle:" + queryStringNormal.substr(beginQuote + 1,
193 endQuote - beginQuote - 1);
194 beginQuote = begin;
195 }
196 else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin, 6
197 ) == "inurl:")
198 {
199 quotes = true;
200 word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote
201 - beginQuote - 1);
202 beginQuote = begin;
203 }
204 else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin, 7
205 ) == "intext:")
206 {
207 quotes = true;
208 word = "intext:" + queryStringNormal.substr(beginQuote + 1,
209 endQuote - beginQuote - 1);
210 beginQuote = begin;
211 }
212 else
213 {
214 quotes = false;
215 word = queryStringNormal.substr(begin, space - begin);
216 }
217
218 if (word != "")
219 {
220 string lowerWord = tolower(word);
221
222 if (debug) cerr << "word = " << word << "\n"
223 << "lowerWord = " << lowerWord << "\n";
224
225 if (word == "OR" && !quotes)
226 {
227 capitalOr = true;
228 query[query.size() - 1] += " OR";
229 }
230 else if (lowerWord == "or" && !quotes)
231 {
232 setOr(true);
233 }
234 else if (lowerWord == "and" && !quotes)
235 {
236 setAnd(true);
237 }
238 else if (common.find(lowerWord) != common.end() && !quotes)
239 {
240 commonUsed.push_back(word);
241 }
242 else
243 {
244 if (capitalOr) capitalOr = false;
245 if (query.size() < 10)
246 {
247 query.push_back(lowerWord);
248 }
249 else
250 {
251 setIgnore(lowerWord);
252
253 if (debug) cerr << "ignore = " << ignore << "\n";
254
255 break;
256 }
257 }
258 }
259
260 if (endQuote == string::npos && space == string::npos)
261 {
262 begin = string::npos;
263 }
264 else if (beginQuote == begin && endQuote != string::npos)
265 {
266 begin = endQuote + 1;
267 }
268 else
269 {
270 begin = space + 1;
271 }
272 }
273
274 if (debug)
275 {
276 cerr << "query = {\n";
277
278 for (unsigned index = 0; index < query.size(); index++)
279 {
280 cerr << " [" << index << "] = " << query[index] << "\n";
281 }
282
283 cerr << "}\n";
284 }
285 }
286
287 void Searcher::setCommon()
288 {
289 common.insert("&");
290 common.insert("a");
291 common.insert("about");
292 common.insert("an");
293 // "and" has its own special message
294 // common.insert("and");
295 common.insert("are");
296 common.insert("as");
297 common.insert("at");
298 common.insert("be");
299 common.insert("by");
300 common.insert("com");
301 common.insert("from");
302 common.insert("how");
303 common.insert("i");
304 common.insert("in");
305 common.insert("is");
306 common.insert("it");
307 common.insert("of");
308 common.insert("on");
309 // "or" has its own special message
310 // common.insert("or");
311 common.insert("that");
312 common.insert("the");
313 common.insert("this");
314 common.insert("to");
315 common.insert("was");
316 common.insert("what");
317 common.insert("when");
318 common.insert("where");
319 common.insert("which");
320 common.insert("who");
321 common.insert("why");
322 common.insert("will");
323 common.insert("with");
324 }