ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Searcher.cpp
Revision: 28
Committed: 2003-01-02T19:42:33-08:00 (22 years, 5 months ago) by douglas
File size: 7814 byte(s)
Log Message:
Changed Copyright notices to state 2002-2003.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Searcher
46 //
47 // Douglas Thrift
48 //
49 // Searcher.cpp
50
51 #include "Searcher.h"
52
53 Searcher::Searcher(string& queryString)
54 {
55 setCommon();
56 setAnd(false);
57 setOr(false);
58
59 this->queryString = queryString;
60
61 setQuery();
62 }
63
64 void Searcher::search(vector<string> indices)
65 {
66 start = clock();
67
68 const string XMLTYPE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" stan"
69 + string("dalone=\"no\"?>");
70 const string DOCTYPE = "<!DOCTYPE index SYSTEM \"index.dtd\">";
71
72 if (query.size() > 0)
73 {
74 for (int index = 0; index < indices.size(); index++)
75 {
76 ifstream fin(indices[index].c_str());
77
78 if (!fin.is_open())
79 {
80 cerr << program << ": Could not open index file: "
81 << indices[index] << "\n";
82 fin.clear();
83
84 continue;
85 }
86
87 string line;
88
89 getline(fin, line);
90 if (line != XMLTYPE)
91 {
92 cerr << program << ": Invalid XML version declaration: "
93 << indices[index] << "\n";
94 fin.close();
95 fin.clear();
96
97 continue;
98 }
99
100 getline(fin, line);
101 if (line != DOCTYPE)
102 {
103 cerr << program << ": Invalid XML doctype: " << indices[index]
104 << "\n";
105 fin.close();
106 fin.clear();
107 continue;
108 }
109
110 getline(fin, line);
111 if (line != "<index>")
112 {
113 cerr << program << ": Invalid XML root element: "
114 << indices[index] << "\n";
115 fin.close();
116 fin.clear();
117 continue;
118 }
119
120 while (fin.good())
121 {
122 Page page;
123
124 fin >> page;
125
126 if (!page.empty()) search(page);
127 }
128
129 fin.close();
130 fin.clear();
131 }
132 }
133
134 finish = clock();
135 }
136
137 void Searcher::search(Page& page)
138 {
139 Ranker ranker(page);
140 ranker.rank(query);
141
142 if (ranker != 0)
143 {
144 ranker.setSample();
145 pages.insert(ranker);
146 }
147 }
148
149 void Searcher::setQuery()
150 {
151 string queryStringNormal = queryString;
152 normalize(queryStringNormal);
153
154 if (debug) cerr << "queryString = " << queryString << "\n"
155 << "queryStringNormal = " << queryStringNormal << "\n";
156
157 unsigned begin = 0;
158 bool capitalOr = false, quotes = false;
159 while (begin < queryStringNormal.length())
160 {
161 string word;
162
163 unsigned beginQuote = queryStringNormal.find('\"', begin);
164 unsigned endQuote = beginQuote != string::npos ?
165 queryStringNormal.find('\"', beginQuote + 1) : string::npos;
166 unsigned space = queryStringNormal.find(' ', begin);
167
168 if (beginQuote == begin)
169 {
170 quotes = true;
171 word = queryStringNormal.substr(beginQuote + 1, endQuote -
172 beginQuote - 1);
173 }
174 else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin, 8
175 ) == "intitle:")
176 {
177 quotes = true;
178 word = "intitle:" + queryStringNormal.substr(beginQuote + 1,
179 endQuote - beginQuote - 1);
180 beginQuote = begin;
181 }
182 else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin, 6
183 ) == "inurl:")
184 {
185 quotes = true;
186 word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote
187 - beginQuote - 1);
188 beginQuote = begin;
189 }
190 else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin, 7
191 ) == "intext:")
192 {
193 quotes = true;
194 word = "intext:" + queryStringNormal.substr(beginQuote + 1,
195 endQuote - beginQuote - 1);
196 beginQuote = begin;
197 }
198 else
199 {
200 quotes = false;
201 word = queryStringNormal.substr(begin, space - begin);
202 }
203
204 if (word != "")
205 {
206 string lowerWord(word.length(), ' ');
207
208 for (unsigned index = 0; index < word.length(); index++)
209 {
210 lowerWord[index] = tolower(word[index]);
211 }
212
213 if (debug) cerr << "word = " << word << "\n"
214 << "lowerWord = " << lowerWord << "\n";
215
216 if (word == "OR" && !quotes)
217 {
218 capitalOr = true;
219 query[query.size() - 1] += " OR";
220 }
221 else if (lowerWord == "or" && !quotes)
222 {
223 setOr(true);
224 }
225 else if (lowerWord == "and" && !quotes)
226 {
227 setAnd(true);
228 }
229 else if (common.find(lowerWord) != common.end() && !quotes)
230 {
231 commonUsed.push_back(word);
232 }
233 else
234 {
235 if (capitalOr) capitalOr = false;
236 if (query.size() < 10)
237 {
238 query.push_back(lowerWord);
239 }
240 else
241 {
242 setIgnore(lowerWord);
243
244 if (debug) cerr << "ignore = " << ignore << "\n";
245
246 break;
247 }
248 }
249 }
250
251 if (endQuote == string::npos && space == string::npos)
252 {
253 begin = string::npos;
254 }
255 else if (beginQuote == begin && endQuote != string::npos)
256 {
257 begin = endQuote + 1;
258 }
259 else
260 {
261 begin = space + 1;
262 }
263 }
264
265 if (debug)
266 {
267 cerr << "query = {\n";
268
269 for (unsigned index = 0; index < query.size(); index++)
270 {
271 cerr << " [" << index << "] = " << query[index] << "\n";
272 }
273
274 cerr << "}\n";
275 }
276 }
277
278 void Searcher::setCommon()
279 {
280 common.insert("&");
281 common.insert("a");
282 common.insert("about");
283 common.insert("an");
284 // "and" has its own special message
285 // common.insert("and");
286 common.insert("are");
287 common.insert("as");
288 common.insert("at");
289 common.insert("be");
290 common.insert("by");
291 common.insert("com");
292 common.insert("from");
293 common.insert("how");
294 common.insert("i");
295 common.insert("in");
296 common.insert("is");
297 common.insert("it");
298 common.insert("of");
299 common.insert("on");
300 // "or" has its own special message
301 // common.insert("or");
302 common.insert("that");
303 common.insert("the");
304 common.insert("this");
305 common.insert("to");
306 common.insert("was");
307 common.insert("what");
308 common.insert("when");
309 common.insert("where");
310 common.insert("which");
311 common.insert("who");
312 common.insert("why");
313 common.insert("will");
314 common.insert("with");
315 }