ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Searcher.cpp
Revision: 1
Committed: 2002-12-04T20:22:59-08:00 (22 years, 6 months ago) by douglas
File size: 7809 byte(s)
Log Message:
Initial revision

File Contents

# User Rev Content
1 douglas 1 /* ============================================================================
2     * Douglas Thrift's Search Engine License
3     *
4     * Copyright (C) 2002, Douglas Thrift. All Rights Reserved.
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions are met:
7     *
8     * 1. Redistributions of source code must retain the above copyright notice,
9     * this list of conditions and the following disclaimer.
10     *
11     * 2. Redistributions in binary form must reproduce the above copyright notice,
12     * this list of conditions and the following disclaimer in the documentation
13     * and/or other materials provided with the distribution.
14     *
15     * 3. The end-user documentation included with the redistribution, if any, must
16     * include the following acknowledgment:
17     *
18     * "This product includes software developed by Douglas Thrift
19     * (http://computers.douglasthrift.net/searchengine/)."
20     *
21     * Alternately, this acknowledgment may appear in the software itself, if
22     * and wherever such third-party acknowledgments normally appear.
23     *
24     * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25     * be used to endorse or promote products derived from this software without
26     * specific prior written permission. For written permission, please visit
27     * http://www.douglasthrift.net/contact.cgi for contact information.
28     *
29     * 5. Products derived from this software may not be called "Douglas Thrift's
30     * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31     * name, without prior written permission.
32     *
33     * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34     * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35     * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38     * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39     * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40     * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41     * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42     * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43     * ============================================================================
44     */
45     // Douglas Thrift's Search Engine Searcher
46     //
47     // Douglas Thrift
48     //
49     // Searcher.cpp
50    
51     #include "Searcher.h"
52    
53     Searcher::Searcher(string& queryString)
54     {
55     setCommon();
56     setAnd(false);
57     setOr(false);
58    
59     this->queryString = queryString;
60    
61     setQuery();
62     }
63    
64     void Searcher::search(vector<string> indices)
65     {
66     start = clock();
67    
68     const string XMLTYPE = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" stan"
69     + string("dalone=\"no\"?>");
70     const string DOCTYPE = "<!DOCTYPE index SYSTEM \"index.dtd\">";
71    
72     if (query.size() > 0)
73     {
74     for (int index = 0; index < indices.size(); index++)
75     {
76     ifstream fin(indices[index].c_str());
77    
78     if (!fin.is_open())
79     {
80     cerr << program << ": Could not open index file: "
81     << indices[index] << "\n";
82     fin.clear();
83    
84     continue;
85     }
86    
87     string line;
88    
89     getline(fin, line);
90     if (line != XMLTYPE)
91     {
92     cerr << program << ": Invalid XML version declaration: "
93     << indices[index] << "\n";
94     fin.close();
95     fin.clear();
96    
97     continue;
98     }
99    
100     getline(fin, line);
101     if (line != DOCTYPE)
102     {
103     cerr << program << ": Invalid XML doctype: " << indices[index]
104     << "\n";
105     fin.close();
106     fin.clear();
107     continue;
108     }
109    
110     getline(fin, line);
111     if (line != "<index>")
112     {
113     cerr << program << ": Invalid XML root element: "
114     << indices[index] << "\n";
115     fin.close();
116     fin.clear();
117     continue;
118     }
119    
120     while (fin.good())
121     {
122     Page page;
123    
124     fin >> page;
125    
126     if (!page.empty()) search(page);
127     }
128    
129     fin.close();
130     fin.clear();
131     }
132     }
133    
134     finish = clock();
135     }
136    
137     void Searcher::search(Page& page)
138     {
139     Ranker ranker(page);
140     ranker.rank(query);
141    
142     if (ranker != 0)
143     {
144     ranker.setSample();
145     pages.insert(ranker);
146     }
147     }
148    
149     void Searcher::setQuery()
150     {
151     string queryStringNormal = queryString;
152     normalize(queryStringNormal);
153    
154     if (debug) cerr << "queryString = " << queryString << "\n"
155     << "queryStringNormal = " << queryStringNormal << "\n";
156    
157     unsigned begin = 0;
158     bool capitalOr = false, quotes = false;
159     while (begin < queryStringNormal.length())
160     {
161     string word;
162    
163     unsigned beginQuote = queryStringNormal.find('\"', begin);
164     unsigned endQuote = beginQuote != string::npos ?
165     queryStringNormal.find('\"', beginQuote + 1) : string::npos;
166     unsigned space = queryStringNormal.find(' ', begin);
167    
168     if (beginQuote == begin)
169     {
170     quotes = true;
171     word = queryStringNormal.substr(beginQuote + 1, endQuote -
172     beginQuote - 1);
173     }
174     else if (beginQuote == (begin + 8) && queryStringNormal.substr(begin, 8
175     ) == "intitle:")
176     {
177     quotes = true;
178     word = "intitle:" + queryStringNormal.substr(beginQuote + 1,
179     endQuote - beginQuote - 1);
180     beginQuote = begin;
181     }
182     else if (beginQuote == (begin + 6) && queryStringNormal.substr(begin, 6
183     ) == "inurl:")
184     {
185     quotes = true;
186     word = "inurl:" + queryStringNormal.substr(beginQuote + 1, endQuote
187     - beginQuote - 1);
188     beginQuote = begin;
189     }
190     else if (beginQuote == (begin + 7) && queryStringNormal.substr(begin, 7
191     ) == "intext:")
192     {
193     quotes = true;
194     word = "intext:" + queryStringNormal.substr(beginQuote + 1,
195     endQuote - beginQuote - 1);
196     beginQuote = begin;
197     }
198     else
199     {
200     quotes = false;
201     word = queryStringNormal.substr(begin, space - begin);
202     }
203    
204     if (word != "")
205     {
206     string lowerWord(word.length(), ' ');
207    
208     for (unsigned index = 0; index < word.length(); index++)
209     {
210     lowerWord[index] = tolower(word[index]);
211     }
212    
213     if (debug) cerr << "word = " << word << "\n"
214     << "lowerWord = " << lowerWord << "\n";
215    
216     if (word == "OR" && !quotes)
217     {
218     capitalOr = true;
219     query[query.size() - 1] += " OR";
220     }
221     else if (lowerWord == "or" && !quotes)
222     {
223     setOr(true);
224     }
225     else if (lowerWord == "and" && !quotes)
226     {
227     setAnd(true);
228     }
229     else if (common.find(lowerWord) != common.end() && !quotes)
230     {
231     commonUsed.push_back(word);
232     }
233     else
234     {
235     if (capitalOr) capitalOr = false;
236     if (query.size() < 10)
237     {
238     query.push_back(lowerWord);
239     }
240     else
241     {
242     setIgnore(lowerWord);
243    
244     if (debug) cerr << "ignore = " << ignore << "\n";
245    
246     break;
247     }
248     }
249     }
250    
251     if (endQuote == string::npos && space == string::npos)
252     {
253     begin = string::npos;
254     }
255     else if (beginQuote == begin && endQuote != string::npos)
256     {
257     begin = endQuote + 1;
258     }
259     else
260     {
261     begin = space + 1;
262     }
263     }
264    
265     if (debug)
266     {
267     cerr << "query = {\n";
268    
269     for (unsigned index = 0; index < query.size(); index++)
270     {
271     cerr << " [" << index << "] = " << query[index] << "\n";
272     }
273    
274     cerr << "}\n";
275     }
276     }
277    
278     void Searcher::setCommon()
279     {
280     common.insert("&");
281     common.insert("a");
282     common.insert("about");
283     common.insert("an");
284     // "and" has its own special message
285     // common.insert("and");
286     common.insert("are");
287     common.insert("as");
288     common.insert("at");
289     common.insert("be");
290     common.insert("by");
291     common.insert("com");
292     common.insert("from");
293     common.insert("how");
294     common.insert("i");
295     common.insert("in");
296     common.insert("is");
297     common.insert("it");
298     common.insert("of");
299     common.insert("on");
300     // "or" has its own special message
301     // common.insert("or");
302     common.insert("that");
303     common.insert("the");
304     common.insert("this");
305     common.insert("to");
306     common.insert("was");
307     common.insert("what");
308     common.insert("when");
309     common.insert("where");
310     common.insert("which");
311     common.insert("who");
312     common.insert("why");
313     common.insert("will");
314     common.insert("with");
315     }