ViewVC Help
View File | Revision Log | Show Annotations | Download File | View Changeset | Root Listing
root/proj/trunk/Search/Search.cpp
Revision: 35
Committed: 2003-01-16T17:27:03-08:00 (22 years, 5 months ago) by douglas
File size: 13935 byte(s)
Log Message:
Embedded DTD into the program and removed it from tree.
Added usage() to bad argument handlers.

File Contents

# Content
1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Main
46 //
47 // Douglas Thrift
48 //
49 // Search.cpp
50
51 #include "Search.h"
52 #include "Indexer.h"
53 #include "Searcher.h"
54 #include "Outputer.h"
55
56 #ifndef _WIN32
57 #include <sys/utsname.h>
58 #else
59 #include <windows.h>
60 #endif // _WIN32
61
// Name the program was invoked as; main() assigns argv[0] to it and the
// diagnostics and usage text print it.
string program;

// Product name and version; agent() and version() build their output from
// these two strings.
string programName("Douglas Thrift's Search Engine");
string programVersion("1.1alpha2");

// Debug switch; main() turns it on when -D is given.
bool debug(false);
66
67 int main(int argc, char* argv[])
68 {
69 program = argv[0];
70
71 bool indexMode = false;
72 string indexURL;
73 set<string> indexDomains;
74 set<string> indexRestrictions;
75
76 unsigned page = 1;
77 string query;
78
79 vector<string> indices;
80
81 string header = "header.html";
82 string body = "body.html";
83 string footer = "footer.html";
84 string notfound = "notfound.html";
85 string pages = "pages.html";
86
87 for (int index = 1; index < argc; index++)
88 {
89 string arg(argv[index]);
90
91 if (arg == "-help")
92 {
93 usage();
94 return 0;
95 }
96 else if (arg == "-version")
97 {
98 version();
99 return 0;
100 }
101 else if (arg == "-license")
102 {
103 license();
104 return 0;
105 }
106 else if (arg == "-P")
107 {
108 if (++index < argc)
109 {
110 page = strtoul(argv[index],0,0);
111 }
112 else
113 {
114 cerr << program << ": Bad arguments\n";
115 usage();
116 return 1;
117 }
118 }
119 else if (arg == "-i")
120 {
121 indexMode = true;
122
123 if (++index < argc)
124 {
125 indexURL = argv[index];
126 }
127 else
128 {
129 cerr << program << ": Bad arguments\n";
130 usage();
131 return 1;
132 }
133 }
134 else if (arg == "-d")
135 {
136 if (++index < argc)
137 {
138 indexDomains.insert(argv[index]);
139 }
140 else
141 {
142 cerr << program << ": Bad arguments\n";
143 usage();
144 return 1;
145 }
146 }
147 else if (arg == "-r")
148 {
149 if (++index < argc)
150 {
151 indexRestrictions.insert(argv[index]);
152 }
153 else
154 {
155 cerr << program << ": Bad arguments\n";
156 usage();
157 return 1;
158 }
159 }
160 else if (arg == "-h")
161 {
162 if (++index < argc)
163 {
164 header = argv[index];
165 }
166 else
167 {
168 cerr << program << ": Bad arguments\n";
169 usage();
170 return 1;
171 }
172 }
173 else if (arg == "-b")
174 {
175 if (++index < argc)
176 {
177 body = argv[index];
178 }
179 else
180 {
181 cerr << program << ": Bad arguments\n";
182 usage();
183 return 1;
184 }
185 }
186 else if (arg == "-f")
187 {
188 if (++index < argc)
189 {
190 footer = argv[index];
191 }
192 else
193 {
194 cerr << program << ": Bad arguments\n";
195 usage();
196 return 1;
197 }
198 }
199 else if (arg == "-n")
200 {
201 if (++index < argc)
202 {
203 notfound = argv[index];
204 }
205 else
206 {
207 cerr << program << ": Bad arguments\n";
208 usage();
209 return 1;
210 }
211 }
212 else if (arg == "-p")
213 {
214 if (++index < argc)
215 {
216 pages = argv[index];
217 }
218 else
219 {
220 cerr << program << ": Bad arguments\n";
221 usage();
222 return 1;
223 }
224 }
225 else if (arg == "-e")
226 {
227 if (++index < argc)
228 {
229 // set from email
230 }
231 else
232 {
233 cerr << program << ": Bad arguments\n";
234 usage();
235 return 1;
236 }
237 }
238 else if (arg == "-D")
239 {
240 debug = true;
241 }
242 else
243 {
244 indices.push_back(arg);
245 }
246 }
247
248 if (indices.size() < 1)
249 {
250 usage();
251 return 0;
252 }
253
254 if (indexMode)
255 {
256 if (indices.size() > 1)
257 {
258 cerr << program << ": Too many indices, can only build one index"
259 << " at a time\n";
260 return 1;
261 }
262
263 if (indexDomains.size() < 1)
264 {
265 cerr << program << ": Must specify at least one domain\n";
266 return 1;
267 }
268
269 Indexer indexer(indices[0], indexDomains, indexRestrictions);
270
271 indexer.index(indexURL);
272 }
273 else
274 {
275 string line;
276 getline(cin, line);
277 query = line;
278
279 Searcher searcher(query);
280
281 searcher.search(indices);
282
283 Outputer outputer(header, body, footer, notfound,
284 pages);
285
286 outputer.output(searcher, page < 1 ? page : --page);
287 }
288
289 return 0;
290 }
291
292 string agent(bool version)
293 {
294 string agent = programName + (version ? ('/' + programVersion) : "");
295
296 return agent;
297 }
298
// Describes the host system as "(os version[ marketing] architecture)".
//
// On Windows the OS family and numeric version come from GetVersionEx(), the
// processor architecture from GetSystemInfo(), and the version numbers listed
// below get a bracketed marketing name appended after the version. Elsewhere
// the three fields are the sysname, release and machine members filled in by
// uname(); if uname() fails every field reads "unknown".
string platform()
{
	string os;
	string version;
	string architecture;
	string marketing;

#ifdef _WIN32
	// Value-initialized so the fields read as zero should GetVersionEx() fail.
	OSVERSIONINFO computer = OSVERSIONINFO();
	computer.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
	GetVersionEx(&computer);

	const bool nt = computer.dwPlatformId == VER_PLATFORM_WIN32_NT;
	const unsigned major = computer.dwMajorVersion;
	const unsigned minor = computer.dwMinorVersion;

	os = nt ? "Windows NT" : "Windows";

	SYSTEM_INFO processor;
	GetSystemInfo(&processor);

	switch (processor.wProcessorArchitecture)
	{
	case PROCESSOR_ARCHITECTURE_INTEL:
		architecture = "ix86";
		break;
	case PROCESSOR_ARCHITECTURE_MIPS:
		architecture = "mips";
		break;
	case PROCESSOR_ARCHITECTURE_ALPHA:
		architecture = "alpha";
		break;
	case PROCESSOR_ARCHITECTURE_PPC:
		architecture = "ppc";
		break;
	case PROCESSOR_ARCHITECTURE_IA64:
		architecture = "ia64";
		break;
	case PROCESSOR_ARCHITECTURE_IA32_ON_WIN64:
		architecture = "ix86_on_win64";
		break;
	case PROCESSOR_ARCHITECTURE_AMD64:
		architecture = "amd64";
		break;
	default:
		architecture = "unknown";
		break;
	}

	// Two unsigned values and a dot need at most 22 bytes with the
	// terminator, so a small stack buffer is enough.
	char cversion[32];
	sprintf(cversion, "%u.%u", major, minor);
	version = cversion;

	// The marketing name carries its own leading space because it is glued
	// directly onto the version below.
	if (!nt && major == 4 && minor <= 3)
	{
		marketing = " [Windows 95]";
	}
	else if (!nt && major == 4 && minor == 10)
	{
		marketing = " [Windows 98]";
	}
	else if (!nt && major == 4 && minor == 90)
	{
		marketing = " [Windows ME]";
	}
	else if (nt && major == 5 && minor == 0)
	{
		marketing = " [Windows 2000]";
	}
	else if (nt && major == 5 && minor == 1)
	{
		marketing = " [Windows XP]";
	}
	else if (nt && major == 5 && minor == 2)
	{
		marketing = " [Windows .NET Server]";
	}
#else // _WIN32
	struct utsname computer;

	// uname() reports failure with a negative value; the structure must not
	// be read in that case.
	if (uname(&computer) < 0)
	{
		os = "unknown";
		version = "unknown";
		architecture = "unknown";
	}
	else
	{
		os = computer.sysname;
		version = computer.release;
		architecture = computer.machine;
	}
#endif // _WIN32

	return "(" + os + " " + version + marketing + " " + architecture + ")";
}
395
396 void usage()
397 {
398 string tab(8 + program.length(), ' ');
399
400 cout << "Usage: " << program << " [index ...] [-P page] [-h header] [-b bo"
401 << "dy]\n"
402 << tab << "[-f footer] [-n notfound] [-p pages]\n"
403 << tab << "[-i begin] [-d domain ...] [-r restriction ...]\n"
404 << tab << "[-D] [-version] [-help]\n"
405 << "Options:\n"
406 << " index Index file to use (can only use one file for i"
407 << "ndexing)\n"
408 << " -P page Page of search to display (defaults to 1)\n"
409 << " -h header Header template to use (defaults to header.htm"
410 << "l)\n"
411 << " -b body Body template to use (defaults to body.html)\n"
412 << " -f footer Footer template to use (defaults to footer.htm"
413 << "l)\n"
414 << " -n notfound Not found template to use (defaults to notfoun"
415 << "d.html)\n"
416 << " -p pages Pages template to use (defaults to pages.html)"
417 << "\n"
418 << " -i begin URL to begin indexing (causes indexing rather "
419 << "than search)\n"
420 << " -d domain Domain to include in indexing\n"
421 << " -r restriction URL to restrict from indexing\n"
422 << " -D Display debug information\n"
423 << " -version Display version information and exit\n"
424 << " -license Display license information and exit\n"
425 << " -help Display this message and exit\n";
426 }
427
428 void version()
429 {
430 cout << programName << " " << programVersion << " "<< platform() << "\n\n"
431 << " Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.\n"
432 << "\n"
433 << " This product includes software developed by Douglas Thrift\n"
434 << " (http://computers.douglasthrift.net/searchengine/).\n";
435 }
436
// Prints the full license text to cout.
//
// The text is one string constant assembled by the compiler from the adjacent
// literals below and written with a single insertion.
void license()
{
	static const char* const text =
		"License:\n"
		" Douglas Thrift's Search Engine License\n\n"
		" Copyright (C) 2002-2003, Douglas Thrift. All Rights Reserved.\n"
		"\n"
		" Redistribution and use in source and binary forms, with or with"
		"out\n"
		" modification, are permitted provided that the following conditi"
		"ons are met:\n\n"
		" 1. Redistributions of source code must retain the above copyrig"
		"ht notice,\n"
		" this list of conditions and the following disclaimer.\n\n"
		" 2. Redistributions in binary form must reproduce the above copy"
		"right notice,\n"
		" this list of conditions and the following disclaimer in the "
		"documentation\n"
		" and/or other materials provided with the distribution.\n\n"
		" 3. The end-user documentation included with the redistribution,"
		" if any, must\n"
		" include the following acknowledgment:\n\n"
		" \"This product includes software developed by Douglas Thr"
		"ift\n"
		" (http://computers.douglasthrift.net/searchengine/).\"\n\n"
		" Alternately, this acknowledgment may appear in the software "
		"itself, if\n"
		" and wherever such third-party acknowledgments normally appea"
		"r.\n\n"
		" 4. The names \"Douglas Thrift\" and \"Douglas Thrift\'s Search "
		"Engine\" must not\n"
		" be used to endorse or promote products derived from this sof"
		"tware without\n"
		" specific prior written permission. For written permission, p"
		"lease visit\n"
		" http://www.douglasthrift.net/contact.cgi for contact inform"
		"ation.\n\n"
		" 5. Products derived from this software may not be called \"Doug"
		"las Thrift\'s\n"
		" Search Engine\", nor may \"Douglas Thrift\'s Search Engine\""
		" appear in their\n"
		" name, without prior written permission.\n\n"
		" THIS SOFTWARE IS PROVIDED \"AS IS\" AND ANY EXPRESS OR IMPLIED "
		"WARRANTIES,\n"
		" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHA"
		"NTABILITY AND\n"
		" FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SH"
		"ALL THE\n"
		" COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIR"
		"ECT,\n"
		" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU"
		"DING, BUT NOT\n"
		" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS O"
		"F USE, DATA,\n"
		" OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY"
		" THEORY OF\n"
		" LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCL"
		"UDING\n"
		" NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF T"
		"HIS SOFTWARE,\n"
		" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n";

	cout << text;
}
498
// Replaces every occurrence of character in line with the text entity, for
// example '&' with "&amp;". line is modified in place.
//
// The scan resumes after the inserted text, so an entity that itself contains
// character is never expanded a second time, and an empty entity simply
// deletes every occurrence.
void entities(string& line, char character, char* entity)
{
	const string replacement(entity);
	string::size_type begin = 0;

	while (begin < line.length())
	{
		const string::size_type spot = line.find(character, begin);

		if (spot == string::npos)
		{
			break;
		}

		line.replace(spot, 1, replacement);

		// Skip the whole replacement rather than a single character.
		begin = spot + replacement.length();
	}
}
521
// Replaces every occurrence of the text entity in line with the single
// character, for example "&amp;" with '&'. line is modified in place.
//
// The scan resumes just after the inserted character, so text that only forms
// the entity as a result of a replacement is left alone. An empty entity
// leaves line untouched.
void entities(string& line, char* entity, char character)
{
	const string::size_type width = strlen(entity);

	// An empty entity would match at every position and never terminate.
	if (width == 0)
	{
		return;
	}

	string::size_type spot = line.find(entity, 0);

	while (spot != string::npos)
	{
		line.replace(spot, width, 1, character);
		spot = line.find(entity, spot + 1);
	}
}
544
// Normalizes the whitespace of abbynormal in place: every run of whitespace
// characters is reduced to the first character of the run, then a whitespace
// character left at the very start or the very end is removed.
//
// Empty and all-whitespace strings are handled (the result is empty), and
// characters are passed to isspace() as unsigned char so that bytes with the
// high bit set are classified safely.
void normalize(string& abbynormal)
{
	for (string::size_type index = 0; index < abbynormal.length(); index++)
	{
		if (!isspace(static_cast<unsigned char>(abbynormal[index])))
		{
			continue;
		}

		// Find the end of the run without reading past the string.
		string::size_type next = index + 1;

		while (next < abbynormal.length()
			&& isspace(static_cast<unsigned char>(abbynormal[next])))
		{
			next++;
		}

		// Keep the first whitespace character and drop the rest of the run.
		abbynormal.erase(index + 1, next - index - 1);
	}

	if (!abbynormal.empty()
		&& isspace(static_cast<unsigned char>(abbynormal[0])))
	{
		abbynormal.erase(0, 1);
	}

	if (!abbynormal.empty()
		&& isspace(static_cast<unsigned char>(
			abbynormal[abbynormal.length() - 1])))
	{
		abbynormal.erase(abbynormal.length() - 1, 1);
	}
}