// Site Mapper // // Douglas Thrift // // $Id$ #ifdef _WIN32 #pragma warning(disable:4503) #endif #include "SiteMapper.hpp" #include "Matcher.hpp" #include "Page.hpp" string program; bool debug = false; int main(int argc, char* argv[]) { program = argv[0]; string siteIndex, siteMap; for (int index = 1; index < argc; index++) { string arg(argv[index]); Matcher matcher; if (arg == matcher("^-index=(.*)$")) { siteIndex = matcher[1]; } else if (arg == matcher("^-map=(.*)$")) { siteMap = matcher[1]; } else if (arg == "-D") { if (!debug) debug = true; } } if (siteIndex != "" && siteMap != "") { XMLPlatformUtils::Initialize(); XPathEvaluator::initialize(); SiteMapper mapper(siteIndex, siteMap); XPathEvaluator::terminate(); XMLPlatformUtils::Terminate(); } else { cout << "Usage: " << program << " -index=index -map=map [-D]\n"; } return 0; } SiteMapper::SiteMapper(const string& siteIndex, const string& siteMap) { oldMap(siteMap); newIndex(siteIndex); newMap(siteMap); } void SiteMapper::oldMap(const string& siteMap) { support.setParserLiaison(&liaison); XalanDOMString file(siteMap.c_str()); LocalFileInputSource source(file.c_str()); XalanDocument* document = liaison.parseXMLStream(source); if (document == 0) return; XalanNode* list = evaluator.selectSingleNode(support, document, XalanDOMString("/page/section/list").c_str()); if (list == 0) return; item = evaluator.createXPath(XalanDOMString("item").c_str()); address = evaluator.createXPath(XalanDOMString("link/@address").c_str()); link = evaluator.createXPath(XalanDOMString("link").c_str()); this->list = evaluator.createXPath(XalanDOMString("list").c_str()); oldMap(pages, list); evaluator.destroyXPath(item); evaluator.destroyXPath(address); evaluator.destroyXPath(link); evaluator.destroyXPath(this->list); } void SiteMapper::oldMap(vector& pages, XalanNode* list) { NodeRefList nodes = evaluator.selectNodeList(support, list, *item); for (int index = 0; index < nodes.getLength(); index++) { XalanNode* node = nodes.item(index); ostringstream url, title; url << evaluator.evaluate(support, node, *address)->str(); title << evaluator.evaluate(support, node, *link)->str(); Page page(url.str(), title.str()); XalanNode* list = evaluator.selectSingleNode(support, node, *(this->list)); if (list != 0) oldMap(page.getChildren(), list); pages.push_back(page); } } void SiteMapper::newIndex(const string& siteIndex) { XalanDOMString file(siteIndex.c_str()); LocalFileInputSource source(file.c_str()); XalanDocument* document = liaison.parseXMLStream(source); if (document == 0) return; address = evaluator.createXPath(XalanDOMString("address").c_str()); port = evaluator.createXPath(XalanDOMString("port").c_str()); path = evaluator.createXPath(XalanDOMString("path").c_str()); title = evaluator.createXPath(XalanDOMString("title").c_str()); NodeRefList nodes = evaluator.selectNodeList(support, document, XalanDOMString("/index/page").c_str()); for (int index = 0; index < nodes.getLength(); index++) { XalanNode* node = nodes.item(index); ostringstream address; address << evaluator.evaluate(support, node, *(this->address))->str(); double port = evaluator.evaluate(support, node, *(this->port))->num(); if (port >= 0 && port <= 65535) { address << ':' << int(port); } ostringstream path, title; path << evaluator.evaluate(support, node, *(this->path))->str(); title << evaluator.evaluate(support, node, *(this->title))->str(); Page page(address.str(), path.str(), title.str()); Matcher matcher; if (page == matcher(string("^Douglas\\sThrift's\\sWebsite\\s\\|\\sDou") + "glas\\sThrift's\\sBlog:\\s(.+)$")) { if (Matcher("^\\w+\\s\\d\\d\\d\\d\\sArchives$") == matcher[1]) { page.setTitle(matcher[1]); if (newIndex(pages, page)) continue; } else continue; } else if (page == matcher("^Douglas\\sThrift's.+Website\\s\\|\\s(.+)$")) { page.setTitle(matcher[1]); if (newIndex(pages, page)) continue; } else continue; multimap items; newPages.insert(pair >(page.getAddress(), items)).first->second.insert(pair(page.getChildOf(), page)); } evaluator.destroyXPath(address); evaluator.destroyXPath(port); evaluator.destroyXPath(path); evaluator.destroyXPath(title); } bool SiteMapper::newIndex(vector& pages, Page& page) { for (unsigned index = 0; index < pages.size(); index++) { if (pages[index] == page.getAddress()) { Matcher matcher; if (pages[index] == page) { page.setChildren(pages[index].getChildren()); pages[index] = page; return true; } else if (matcher('^' + pages[index].getPath()) == page) { page.setChildOf(matcher[0]); if (matcher('^' + pages[index].getTitle() + "\\s\\|\\s(.+)$") == page) { page.setTitle(matcher[1]); } return newIndex(pages[index].getChildren(), page); } } } return false; } void SiteMapper::newMap(const string& siteMap) { for (unsigned index = 0; index < pages.size(); index++) { if (newPages.find(pages[index].getAddress()) != newPages.end()) { newMap(pages[index].getChildren(), pages[index].getPath(), newPages.find(pages[index].getAddress())->second); } cout << pages[index] << '\n'; } } void SiteMapper::newMap(vector& pages, const string& childOf, multimap& newPages) { for (unsigned index = 0; index < pages.size(); index++) { newMap(pages[index].getChildren(), pages[index].getPath(), newPages); } for (multimap::iterator itor = newPages.lower_bound(childOf); itor != newPages.upper_bound(childOf); itor++) { pages.push_back(itor->second); } newPages.erase(childOf); }