// Site Mapper // // Douglas Thrift // // $Id$ #include #include #include #include #include "SiteMapper.hpp" int Main(const app::Options& options) { SiteMapper::program = api::GetExecutablePath().GetName(); cse::String siteIndex, siteMap; api::Pcre::RegEx index(_B("^-index=(.+)$")), map(_B("^-map=(.+)$")); _foreach (const app::ArgumentList, arg, app::GetArguments()) { api::Pcre::RegEx::Match match; if (match = index(*arg)) { siteIndex = match[1]; } else if (match = map(*arg)) { siteMap = match[1]; } else if (*arg == _B("-D")) { if (!SiteMapper::debug) SiteMapper::debug = true; } } if (!siteIndex.IsEmpty() && !siteMap.IsEmpty()) SiteMapper mapper(siteIndex, siteMap); else api::Cout << _B("Usage: ") << SiteMapper::program << _B(" -index=index -map=map [-D]") << ios::NewLine; return 0; } SiteMapper::SiteMapper(const cse::String& siteIndex, const cse::String& siteMap) { oldMap(siteMap); newIndex(siteIndex); newMap(siteMap); } cse::String SiteMapper::program; bool SiteMapper::debug(false); void SiteMapper::oldMap(const cse::String& siteMap) { ext::Handle document(xml::Parse(siteMap)); ext::Handle list(*document/_B("page")/_B("section")/_B("list")); comment = *document/_B("comment()"); if (debug) api::Cerr << _B("comment = ") << comment << ios::NewLine; oldMap(pages, list); } void SiteMapper::oldMap(ext::Vector& pages, xml::Node* list) { xml::NodeSet nodes(*list/_B("item")); _foreach (xml::NodeSet, node, nodes) { cse::String url(**node/_B("link")/_B("@address")), title(**node/_B("link")); Page page(url, title); ext::Handle list(**node/_B("list")); if (!list.IsEmpty()) oldMap(page.GetChildren(), list); pages.InsertLast(page); } } void SiteMapper::newIndex(const cse::String& siteIndex) { ext::Handle document(xml::Parse(siteIndex)); xml::NodeSet nodes(*document/_B("index")/_B("page")); _foreach (xml::NodeSet, node, nodes) { _S address(**node/_B("address")); cse::String port(**node/_B("port")); if (!port.IsEmpty()) address << _B(":") << port; cse::String path(**node/_B("path")), title(**node/_B("title")); Page page(address, path, title); static api::Pcre::RegEx blog(_B("^Douglas\\sThrift's\\sWebsite\\s\\|\\sDouglas\\sThrift's\\sBlog:\\s(.+)$")), page_(_B("^Douglas\\sThrift's.+Website\\s\\|\\s(.+)$")); if (api::Pcre::RegEx::Match match = blog(page.GetTitle())) { static api::Pcre::RegEx archives(_B("^\\w+\\s\\d{4}\\sArchives$")); if (archives(match[1])) { page.SetTitle(match[1]); if (newIndex(pages, page)) continue; } else continue; } else if (api::Pcre::RegEx::Match match = page_(page.GetTitle())) { page.SetTitle(match[1]); if (newIndex(pages, page)) continue; } else continue; std::multimap items; newPages.insert(std::pair >(page.GetAddress(), items)).first->second.insert(std::pair(page.GetChildOf(), page)); } } bool SiteMapper::newIndex(ext::Vector& pages, Page& page) { _foreach (ext::Vector, page_, pages) { if (*page_ == page.GetAddress()) { if (*page_ == page) { page.SetChildren(page_->GetChildren()); *page_ = page; api::Cout << _B("Updated: ") << page.GetUrl() << ios::NewLine; return true; } else if (page.GetPath().StartsWithAll(page_->GetPath())) { page.SetChildOf(page_->GetPath()); api::Pcre::RegEx title(_S() << _B("^") << page_->GetTitle() << "\\s\\|\\s(.+)$"); if (api::Pcre::RegEx::Match match = title(page.GetTitle())) page.SetTitle(match[1]); return newIndex(page_->GetChildren(), page); } } } return false; } void SiteMapper::newMap(const cse::String& siteMap) { _S file(siteMap); _S fout(file); _S xml(file); fout << ios::NewLine << _B("") << ios::NewLine << _B(""); xml.OutputComment(comment); xml::ScopeElement page(xml, _B("page")); xml.OpenElement(_B("title")); xml.OutputText(_B("Sitemap")); xml.CloseElement(); xml::ScopeElement section(xml, _B("section")), list(xml, _B("list")); _foreach (ext::Vector, page, pages) { if (newPages.find(page->GetAddress()) != newPages.end()) newMap(page->GetChildren(), page->GetPath(), newPages.find(page->GetAddress())->second); xml << *page; } } void SiteMapper::newMap(ext::Vector& pages, const cse::String& childOf, std::multimap& newPages) { _foreach (ext::Vector, page, pages) newMap(page->GetChildren(), page->GetPath(), newPages); typedef std::multimap MultiMap; _forall (MultiMap::const_iterator, itor, newPages.lower_bound(childOf), newPages.upper_bound(childOf)) { api::Cout << _B("Added: ") << itor->second.GetUrl() << ios::NewLine; pages.InsertLast(itor->second); } newPages.erase(childOf); }