// Site Mapper // // Douglas Thrift // // $Id$ #include #include #include #include #include "SiteMapper.hpp" int Main(const app::Options &options) { SiteMapper::program = api::GetExecutablePath().GetName(); cse::String siteIndex, siteMap; api::Pcre::RegEx index(_B("^-index=(.+)$")), map(_B("^-map=(.+)$")); _foreach (const app::ArgumentList, arg, app::GetArguments()) { api::Pcre::RegEx::Match match; if (match = index(*arg)) { siteIndex = match[1]; } else if (match = map(*arg)) { siteMap = match[1]; } else if (*arg == _B("-D")) { if (!SiteMapper::debug) SiteMapper::debug = true; } } if (!siteIndex.IsEmpty() && !siteMap.IsEmpty()) SiteMapper mapper(siteIndex, siteMap); else api::Cout << _B("Usage: ") << SiteMapper::program << _B(" -index=index -map=map [-D]") << ios::NewLine; return 0; } SiteMapper::SiteMapper(const cse::String &siteIndex, const cse::String &siteMap) { oldMap(siteMap); newIndex(siteIndex); newMap(siteMap); } cse::String SiteMapper::program; bool SiteMapper::debug(false); void SiteMapper::oldMap(const cse::String &siteMap) { ext::Handle document(xml::Parse(siteMap)); ext::Handle list(*document/_B("page")/_B("section")/_B("list")); comment = *document/_B("comment()"); if (debug) api::Cerr << _B("comment = ") << comment << ios::NewLine; oldMap(pages, list); } void SiteMapper::oldMap(ext::Vector &pages, xml::Node* list) { xml::NodeSet nodes(*list/_B("item")); _foreach (xml::NodeSet, node, nodes) { cse::String url(**node/_B("link")/_B("@address")), title(**node/_B("link")); Page page(url, title); ext::Handle list(**node/_B("list")); if (!list.IsEmpty()) oldMap(page.GetChildren(), list); pages.InsertLast(page); } } void SiteMapper::newIndex(const cse::String &siteIndex) { ext::Handle document(xml::Parse(siteIndex)); xml::NodeSet nodes(*document/_B("index")/_B("page")); _foreach (xml::NodeSet, node, nodes) { _S address(**node/_B("address")); cse::String port(**node/_B("port")); if (!port.IsEmpty()) address << _B(":") << port; cse::String path(**node/_B("path")), title(**node/_B("title")); Page page(address, path, title); static api::Pcre::RegEx blog(_B("^Douglas\\sThrift's\\sWebsite\\s\\|\\sDouglas\\sThrift's\\sBlog:\\s(.+)$")), page_(_B("^Douglas\\sThrift's.+Website\\s\\|\\s(.+)$")); if (api::Pcre::RegEx::Match match = blog(page.GetTitle())) { static api::Pcre::RegEx archives(_B("^\\w+\\s\\d{4}\\sArchives$")); if (archives(match[1])) { page.SetTitle(match[1]); if (newIndex(pages, page)) continue; } else continue; } else if (api::Pcre::RegEx::Match match = page_(page.GetTitle())) { page.SetTitle(match[1]); if (newIndex(pages, page)) continue; } else continue; newPages[page.GetAddress()][page.GetChildOf()].InsertLast(page); } } bool SiteMapper::newIndex(ext::Vector &pages, Page &page) { _foreach (ext::Vector, page_, pages) { if (*page_ == page.GetAddress()) { if (*page_ == page) { page.SetChildren(page_->GetChildren()); *page_ = page; api::Cout << _B("Updated: ") << page.GetUrl() << ios::NewLine; return true; } else if (page.GetPath().StartsWithAll(page_->GetPath())) { page.SetChildOf(page_->GetPath()); api::Pcre::RegEx title(_S() << _B("^") << page_->GetTitle() << "\\s\\|\\s(.+)$"); if (api::Pcre::RegEx::Match match = title(page.GetTitle())) page.SetTitle(match[1]); return newIndex(page_->GetChildren(), page); } } } return false; } void SiteMapper::newMap(const cse::String &siteMap) { _S file(siteMap); _S fout(file); _S xml(file); // XXX: xml::TextWriter should have this kind of stuff, no? fout << ios::NewLine << _B("") << ios::NewLine << _B(""); xml.OutputComment(comment); xml::ScopeElement page(xml, _B("page")); xml.OpenElement(_B("title")); xml.OutputText(_B("Sitemap")); xml.CloseElement(); xml::ScopeElement section(xml, _B("section")), list(xml, _B("list")); _foreach (ext::Vector, page, pages) { if (newPages.Contains(page->GetAddress())) newMap(page->GetChildren(), page->GetPath(), newPages.Find(page->GetAddress())->Second()); xml << *page; } } void SiteMapper::newMap(ext::Vector &pages, const cse::String &childOf, ext::RedBlackMap, LessThan> &newPages) { _foreach (ext::Vector, page, pages) newMap(page->GetChildren(), page->GetPath(), newPages); _foreach (ext::Vector, page, newPages[childOf]) { api::Cout << _B("Added: ") << page->GetUrl() << ios::NewLine; pages.InsertLast(*page); } newPages.Remove(childOf); }