utils/sitemap.py (105 lines of code) (raw):

"""Generate a ``sitemap.xml`` file for a directory tree of HTML pages.

Every ``*.html`` file found under the input directory becomes a candidate
sitemap entry. The module-level ``transformations`` table is then applied to
each entry to drop it from the sitemap or to attach ``changefreq`` /
``priority`` values.
"""

import os
import re
from argparse import ArgumentParser
from collections import namedtuple
from typing import Iterator
from xml.dom import minidom

DEF_SITEMAP_FILENAME = "sitemap.xml"
DEF_WEB_ROOT = "https://lets-plot.org"

# A rule: ``regex`` is a compiled pattern matched against the site-relative
# path; ``function`` takes a SitemapURL and returns the (updated) SitemapURL.
Transformation = namedtuple('Transformation', ['regex', 'function'])


class SitemapURL:
    """A single sitemap entry built from a site-relative HTML path."""

    # Class-level defaults; the transformation functions override them
    # per instance.
    to_remove = False
    changefreq = None
    priority = None

    def __init__(self, path: str, web_root: str):
        """Build the entry.

        :param path: Site-relative path; one leading "/" is dropped if present.
        :param web_root: Site address. Trailing slashes are ignored so that
            "https://host/" and "https://host" produce the same URL.
        """
        self._path = path[1:] if path.startswith("/") else path
        # Strip trailing slashes to avoid "https://host//page.html".
        self.url = "{0}/{1}".format(web_root.rstrip("/"), self._path)

    def apply(self, transformation):
        """Return the result of ``transformation.function`` if its regex
        matches this entry's path; otherwise return this entry unchanged."""
        if self._check(transformation):
            return transformation.function(self)
        return self

    def append_to_xml(self, xml_doc, urlset):
        """Append a ``<url>`` element for this entry to ``urlset``.

        Nothing is appended when the entry is marked ``to_remove``.
        ``<changefreq>`` and ``<priority>`` children are written only when
        the corresponding attribute is not None.
        """
        if self.to_remove:
            return

        url_node = xml_doc.createElement('url')

        loc_node = xml_doc.createElement('loc')
        loc_node.appendChild(xml_doc.createTextNode(self.url))
        url_node.appendChild(loc_node)

        if self.changefreq is not None:
            changefreq_node = xml_doc.createElement('changefreq')
            changefreq_node.appendChild(xml_doc.createTextNode(self.changefreq))
            url_node.appendChild(changefreq_node)

        if self.priority is not None:
            priority_node = xml_doc.createElement('priority')
            priority_node.appendChild(xml_doc.createTextNode(str(self.priority)))
            url_node.appendChild(priority_node)

        urlset.appendChild(url_node)

    def _check(self, transformation):
        """Tell whether the transformation's regex matches at the start of the path."""
        return transformation.regex.match(self._path) is not None

    def __str__(self):
        return '<SitemapURL url="{0}"{1}{2}{3}/>'.format(
            self.url,
            '' if self.changefreq is None else ' changefreq="{0}"'.format(self.changefreq),
            '' if self.priority is None else ' priority="{0}"'.format(self.priority),
            '' if not self.to_remove else ' remove="true"',
        )


def _set_to_remove(to_remove: bool = True):
    """Return a function that sets ``to_remove`` on a SitemapURL."""
    def update(sitemap_url: SitemapURL) -> SitemapURL:
        sitemap_url.to_remove = to_remove
        return sitemap_url

    return update


def _set_changefreq(changefreq: str):
    """Return a function that sets ``changefreq`` on a SitemapURL.

    The value is checked against the list of accepted names when the
    function is created, not when it is applied.
    """
    valid_changefreq = ['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never']
    assert changefreq in valid_changefreq, "Unknown value of changefreq: '{0}'".format(changefreq)

    def update(sitemap_url: SitemapURL) -> SitemapURL:
        sitemap_url.changefreq = changefreq
        return sitemap_url

    return update


def _set_priority(priority: float):
    """Return a function that sets ``priority`` on a SitemapURL.

    The value must lie in [0.0, 1.0]; it is checked when the function is
    created and stored as a float.
    """
    assert priority >= 0.0, "Bad value of priority: {0} < 0.0".format(priority)
    assert priority <= 1.0, "Bad value of priority: {0} > 1.0".format(priority)

    def update(sitemap_url: SitemapURL) -> SitemapURL:
        sitemap_url.priority = float(priority)
        return sitemap_url

    return update


# Rules are applied to every entry in the order listed.
transformations = [
    Transformation(re.compile(r"^404\.html$"), _set_to_remove()),
    Transformation(re.compile(r"^_static\/.*"), _set_to_remove()),
    Transformation(re.compile(r"^genindex\.html$"), _set_to_remove()),
    Transformation(re.compile(r"^search\.html$"), _set_to_remove()),
    Transformation(re.compile(r"^pages\/"), _set_to_remove()),
    Transformation(re.compile(r"^python\/shared"), _set_to_remove()),
    Transformation(re.compile(r"^python\/pages\/include"), _set_to_remove()),
    Transformation(re.compile(r"^kotlin\/api-reference\/navigation\.html$"), _set_to_remove()),
    Transformation(re.compile(r"^kotlin\/-lets--plot--kotlin\/.*"), _set_to_remove()),
    Transformation(re.compile(r"^(?!.*index\.html$)kotlin\/api-reference\/-lets--plot--kotlin.*$"), _set_to_remove()),
    Transformation(re.compile(r"^python\/pages\/api\.html$"), _set_changefreq('monthly')),
    Transformation(re.compile(r"^python\/pages\/whats_new\.html$"), _set_changefreq('monthly')),
    Transformation(re.compile(r"^kotlin\/api-reference\/index\.html$"), _set_changefreq('monthly')),
    Transformation(re.compile(r"^python\/pages\/api\/.*"), _set_priority(.8)),
    Transformation(re.compile(r"^kotlin\/api-reference\/-lets--plot--kotlin\/.*\/index\.html$"), _set_priority(.8)),
]


def _get_all_html_filenames(html_dir: str) -> Iterator[str]:
    """Yield the path of every ``*.html`` file under ``html_dir``.

    Paths are relative to ``html_dir`` and use "/" as the separator. Files
    directly in ``html_dir`` are yielded as bare names; files in
    sub-directories are yielded with a leading "/" (SitemapURL drops it).
    """
    for root, _dirs, files in os.walk(html_dir):
        # relpath removes only the html_dir prefix. A plain str.replace would
        # also delete later occurrences of html_dir inside the path
        # (e.g. "site/docs/site" with html_dir="site", or any "." when
        # html_dir=".").
        rel_dir = os.path.relpath(root, html_dir)
        for filename in files:
            if not filename.endswith(".html"):
                continue
            if rel_dir == os.curdir:
                rel_path = filename
            else:
                # Keep the leading "/" for nested entries so the sorted
                # order of the generated sitemap stays stable.
                rel_path = "/" + os.path.join(rel_dir, filename)
            yield rel_path.replace("\\", "/")


def generate_sitemap(html_dir: str, sitemap_filename: str = DEF_SITEMAP_FILENAME, *,
                     web_root: str = DEF_WEB_ROOT):
    """Write a sitemap for the HTML pages found under ``html_dir``.

    :param html_dir: Directory that is walked recursively for ``*.html`` files.
    :param sitemap_filename: Path of the XML file to write (overwritten).
    :param web_root: Site address prepended to every page path.
    """
    xml_doc = minidom.Document()
    urlset = xml_doc.createElement("urlset")
    urlset.setAttribute('xmlns', "http://www.sitemaps.org/schemas/sitemap/0.9")
    xml_doc.appendChild(urlset)

    # os.walk order is not guaranteed; sort for a deterministic file.
    for html_filename in sorted(_get_all_html_filenames(html_dir)):
        sitemap_url = SitemapURL(html_filename, web_root)
        for transformation in transformations:
            sitemap_url = sitemap_url.apply(transformation)
        sitemap_url.append_to_xml(xml_doc, urlset)

    # toprettyxml returns bytes when an encoding is given, hence binary mode.
    with open(sitemap_filename, "wb") as f:
        f.write(xml_doc.toprettyxml(indent=" ", encoding='utf-8'))


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', required=True, metavar='INPUT',
                        help="Path to the directory with html pages.")
    parser.add_argument('-f', '--filename', required=True, metavar='FILE_NAME',
                        help="Path to the sitemap xml file.")
    parser.add_argument('-w', '--web_root', default=DEF_WEB_ROOT, metavar='WEB_ROOT',
                        help="Site address.")
    args = parser.parse_args()
    generate_sitemap(html_dir=args.input, sitemap_filename=args.filename, web_root=args.web_root)