dev-tools/scripts/refguide/refguide-add-canonical-url.py (135 lines of code) (raw):
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script processes all static html files for Solr's reference guide
and adds canonical URLs for old pages (Solr 6 - Solr 8, Solr 9+ should not
be affected). Since Google doesn't always respect the canonical URL
directive, the meta tag for robots "noindex" is also added to ensure these
outdated pages do not show up on Google search results.
This script uses the same logic as the htaccess generation script to
determine which pages are the "last" versions of that page, so that it can
be indexed by Google as the most recent information.
"""
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import argparse
# Robots meta tag written into outdated pages so that search engines stop
# indexing them. Note that the literal carries no trailing newline.
robots_no_index_html = "<meta name=\"robots\" content=\"noindex\">"
def lines_from_file(filename):
    """Return the meaningful entries of a page-list or mapping file.

    Lines that start with ``#`` (in the very first column) and lines that are
    empty or whitespace-only are dropped. In every remaining line each
    ``.adoc`` is replaced by ``.html`` and surrounding whitespace is removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        list of str: The cleaned lines, in file order.
    """
    with open(filename, 'r') as handle:
        raw_entries = handle.readlines()
    return [
        entry.replace(".adoc", ".html").strip()
        for entry in raw_entries
        # The comment test deliberately runs on the unstripped line.
        if not entry.startswith("#") and entry.strip()
    ]
def generate_canonical_mapping(conf):
    """Build the mapping from old ref-guide page names to canonical URLs.

    Every old page name is resolved with the first rule that applies:

    1. The page still exists under the same name in the new guide: it maps to
       ``https://solr.apache.org/guide/solr/latest/<section>/<page>.html``.
    2. The page has an entry in the mapping file, whose target is either
       * an ``https://`` URL, used verbatim,
       * a page name of the new guide (an optional ``#anchor`` is kept),
       * a path starting with ``/guide/``, whose remainder is appended to the
         ``latest`` base URL, or
       * the marker ``_8_11``, meaning the page was removed and stays
         canonical to its 8.11 version.
    3. Otherwise the page gets no mapping; the reason is printed at the end.

    Args:
        conf: Object with ``old``, ``new`` and ``mapping`` attributes, each the
            path of a text file read through ``lines_from_file``. ``new``
            lists pages as ``<section>/<page>``; ``mapping`` holds
            ``<old page>;<target>`` pairs.

    Returns:
        dict: Old HTML file name (e.g. ``index.html``) -> absolute canonical
        URL. ``index.html`` is always present.

    Raises:
        ValueError: If a line of the ``new`` file does not contain exactly one
            ``/`` or a line of the ``mapping`` file does not contain exactly
            one ``;``.
    """
    latest_base = "https://solr.apache.org/guide/solr/latest"

    print("Reading config")
    old = lines_from_file(conf.old)

    # Index the new guide's pages ("<section>/<page>.html") by bare page name.
    new = {}
    for line in lines_from_file(conf.new):
        (_section, file) = line.split("/")
        new[file] = line

    # Explicit renames/removals: "<old page>;<target>".
    name_map = {}
    for line in lines_from_file(conf.mapping):
        (frm, to) = line.split(";")
        name_map[frm] = to

    # Files in src/old-pages as of 2022-02-04
    old_pages = {"configuration-apis.html", "configuration-guide.html", "controlling-results.html", "deployment-guide.html", "enhancing-queries.html", "field-types.html", "fields-and-schema-design.html", "getting-started.html", "indexing-data-operations.html", "installation-deployment.html", "monitoring-solr.html", "query-guide.html", "scaling-solr.html", "schema-indexing-guide.html", "solr-concepts.html", "solr-schema.html", "solrcloud-clusters.html", "user-managed-clusters.html"}

    result = {}      # renamed pages: old name -> URL or path relative to latest_base
    old_guide = []   # pages removed in 9.0 (name without ".html")
    failed = {}      # pages without a mapping: old name -> reason
    regex_new = {}   # unchanged pages grouped by section: section -> [page names]

    print("Converting...")
    for frm in old:
        if frm in new:
            (subpath, name) = new[frm].split("/")
            regex_new.setdefault(subpath, []).append(name.split(".html")[0])
        elif frm in name_map:
            new_name = name_map[frm]
            # Split off an optional "#anchor" so the bare page name can be looked up.
            anchor_index = new_name.find("#")
            if anchor_index > 0:
                new_name_without_anchor = new_name[:anchor_index]
                anchor = new_name[anchor_index:]
            else:
                new_name_without_anchor = new_name
                anchor = ""
            if new_name_without_anchor.startswith("https://"):
                result[frm] = new_name
            elif new_name_without_anchor in new:
                result[frm] = new[new_name_without_anchor] + anchor
            elif new_name_without_anchor.startswith("/guide/"):
                # Drop the leading "/guide/" (7 characters).
                result[frm] = new_name[7:]
            elif new_name_without_anchor == "_8_11":
                old_guide.append(frm.split(".html")[0])
            else:
                failed[frm] = f"Mapped value {new_name_without_anchor} not in new guide"
        elif frm in old_pages:
            failed[frm] = "Not yet mapped (in src/old-pages)"
        else:
            failed[frm] = "404"

    mappings = {
        "index.html": "https://solr.apache.org/guide/solr/latest/index.html",
    }

    # Add direct mappings from old to new files
    for key, files in regex_new.items():
        for file in files:
            mappings[file + ".html"] = f"{latest_base}/{key}/{file}.html"

    # Add mappings for renamed files
    for key, target in result.items():
        if target.startswith("https://"):
            mappings[key] = target
        else:
            mappings[key] = f"{latest_base}/{target}"

    # Add mappings for files removed in 9.0, they will be canonical to 8.11
    for file in old_guide:
        mappings[file + ".html"] = f"https://solr.apache.org/guide/8_11/{file}.html"

    for (key, value) in mappings.items():
        print(key, value)

    # Report the pages that could not be mapped instead of dropping them silently.
    if failed:
        print(f"\nNo canonical mapping for {len(failed)} old page(s):")
        for (key, reason) in failed.items():
            print(f"  {key}: {reason}")

    return mappings
def extract_filename_from_path(html_file_path):
    """Return the final component (file name) of a path.

    Uses ``os.path.basename`` so that the separator of the running platform
    is honoured and a bare file name without any directory part is returned
    as-is.

    Args:
        html_file_path: Path to a file.

    Returns:
        str or None: The file name, or ``None`` when the path is empty or
        ends with a path separator.
    """
    return os.path.basename(html_file_path) or None
def process_html_file(html_file_path, url, mappings):
    """Add a canonical link and a robots "noindex" tag to an outdated page.

    The canonical URL is looked up in ``mappings`` by the file's name. When
    there is no entry, or the entry equals ``url``, the page is its own
    canonical version and the file is left untouched. Otherwise the canonical
    ``<link>`` and the robots meta tag are inserted on their own lines right
    after the first line containing a ``<title>`` tag, and the file is
    rewritten in place. Canonical links and robots tags found after the
    title (left over from an earlier run) are removed, so the script can be
    re-run without piling up duplicates.

    Args:
        html_file_path: Path of the HTML file to update in place.
        url: Public URL under which this very file is served.
        mappings: dict mapping HTML file names to canonical URLs.
    """
    file_name = extract_filename_from_path(html_file_path)
    canonical_url = mappings.get(file_name, url) if file_name else url

    if canonical_url == url:
        print(f"Skipped {html_file_path}, filename {file_name}, it is the canonical url: {url}")
        return

    with open(html_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    canonical_link_html = f"<link rel=\"canonical\" href=\"{canonical_url}\">\n"
    robots_tag = robots_no_index_html.strip()
    robots_line = robots_tag + "\n"

    new_lines = []
    found_title = False
    for line in lines:
        if found_title and robots_tag in line:
            # Left over from an earlier run. Older runs wrote the tag without
            # a newline, gluing it to the following line, so only the tag
            # itself is cut out and the rest of the line is kept.
            line = line.replace(robots_tag, "")
            if not line.strip():
                continue

        title = canon_link = None
        lowered = line.lower()
        # Parsing is expensive; only lines that can hold one of the two tags
        # of interest are handed to the HTML parser.
        if "<title" in lowered or "<link" in lowered:
            soup = BeautifulSoup(line, "html.parser")
            title = soup.find("title")
            canon_link = soup.find("link", attrs={'rel': 'canonical'})

        if title and not found_title:
            new_lines.append(line)
            new_lines.append(canonical_link_html)
            new_lines.append(robots_line)
            found_title = True
        elif not (found_title and canon_link):
            # Skip any other canonical url we find
            new_lines.append(line)

    if not found_title:
        print(f"No <title> found in {html_file_path}, file left unchanged")
        return

    with open(html_file_path, "w", encoding="utf-8") as f:
        f.writelines(new_lines)
    print(f"Updated {html_file_path} to canonical url: {canonical_url}")
def main():
    """Command-line entry point.

    Builds the canonical-URL mapping from the ``--old``, ``--new`` and
    ``--mapping`` files and then processes every ``*.html`` file found
    directly inside each versioned ``N_M`` directory of ``--folder``.
    Without ``--folder`` only the mapping is generated and printed.
    """
    parser = argparse.ArgumentParser(description='Process HTML files to add Canonical URLs to old ref guide pages')
    parser.add_argument('--old', required=True, help='Old pagenames file, one .adoc filename per line')
    parser.add_argument('--new', required=True, help='New pagenames file, one .adoc filename per line')
    parser.add_argument('--mapping', required=True, help='Semicolon separated from-to file names (adoc)')
    parser.add_argument('--folder', help='Folder of svn checkout (https://svn.apache.org/repos/infra/sites/solr/guide/)')
    args = parser.parse_args()

    mappings = generate_canonical_mapping(args)

    base_dir = args.folder
    if base_dir is None:
        # os.listdir(None) would silently scan the current working directory
        # and os.path.join(None, ...) would then fail with a TypeError.
        print("No --folder given, no HTML files processed.")
        return

    # Versioned guide directories are named like "8_11"; plain files whose
    # names happen to match are ignored.
    folders = sorted(
        name for name in os.listdir(base_dir)
        if re.match(r'\d+_\d+', name) and os.path.isdir(os.path.join(base_dir, name))
    )
    if not folders:
        print(f"No versioned directories 'N_M' found in {base_dir}, exiting.")
        return

    for root_dir in folders:
        print(f"\nProcessing directory {root_dir}")
        print("=================================")
        full_path = os.path.join(base_dir, root_dir)

        # Process each HTML file in the directory
        for filename in os.listdir(full_path):
            if filename.endswith(".html"):
                html_file_path = os.path.join(full_path, filename)
                url = f"https://solr.apache.org/guide/{root_dir}/{filename}"
                process_html_file(html_file_path, url, mappings)
# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()