modify-deadlink.py (97 lines of code) (raw):
import os
import re
import shlex
import sys
from collections import namedtuple
# Record produced for one dead-link log line; 'sed_str' carries the suggested fix command.
FileInfo = namedtuple(
    'FileInfo',
    [
        'target_file',   # file that contains the broken link (line number stripped)
        'url_line',      # line-number text taken from the log entry, "" when absent
        'url_path',      # path of a matching file, relative to the current working directory
        'url_count',     # count of matching files found under the search directory
        'relative_url',  # replacement link relative to target_file's directory ("" unless unique)
        'log_error',     # the log line this record was derived from
        'origin_url',    # the broken link text quoted after "link" in the log line
        'sed_str',       # sed command rewriting the link ("" unless url_count is 1)
    ],
)
def find_file(file_str, search_dir, line_content):
    """Resolve one dead-link log line to a suggested replacement link.

    The log line must contain both ``in file '<path>[:<line>]'`` (the document
    holding the broken link) and ``link '<url>'`` (the broken link itself).
    The link's base name, with ``.md`` appended when missing, is searched for
    in every directory below ``search_dir``.

    Args:
        file_str: Log line parsed for the linking document and the link.
        search_dir: Root of the directory tree that is walked for candidates.
        line_content: Log line stored in the result, echoed in diagnostics and
            parsed for ``origin_url``.

    Returns:
        A list with a single ``FileInfo`` when at least one candidate file was
        found, otherwise an empty list (a diagnostic is printed to stdout).
        ``relative_url`` and ``sed_str`` are only filled in when exactly one
        candidate exists; with several candidates the fix is ambiguous and
        both stay empty.
    """
    source_match = re.search(r"in file '([^']+)'", file_str)
    if not source_match:
        print("No valid base file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return []

    # "docs/a/b.md:67" -> path "docs/a/b.md", line "67"; the line part is optional.
    parts = source_match.group(1).split(":")
    base_file_path = parts[0]
    line_number = parts[1] if len(parts) > 1 else ""

    link_match = re.search(r"link '([^']+)'", file_str)
    if not link_match:
        print("No valid file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return []

    # Only the base name of the link is used for the lookup.
    # NOTE: a link carrying a "#fragment" is looked up verbatim, fragment included.
    target_filename = os.path.basename(link_match.group(1))
    if not target_filename.endswith(".md"):
        target_filename = f"{target_filename}.md"

    found_files = [
        os.path.join(root, target_filename)
        for root, _dirs, files in os.walk(search_dir)
        if target_filename in files
    ]
    if not found_files:
        print(f"[ERR] No file named {target_filename} found in {search_dir}.")
        print(f"[ERR] Error log: {line_content}")
        print("-" * 80)
        return []

    url_count = len(found_files)
    # With several candidates only the last one walked is reported as url_path.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    origin_url_match = re.search(r"link '([^']+)'", line_content)
    origin_url = origin_url_match.group(1) if origin_url_match else ""

    relative_url = ""
    sed_str = ""
    if url_count == 1:
        relative_url = os.path.relpath(found_files[0], os.path.dirname(base_file_path))
        # Markdown links always use forward slashes, whatever the OS separator is.
        relative_url = relative_url.replace(os.sep, "/")
        if not relative_url.startswith("../"):
            relative_url = "./" + relative_url
        if relative_url.endswith(".md"):
            relative_url = relative_url[:-3]

        # Quote both shell words so paths containing spaces or shell
        # metacharacters still yield a runnable command. For ordinary paths
        # the output is unchanged.
        sed_expr = f"{line_number}s|({origin_url})|({relative_url})|"
        sed_str = f"sed -i {shlex.quote(sed_expr)} {shlex.quote(base_file_path)}"

    return [
        FileInfo(
            target_file=base_file_path,
            url_line=line_number,
            url_path=url_path,
            url_count=url_count,
            relative_url=relative_url,
            log_error=line_content,
            origin_url=origin_url,
            sed_str=sed_str,
        )
    ]
# Read a dead-link log file and run find_file on each of its lines
def get_deadlink(file_path, search_dir):
    """Collect the find_file results for every line of a log file.

    Args:
        file_path: Path of the log file; one log entry per line.
        search_dir: Directory tree handed to find_file for the lookup.

    Returns:
        The concatenated find_file results of all lines, in file order. An
        empty list is returned (after printing a message) when ``file_path``
        is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a valid file.")
        return []

    collected = []
    with open(file_path, 'r') as log_file:
        for raw_line in log_file:
            # Surrounding whitespace and the trailing newline are dropped; the
            # stripped text serves both as the string to parse and as the
            # recorded log line. Blank lines are still forwarded to find_file.
            entry = raw_line.strip()
            collected += find_file(entry, search_dir, entry)
    return collected
# Dump every collected record to stdout, one "[LOG] <field> >> <value>" line per field
def print_results(results):
    """Print each result record field by field, followed by a separator line.

    Args:
        results: Iterable of records exposing the attributes listed in
            ``field_order`` below (e.g. ``FileInfo`` tuples).
    """
    field_order = (
        "target_file",
        "url_line",
        "url_path",
        "url_count",
        "relative_url",
        "log_error",
        "origin_url",
        "sed_str",
    )
    for record in results:
        for field in field_order:
            print(f"[LOG] {field} >> {getattr(record, field)}")
        print("----------------------------------------------------------------")
def main(argv=None):
    """Command-line entry point: resolve the dead links listed in a log file.

    Args:
        argv: Full argument vector including the script name; defaults to
            ``sys.argv``. Exactly two arguments are expected after the script
            name: the log file and the directory to search.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        # Report the name the script was actually invoked as; the previous
        # message hard-coded a file name that does not match this script.
        script_name = os.path.basename(args[0]) if args else "modify-deadlink.py"
        print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>")
        sys.exit(1)

    file_with_logs = args[1]  # Log file with one dead-link entry per line
    search_dir = args[2]  # Directory tree searched for the link targets

    # Process the log file, then print the collected records
    results = get_deadlink(file_with_logs, search_dir)
    print_results(results)


if __name__ == "__main__":
    main()