def clean_html()

in seed/util/preprocess.py [0:0]


from bs4 import BeautifulSoup


def clean_html(html):
    try:
        # Parse the HTML content
        soup = BeautifulSoup(html, 'html.parser')

        # Remove attributes that are known to cause conversion issues
        problematic_attributes = ['colspan', 'rowspan', 'width', 'height']
        for tag in soup.find_all():
            for attr in problematic_attributes:
                # Drop the attribute entirely if the tag carries it
                tag.attrs.pop(attr, None)

        # Using a simpler method to convert HTML to text, avoiding markdownify issues
        text = soup.get_text(separator=' ', strip=True)
        # Replace multiple spaces with a single space
        text = ' '.join(text.split())
    except Exception as e:
        print(f"Error processing HTML: {e}\nHTML segment causing issue: {html[:500]}")
        # Fall back to plain text extraction on the raw HTML, skipping the attribute cleanup
        text = BeautifulSoup(html, 'html.parser').get_text(separator=' ', strip=True)
        text = ' '.join(text.split())
    
    return text
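
A minimal usage sketch (the sample markup below is made up for illustration, not taken from the codebase):

sample = '<table><tr><td colspan="2" width="120">Hello</td><td>world</td></tr></table>'
print(clean_html(sample))  # prints: Hello world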