in seed/util/preprocess.py [0:0]
def clean_html(html) -> str:
    """Extract whitespace-normalised plain text from an HTML fragment.

    The markup is parsed with BeautifulSoup's ``html.parser``, a few layout
    attributes are removed from every tag, and the text is extracted with a
    single space between nodes. If any step of that primary path raises, the
    error is printed together with the first 500 characters of the input and
    a plain re-parse plus text extraction is attempted instead.

    Args:
        html: HTML markup to clean. ``None`` or an empty value yields ``""``
            without invoking the parser.

    Returns:
        The text content with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.

    Raises:
        Exception: Only the primary attempt is guarded; anything raised by
            the fallback extraction propagates to the caller.
    """
    # Nothing to parse. Returning early also keeps ``None`` away from the
    # error handler below, which slices ``html`` and would raise a second,
    # unrelated error for a non-sliceable value.
    if not html:
        return ''

    # Attributes known to cause conversion issues. Built once here instead of
    # being re-created for every tag in the document.
    problematic_attributes = ('colspan', 'rowspan', 'width', 'height')

    try:
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all():
            for attr in problematic_attributes:
                if attr in tag.attrs:
                    del tag[attr]  # Removing the attribute altogether
        # Plain text extraction is used instead of markdownify to avoid its
        # conversion issues.
        text = soup.get_text(separator=' ', strip=True)
    except Exception as e:
        print(f"Error processing HTML: {e}\nHTML segment causing issue: {html[:500]}")
        # Best-effort fallback: re-parse and extract raw text, skipping the
        # attribute clean-up step.
        text = BeautifulSoup(html, 'html.parser').get_text(separator=' ', strip=True)

    # Collapse every run of whitespace (including newlines) to a single space.
    return ' '.join(text.split())