in obelics/processors/html_extractor.py [0:0]
def get_html_from_warc(self, warc):
    """Extract the HTML document held in a WARC payload.

    The first record whose type is ``"response"`` is read. Its charset is
    taken from the ``WARC-Identified-Content-Charset`` record header when
    present, otherwise the first candidate proposed by ``EncodingDetector``
    is used. The page is then parsed with BeautifulSoup (``html.parser``)
    and serialized back to a string.

    This method does not read any state from ``self``.

    Args:
        warc: The raw WARC content. It is wrapped in ``io.BytesIO``, so it
            is expected to be a bytes-like object.

    Returns:
        A ``(html_str, error)`` tuple. On success ``html_str`` is the
        serialized HTML and ``error`` is ``""``. On any failure
        ``html_str`` is ``""`` and ``error`` is a message describing the
        problem; failures are reported this way rather than raised.
    """
    page, encoding = None, None

    # The stream is created inside the ``try`` so that an unusable ``warc``
    # value (e.g. a ``str``) is reported through the error string like every
    # other failure, instead of escaping as an exception.
    try:
        with io.BytesIO(warc) as stream:
            for record in WARCIterator(stream):
                if record.rec_type == "response":
                    page = record.content_stream().read()
                    encoding = record.rec_headers["WARC-Identified-Content-Charset"]
                    break
    except Exception as e:
        return "", str(e)

    # Without a page there is nothing to detect or parse. Bail out before
    # encoding detection so the caller gets the intended message rather than
    # an incidental error raised by feeding ``None`` to the detector.
    if not page:
        return "", "Not page or encoding"

    if not encoding:
        try:
            # take the first detected encoding
            candidates = EncodingDetector(page, is_html=True).encodings
            encoding = next(iter(candidates), None)
        except Exception as e:
            return "", str(e)

    if not encoding:
        return "", "Not page or encoding"

    try:
        soup = BeautifulSoup(page, "html.parser", from_encoding=encoding)
    except Exception as e:
        return "", str(e)

    try:
        html_str = soup.decode_contents(formatter="html")
    except Exception as e:
        return "", str(e)

    # Make sure the result can be encoded with the default codec (UTF-8);
    # presumably this rejects strings containing characters such as lone
    # surrogates that would break later serialization -- TODO confirm.
    try:
        html_str.encode()
    except Exception as e:
        return "", str(e)

    return html_str, ""