def get_html_from_warc()

in obelics/processors/html_extractor.py [0:0]


    def get_html_from_warc(self, warc):
        """Extract the HTML of the first "response" record in a WARC payload.

        The payload is parsed with BeautifulSoup using either the charset
        stored in the record's ``WARC-Identified-Content-Charset`` header or,
        when that is empty, the first candidate proposed by
        ``EncodingDetector``.

        Args:
            warc: Raw WARC content, in a form accepted by ``io.BytesIO``.

        Returns:
            A ``(html_str, error)`` tuple. On success ``html_str`` is the
            document as re-serialized by BeautifulSoup and ``error`` is
            ``""``. On failure ``html_str`` is ``""`` and ``error`` holds a
            message describing what went wrong.
        """
        page, encoding = None, None

        with io.BytesIO(warc) as stream:
            try:
                for record in WARCIterator(stream):
                    if record.rec_type == "response":
                        page = record.content_stream().read()
                        encoding = record.rec_headers["WARC-Identified-Content-Charset"]
                        break
            except Exception as e:
                # Best-effort contract: parsing problems are reported through
                # the error string instead of being raised to the caller.
                return "", str(e)

        # Bail out before sniffing the encoding: with no payload there is
        # nothing to detect, and handing ``None`` to the detector would only
        # surface an unrelated internal error message to the caller.
        if not page:
            return "", "Not page or encoding"

        if not encoding:
            try:
                # Take the first detected encoding, if there is any.
                candidates = EncodingDetector(page, is_html=True).encodings
                encoding = next(iter(candidates), None)
            except Exception as e:
                return "", str(e)

        if not encoding:
            return "", "Not page or encoding"

        try:
            soup = BeautifulSoup(page, "html.parser", from_encoding=encoding)
            html_str = soup.decode_contents(formatter="html")
            # The encoded bytes are discarded on purpose: this only checks
            # that the text can be encoded as UTF-8, so that strings which
            # cannot (e.g. ones holding lone surrogates) are rejected here.
            html_str.encode()
        except Exception as e:
            return "", str(e)

        return html_str, ""