def get_encoding()

in readability/encoding.py [0:0]


def get_encoding(page):
    """Guess the character encoding of a raw (undecoded) HTML/XML page.

    Encodings declared inside the document are tried first, in the order
    charset attribute, pragma, XML declaration (the matches of RE_CHARSET,
    RE_PRAGMA and RE_XML). The first one that decodes the whole page
    without error wins. If none works, the markup is stripped and chardet
    is asked to guess from the remaining text.

    :param page: the page content as a byte string.
    :returns: an encoding name (passed through ``fix_charset``); "utf-8"
        when there is too little text to guess from or chardet gives no
        answer.
    """
    # Regex for XML and HTML Meta charset declaration
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        if not isinstance(declared_encoding, str):
            # Python 3: the regexes ran over bytes, so the match is bytes, but
            # .decode() below wants the codec name as `str`. Decode blindly
            # with ascii because no one should ever use non-ascii characters
            # in the name of an encoding.
            declared_encoding = declared_encoding.decode("ascii", "replace")

        encoding = fix_charset(declared_encoding)
        try:
            # Now let's decode the page
            page.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the declaration lied about the content.
            # LookupError: the declared name is not a codec Python knows
            # (typo, empty or made-up charset). Either way, try the next one.
            continue
        # It worked!
        return encoding

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet.
    # `page` is bytes, so the pattern and the replacement must be bytes too;
    # a str pattern raises TypeError on Python 3.
    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    if len(text) < 10:
        return "utf-8"  # can't guess

    res = chardet.detect(text)
    # chardet reports None when it cannot decide
    return fix_charset(res["encoding"] or "utf-8")