private int looks_ascii()

in src/encoding.c [46:182]


/*
 * Forward declarations for the file-local helpers used by file_encoding().
 * The looks_*() detectors all share one parameter list: the bytes to
 * examine and their count, followed by the output array and the
 * output-length pointer that file_encoding() passes through to them.
 */
private int looks_ascii(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_utf7(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_ucs16(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_ucs32(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_latin1(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_extended(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
/* Fills `out` from `buf`; file_encoding() re-runs detection on the result. */
private void from_ebcdic(const unsigned char *buf, size_t nbytes,
    unsigned char *out);

/*
 * Debug tracing.  Call as DPRINTF((fmt, args...)) -- note the doubled
 * parentheses: the whole argument list is handed to printf when
 * DEBUG_ENCODING is defined, and the macro expands to nothing otherwise.
 */
#if !defined(DEBUG_ENCODING)
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif

/*
 * Try to determine whether text is in some character code we can
 * identify.  Each of these tests, if it succeeds, will leave
 * the text converted into one-file_unichar_t-per-character Unicode in
 * ubuf, and the number of characters converted in ulen.
 *
 * At most ms->encoding_max bytes of the buffer are examined.  The tests
 * are tried in this order, and the first match wins: ASCII (refined to
 * UTF-7), UTF-8 with BOM, UTF-8, UTF-32, UTF-16, ISO-8859, non-ISO
 * extended ASCII, and finally EBCDIC / international EBCDIC after
 * running the bytes through from_ebcdic().
 *
 * ubuf and ulen may be NULL, in which case private scratch storage is
 * used and released before returning.  Otherwise *ubuf receives a
 * calloc()ed array (NULL if that allocation failed) which this function
 * does not free -- presumably the caller owns it; confirm at call sites.
 *
 * On return *code holds a descriptive encoding name, *code_mime the
 * matching charset token, and *type either "text" or "binary".
 *
 * Returns 0 when the data matches none of the encodings above, and 1
 * otherwise.  NOTE: an allocation failure is reported via file_oomem()
 * but still returns 1, with *code "unknown" and *code_mime "binary".
 */
protected int
file_encoding(struct magic_set *ms, const struct buffer *b,
    file_unichar_t **ubuf, size_t *ulen, const char **code,
    const char **code_mime, const char **type)
{
	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
	size_t nbytes = b->flen;
	size_t mlen;
	int rv = 1, ucs_type;
	/* NULL so the free() at `done' is safe on every path. */
	file_unichar_t *udefbuf = NULL;
	size_t udeflen;

	if (ubuf == NULL)
		ubuf = &udefbuf;
	if (ulen == NULL)
		ulen = &udeflen;

	/* Defaults reported when nothing matches or allocation fails. */
	*type = "text";
	*ulen = 0;
	*code = "unknown";
	*code_mime = "binary";

	if (nbytes > ms->encoding_max)
		nbytes = ms->encoding_max;

	/*
	 * Pass count and element size separately so calloc() itself
	 * rejects a product that would wrap; mlen is kept only for the
	 * out-of-memory report.
	 */
	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
	*ubuf = CAST(file_unichar_t *,
	    calloc(nbytes + 1, sizeof((*ubuf)[0])));
	if (*ubuf == NULL) {
		file_oomem(ms, mlen);
		goto done;
	}

	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
		/* UTF-7 is a subset of ASCII bytes, so refine the match. */
		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "Unicode text, UTF-7";
			*code_mime = "utf-7";
		} else {
			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "ASCII";
			*code_mime = "us-ascii";
		}
	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Unicode text, UTF-8 (with BOM)";
		*code_mime = "utf-8";
	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Unicode text, UTF-8";
		*code_mime = "utf-8";
	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
		/* 1 selects little-endian; any other non-zero, big-endian. */
		if (ucs_type == 1) {
			*code = "Unicode text, UTF-32, little-endian";
			*code_mime = "utf-32le";
		} else {
			*code = "Unicode text, UTF-32, big-endian";
			*code_mime = "utf-32be";
		}
		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
		/* Same convention as the UTF-32 case above. */
		if (ucs_type == 1) {
			*code = "Unicode text, UTF-16, little-endian";
			*code_mime = "utf-16le";
		} else {
			*code = "Unicode text, UTF-16, big-endian";
			*code_mime = "utf-16be";
		}
		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "ISO-8859";
		*code_mime = "iso-8859-1";
	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Non-ISO extended-ASCII";
		*code_mime = "unknown-8bit";
	} else {
		/*
		 * Only the EBCDIC fallback needs a translated copy of the
		 * input, so the scratch buffer is allocated here rather
		 * than up front for every call.
		 */
		unsigned char *nbuf;

		mlen = (nbytes + 1) * sizeof(nbuf[0]);
		nbuf = CAST(unsigned char *,
		    calloc(nbytes + 1, sizeof(nbuf[0])));
		if (nbuf == NULL) {
			file_oomem(ms, mlen);
			/* Discard any count left by the failed tests. */
			*ulen = 0;
			goto done;
		}

		from_ebcdic(buf, nbytes, nbuf);

		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "EBCDIC";
			*code_mime = "ebcdic";
		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
			    *ulen));
			*code = "International EBCDIC";
			*code_mime = "ebcdic";
		} else { /* Doesn't look like text at all */
			DPRINTF(("binary\n"));
			rv = 0;
			*type = "binary";
		}
		free(nbuf);
	}

 done:
	/* Release the conversion buffer only when it is our own scratch. */
	if (ubuf == &udefbuf)
		free(udefbuf);

	return rv;
}