in src/encoding.c [46:182]
/*
 * Character-set detectors used by file_encoding().  Each one is called
 * with the raw input bytes and their count, an output array for the
 * decoded characters, and a pointer that receives how many characters
 * were produced.  file_encoding() treats a zero return as "no match".
 */
private int looks_ascii(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
/* file_encoding() accepts these two only when the result is > 0. */
private int looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_utf7(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
/*
 * file_encoding() reads a result of 1 from these two as little-endian
 * and any other nonzero result as big-endian.
 */
private int looks_ucs16(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_ucs32(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_latin1(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
private int looks_extended(const unsigned char *buf, size_t nbytes,
    file_unichar_t *ubuf, size_t *ulen);
/*
 * file_encoding() runs the input through this into a scratch buffer and
 * then re-tests the result with looks_ascii()/looks_latin1() to
 * recognise EBCDIC text.
 */
private void from_ebcdic(const unsigned char *buf, size_t nbytes,
    unsigned char *out);

/*
 * Debug tracing.  The argument is a complete, parenthesised printf()
 * argument list, e.g. DPRINTF(("n=%d\n", n)); the call expands to
 * nothing unless DEBUG_ENCODING is defined.
 */
#ifndef DEBUG_ENCODING
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
/*
 * Try to determine whether text is in some character code we can
 * identify.  Each of these tests, if it succeeds, will leave
 * the text converted into one-file_unichar_t-per-character Unicode in
 * ubuf, and the number of characters converted in ulen.
 *
 * At most ms->encoding_max bytes of the buffer are examined.
 *
 * ubuf and ulen may each be NULL when the caller is not interested in
 * the converted text; a private buffer is then used and released before
 * returning.  When ubuf is supplied, *ubuf receives a freshly allocated
 * array that this function does not free, so the caller owns it (it is
 * NULL if that allocation failed).
 *
 * On return *code holds a human readable name, *code_mime the MIME
 * charset and *type either "text" or "binary".
 *
 * Returns 0 when the data does not look like text in any supported
 * encoding, 1 otherwise.  An allocation failure is reported through
 * file_oomem() and also returns 1, with *ulen == 0, *code "unknown" and
 * *code_mime "binary".
 */
protected int
file_encoding(struct magic_set *ms, const struct buffer *b,
    file_unichar_t **ubuf, size_t *ulen, const char **code,
    const char **code_mime, const char **type)
{
	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
	size_t nbytes = b->flen;
	size_t mlen;
	int rv = 1, ucs_type;
	unsigned char *nbuf = NULL;
	file_unichar_t *udefbuf;
	size_t udeflen;

	/* Fall back to private storage for outputs the caller declined. */
	if (ubuf == NULL)
		ubuf = &udefbuf;
	if (ulen == NULL)
		ulen = &udeflen;

	*type = "text";
	*ulen = 0;
	*code = "unknown";
	*code_mime = "binary";

	if (nbytes > ms->encoding_max)
		nbytes = ms->encoding_max;

	/*
	 * Hand the element count and element size to calloc() separately
	 * so that calloc() itself rejects a product that does not fit in
	 * size_t, instead of us silently under-allocating.  mlen is only
	 * used for the diagnostic.
	 */
	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
	*ubuf = CAST(file_unichar_t *,
	    calloc(nbytes + 1, sizeof((*ubuf)[0])));
	if (*ubuf == NULL) {
		file_oomem(ms, mlen);
		goto done;
	}

	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
		/* UTF-7 is pure ASCII on the wire, so check it first. */
		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "Unicode text, UTF-7";
			*code_mime = "utf-7";
		} else {
			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "ASCII";
			*code_mime = "us-ascii";
		}
	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Unicode text, UTF-8 (with BOM)";
		*code_mime = "utf-8";
	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Unicode text, UTF-8";
		*code_mime = "utf-8";
	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
		if (ucs_type == 1) {
			*code = "Unicode text, UTF-32, little-endian";
			*code_mime = "utf-32le";
		} else {
			*code = "Unicode text, UTF-32, big-endian";
			*code_mime = "utf-32be";
		}
		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
		if (ucs_type == 1) {
			*code = "Unicode text, UTF-16, little-endian";
			*code_mime = "utf-16le";
		} else {
			*code = "Unicode text, UTF-16, big-endian";
			*code_mime = "utf-16be";
		}
		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "ISO-8859";
		*code_mime = "iso-8859-1";
	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
		*code = "Non-ISO extended-ASCII";
		*code_mime = "unknown-8bit";
	} else {
		/*
		 * Last resort: EBCDIC.  The scratch buffer for the
		 * translated bytes is only needed on this path, so it is
		 * allocated here rather than on every call.
		 */
		mlen = (nbytes + 1) * sizeof(nbuf[0]);
		nbuf = CAST(unsigned char *,
		    calloc(nbytes + 1, sizeof(nbuf[0])));
		if (nbuf == NULL) {
			file_oomem(ms, mlen);
			/* Discard counts left behind by the failed tests. */
			*ulen = 0;
			goto done;
		}
		from_ebcdic(buf, nbytes, nbuf);

		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
			*code = "EBCDIC";
			*code_mime = "ebcdic";
		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
			    *ulen));
			*code = "International EBCDIC";
			*code_mime = "ebcdic";
		} else { /* Doesn't look like text at all */
			DPRINTF(("binary\n"));
			rv = 0;
			*type = "binary";
		}
	}

 done:
	free(nbuf);
	/* Only release the conversion buffer when it is our private one. */
	if (ubuf == &udefbuf)
		free(udefbuf);

	return rv;
}