1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.rat.document.impl.guesser;
20
21 import org.apache.commons.io.IOUtils;
22 import org.apache.rat.api.Document;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.nio.ByteBuffer;
28 import java.nio.CharBuffer;
29 import java.nio.charset.Charset;
30 import java.nio.charset.CharsetDecoder;
31 import java.nio.charset.CoderResult;
32 import java.nio.charset.CodingErrorAction;
33 import java.nio.charset.UnsupportedCharsetException;
34 import java.util.Locale;
35
36
37
38
39 public class BinaryGuesser {
40
41 private static final String DOT = ".";
42
43 static final String FILE_ENCODING = "file.encoding";
44 private static Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();
45
46 private static boolean isBinaryDocument(Document document) {
47 boolean result = false;
48 InputStream stream = null;
49 try {
50 stream = document.inputStream();
51 result = isBinary(stream);
52 } catch (IOException e) {
53 result = false;
54 } finally {
55 IOUtils.closeQuietly(stream);
56 }
57 return result;
58 }
59
60 private static boolean isBinary(CharSequence taste) {
61 int highBytes = 0;
62 final int length = taste.length();
63 for (int i = 0; i < length; i++) {
64 char c = taste.charAt(i);
65 if (c > BinaryGuesser.NON_ASCII_THREASHOLD
66 || c <= BinaryGuesser.ASCII_CHAR_THREASHOLD) {
67 highBytes++;
68 }
69 }
70 return highBytes * BinaryGuesser.HIGH_BYTES_RATIO
71 > length * BinaryGuesser.TOTAL_READ_RATIO;
72 }
73
74
75
76
77
78
79
80
81
82
83 public static boolean isBinary(Reader in) {
84 char[] taste = new char[100];
85 try {
86 int bytesRead = in.read(taste);
87 if (bytesRead > 0) {
88 return isBinary(new String(taste, 0, bytesRead));
89 }
90 } catch (IOException e) {
91
92 }
93 return false;
94 }
95
96
97
98
99
100
101
102
103
104
105
106
107 public static boolean isBinary(InputStream in) {
108 try {
109 byte[] taste = new byte[200];
110 int bytesRead = in.read(taste);
111 if (bytesRead > 0) {
112 ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
113 CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
114 CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
115 .onMalformedInput(CodingErrorAction.REPORT)
116 .onUnmappableCharacter(CodingErrorAction.REPORT);
117 while (bytes.remaining() > 0) {
118 CoderResult res = cd.decode(bytes, chars, true);
119 if (res.isMalformed() || res.isUnmappable()) {
120 return true;
121 } else if (res.isOverflow()) {
122 chars.limit(chars.position());
123 chars.rewind();
124 int c = chars.capacity() * 2;
125 CharBuffer on = CharBuffer.allocate(c);
126 on.put(chars);
127 chars = on;
128 }
129 }
130 chars.limit(chars.position());
131 chars.rewind();
132 return isBinary(chars);
133 }
134 } catch (IOException e) {
135
136 }
137 return false;
138 }
139
140 static Charset getFileEncodingOrUTF8AsFallback() {
141 try {
142 return Charset.forName(System.getProperty(FILE_ENCODING));
143 } catch (UnsupportedCharsetException e) {
144 return Charset.forName("UTF-8");
145 }
146 }
147
148
149
150
151
152 public static final boolean isBinaryData(final String name) {
153 return extensionMatches(name, DATA_EXTENSIONS);
154 }
155
156
157
158
159
160 public static final boolean isNonBinary(final String name) {
161 if (name == null) {
162 return false;
163 }
164 return extensionMatches(name.toUpperCase(Locale.US),
165 BinaryGuesser.NON_BINARY_EXTENSIONS);
166 }
167
168
169
170
171
172 public static final boolean isExecutable(final String name) {
173 return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
174 || containsExtension(name, EXE_EXTENSIONS);
175 }
176
177 public static boolean containsExtension(final String name,
178 final String[] exts) {
179 for (int i = 0; i < exts.length; i++) {
180 if (name.contains(DOT + exts[i] + DOT)) {
181 return true;
182 }
183 }
184 return false;
185 }
186
187 public static boolean extensionMatches(final String name,
188 final String[] exts) {
189 for (int i = 0; i < exts.length; i++) {
190 if (name.endsWith(DOT + exts[i])) {
191 return true;
192 }
193 }
194 return false;
195 }
196
197 public static boolean isBytecode(final String name) {
198 return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
199 }
200
201 public static final boolean isImage(final String name) {
202 return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
203 }
204
205 public static final boolean isKeystore(final String name) {
206 return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
207 }
208
209
210
211
212
213 public static final boolean isBinary(final String name) {
214 if (name == null) {
215 return false;
216 }
217 String normalisedName = GuessUtils.normalise(name);
218 return BinaryGuesser.JAR_MANIFEST.equalsIgnoreCase(name) || BinaryGuesser.isImage(normalisedName)
219 || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
220 || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName);
221 }
222
223 private static final String[] DATA_EXTENSIONS = {
224 "DAT", "DOC",
225 "NCB", "IDB",
226 "SUO", "XCF",
227 "RAJ", "CERT",
228 "KS", "TS",
229 "ODP", "SWF",
230
231 "WOFF2", "WOFF", "TTF", "EOT"
232 };
233
234 private static final String[] EXE_EXTENSIONS = {
235 "EXE", "DLL",
236 "LIB", "SO",
237 "A", "EXP",
238 };
239
240 private static final String[] KEYSTORE_EXTENSIONS = {
241 "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
242 };
243
244 private static final String[] IMAGE_EXTENSIONS = {
245 "PNG", "PDF",
246 "GIF", "GIFF",
247 "TIF", "TIFF",
248 "JPG", "JPEG",
249 "ICO", "ICNS",
250 "PSD",
251 };
252
253 private static final String[] BYTECODE_EXTENSIONS = {
254 "CLASS", "PYD",
255 "OBJ", "PYC",
256 };
257
258
259
260
261 private static final String[] NON_BINARY_EXTENSIONS = {
262 "AART",
263 "AC",
264 "AM",
265 "BAT",
266 "C",
267 "CAT",
268 "CGI",
269 "CLASSPATH",
270 "CMD",
271 "CONFIG",
272 "CPP",
273 "CSS",
274 "CWIKI",
275 "DATA",
276 "DCL",
277 "DTD",
278 "EGRM",
279 "ENT",
280 "FT",
281 "FN",
282 "FV",
283 "GRM",
284 "G",
285 "H",
286 "HTACCESS",
287 "HTML",
288 "IHTML",
289 "IN",
290 "JAVA",
291 "JMX",
292 "JSP",
293 "JS",
294 "JUNIT",
295 "JX",
296 "MANIFEST",
297 "M4",
298 "MF",
299 "MF",
300 "META",
301 "MOD",
302 "N3",
303 "PEN",
304 "PL",
305 "PM",
306 "POD",
307 "POM",
308 "PROJECT",
309 "PROPERTIES",
310 "PY",
311 "RB",
312 "RDF",
313 "RNC",
314 "RNG",
315 "RNX",
316 "ROLES",
317 "RSS",
318 "SH",
319 "SQL",
320 "SVG",
321 "TLD",
322 "TXT",
323 "TYPES",
324 "VM",
325 "VSL",
326 "WSDD",
327 "WSDL",
328 "XARGS",
329 "XCAT",
330 "XCONF",
331 "XEGRM",
332 "XGRM",
333 "XLEX",
334 "XLOG",
335 "XMAP",
336 "XML",
337 "XROLES",
338 "XSAMPLES",
339 "XSD",
340 "XSL",
341 "XSLT",
342 "XSP",
343 "XUL",
344 "XWEB",
345 "XWELCOME",
346 };
347 public static final String JAR_MANIFEST = "MANIFEST.MF";
348 public static final String JAVA = "JAVA";
349 public static final int HIGH_BYTES_RATIO = 100;
350 public static final int TOTAL_READ_RATIO = 30;
351 public static final int NON_ASCII_THREASHOLD = 256;
352 public static final int ASCII_CHAR_THREASHOLD = 8;
353
354 public static final boolean isBinary(final Document document) {
355
356
357
358 return isBinary(document.getName())
359 ||
360
361 isBinaryDocument(document);
362 }
363
364
365 }