in src/main/java/com/univocity/parsers/csv/CsvFormatDetector.java [81:238]
/**
 * Inspects a sample of the input and guesses the delimiter, quote and quote escape characters,
 * handing the result to {@code apply(delimiter, quote, quoteEscape)}.
 *
 * <p>The sample is scanned character by character. Comment lines and the content of quoted
 * values are skipped; every other character accepted by {@code isSymbol} is counted per row.
 * Rows are then compared pairwise: a symbol that is missing from any sampled row is discarded,
 * and for the remaining symbols the absolute differences of their per-row counts are summed.
 * The delimiter is picked from the candidates with the highest/lowest sum of differences,
 * preferring a candidate whose count never varies, then the configured allowed delimiters,
 * then the candidate that occurs most often.</p>
 *
 * <p>Sampling stops once {@code MAX_ROW_SAMPLES} rows have been collected.</p>
 *
 * @param characters buffer holding the sample to analyze
 * @param length     number of characters of {@code characters} to examine, starting at index 0
 */
public void execute(char[] characters, int length) {
    Set<Character> allSymbols = new HashSet<Character>();
    Map<Character, Integer> symbols = new HashMap<Character, Integer>();
    Map<Character, Integer> escape = new HashMap<Character, Integer>();
    List<Map<Character, Integer>> symbolsPerRow = new ArrayList<Map<Character, Integer>>();

    int doubleQuoteCount = 0;
    int singleQuoteCount = 0;

    int i;
    char inQuote = '\0';
    boolean afterNewLine = true;

    for (i = 0; i < length; i++) {
        char ch = characters[i];

        // A comment can only start at the very beginning of a line, never inside a quoted value.
        if (afterNewLine && inQuote == '\0' && ch == comment) {
            // skip the rest of the comment line; afterNewLine stays true for the next line.
            while (++i < length) {
                ch = characters[i];
                if (ch == '\r' || ch == '\n' || ch == normalizedNewLine) {
                    break;
                }
            }
            continue;
        }

        if (ch == '"' || ch == '\'') {
            // The line has content now, so a comment character further along it is not a comment start.
            afterNewLine = false;

            if (inQuote == ch) { //closing quotes (potentially)
                if (ch == '"') {
                    doubleQuoteCount++;
                } else {
                    singleQuoteCount++;
                }

                if (i + 1 < length) {
                    char next = characters[i + 1];
                    if (Character.isLetterOrDigit(next) || (next <= ' ' && whitespaceRangeStart < next && next != '\n' && next != '\r')) { //no special characters after quote, might be escaping
                        //special character before (potentially) closing quote, might be an escape
                        // i > 0 here: the matching opening quote was found at an earlier index.
                        char prev = characters[i - 1];
                        if (!Character.isLetterOrDigit(prev)) {
                            increment(escape, prev);
                        }
                    }
                }

                inQuote = '\0';
            } else if (inQuote == '\0') {
                // Look back past characters <= ' ' to find what precedes the quote: it only opens
                // a quoted value when it is at the start of the input or not preceded by a letter/digit.
                char prev = '\0';
                int j = i;
                while (prev <= ' ' && --j >= 0) {
                    prev = characters[j];
                }
                if (j < 0 || !Character.isLetterOrDigit(prev)) {
                    inQuote = ch;
                }
            }
            continue;
        }

        if (inQuote != '\0') { //keep looping until the quote is closed.
            continue;
        }

        afterNewLine = false;

        if (isSymbol(ch)) { //counts all symbols. Skips letters, digits and white spaces (except the tab character)
            allSymbols.add(ch);
            increment(symbols, ch);
        } else if (ch == '\r' || ch == '\n' || ch == normalizedNewLine) {
            // Any line break puts us at the start of a line again, including the '\n' of a "\r\n"
            // pair and breaks that end a line without symbols, so that a following comment line is recognized.
            afterNewLine = true;

            if (symbols.size() > 0) { //got a newline and collected some symbols? Good!
                symbolsPerRow.add(symbols);
                // Start a fresh map *before* possibly leaving the loop, so the row just stored
                // cannot be added a second time by the trailing-row check below.
                symbols = new HashMap<Character, Integer>();
                if (symbolsPerRow.size() == MAX_ROW_SAMPLES) {
                    break;
                }
            }
        }
    }

    // The sample ended without a trailing line break and the buffer was not filled: keep the last row.
    if (symbols.size() > 0 && length < characters.length) {
        symbolsPerRow.add(symbols);
    }

    if (length >= characters.length && i >= length && symbolsPerRow.size() > 1) { // if got to the end of the buffer, discard last row. It's probably incomplete anyway.
        symbolsPerRow.remove(symbolsPerRow.size() - 1);
    }

    Map<Character, Integer> totals = calculateTotals(symbolsPerRow);

    Map<Character, Integer> sums = new HashMap<Character, Integer>();
    Set<Character> toRemove = new HashSet<Character>();

    //combines the number of symbols found in each row and sums the difference.
    for (Map<Character, Integer> previous : symbolsPerRow) {
        for (Map<Character, Integer> current : symbolsPerRow) {
            for (Character symbol : allSymbols) {
                Integer previousCount = previous.get(symbol);
                Integer currentCount = current.get(symbol);

                // Every row is also paired with itself, so both counts are null as soon as
                // a single row lacks the symbol: it does not appear in all rows, discard it.
                if (previousCount == null && currentCount == null) {
                    toRemove.add(symbol);
                }

                if (previousCount == null || currentCount == null) {
                    continue;
                }

                increment(sums, symbol, Math.abs(previousCount - currentCount)); // we expect to always get 0 or close to 0 here, so the symbol occurs in all rows
            }
        }
    }

    sums.keySet().removeAll(toRemove);

    // When a list of allowed delimiters is configured, only those remain candidates.
    if (allowedDelimiters.length > 0) {
        Set<Character> toRetain = new HashSet<Character>();
        for (char c : allowedDelimiters) {
            toRetain.add(c);
        }
        sums.keySet().retainAll(toRetain);
    }

    char delimiterMax = max(sums, totals, suggestedDelimiter);
    char delimiterMin = min(sums, totals, suggestedDelimiter);

    char delimiter;
    out:
    if (delimiterMax != delimiterMin) {
        // A candidate whose count is identical in every row wins over one whose count varies.
        if (sums.get(delimiterMin) == 0 && sums.get(delimiterMax) != 0) {
            delimiter = delimiterMin;
            break out;
        }

        // Otherwise the order of the allowed delimiters decides, if one of the candidates is listed.
        for (char c : allowedDelimiters) {
            if (c == delimiterMin) {
                delimiter = delimiterMin;
                break out;
            } else if (c == delimiterMax) {
                delimiter = delimiterMax;
                break out;
            }
        }

        // Finally fall back to the candidate with the larger total in `totals`.
        if (totals.get(delimiterMin) > totals.get(delimiterMax)) {
            delimiter = delimiterMin;
            break out;
        }
        delimiter = delimiterMax;
    } else {
        delimiter = delimiterMax;
    }

    // The quote character seen closing more quoted values wins; ties go to the double quote.
    char quote = doubleQuoteCount >= singleQuoteCount ? '"' : '\'';

    // The delimiter cannot double as the quote escape.
    escape.remove(delimiter);
    char quoteEscape = max(escape, totals, quote);

    apply(delimiter, quote, quoteEscape);
}