in src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java [1242:1436]
/**
 * Derives, for every column in {@code colIndexes}, the list of key tokens that
 * locate the column value and the set of suffix strings that follow it.
 *
 * <p>The work is done in three passes over {@code colIndexes}:
 * <ol>
 * <li>normalize, de-duplicate and sort the reversed prefixes of the column;</li>
 * <li>build the key pattern from the trailing characters shared by the
 * normalized prefixes, refining it with additional keys when the shared part
 * alone does not match every prefix;</li>
 * <li>reduce the key list via {@code cleanUPKey}, record in
 * {@code staticColIndexes} whether the reduced keys are found in every sampled
 * raw row, and collect the column suffixes.</li>
 * </ol>
 *
 * @return a {@code Pair} holding {@code keys} and {@code colSuffixes}
 * @throws Exception declared by the method signature; no checked exception is
 *         thrown directly by the code in this method
 */
public Object call() throws Exception {
	// PASS 1: normalize, de-duplicate and sort the reversed prefixes
	for(int c : colIndexes) {
		keys[c] = new ArrayList<>();
		// normalized reversed prefix -> positions (in the original list) where it occurred
		Map<String, ArrayList<Integer>> mapPrefixesRemovedReverse = new HashMap<>();
		for(int i = 0; i < prefixesRemovedReverse[c].size(); i++) {
			StringBuilder sb = new StringBuilder();
			// every digit becomes a delimiter ...
			String str = prefixesRemovedReverse[c].get(i).replaceAll("\\d", Lop.OPERAND_DELIMITOR);
			// ... and a delimiter is only appended when the previous appended character is not one
			for(int j = 0; j < str.length(); j++) {
				String charStr = str.charAt(j) + "";
				if(!charStr.equals(Lop.OPERAND_DELIMITOR))
					sb.append(charStr);
				else if(sb.length() == 0 || !(sb.charAt(sb.length() - 1) + "").equals(Lop.OPERAND_DELIMITOR))
					sb.append(Lop.OPERAND_DELIMITOR);
			}
			String sbStr = sb.toString();
			ArrayList<Integer> positions = mapPrefixesRemovedReverse.get(sbStr);
			if(positions == null) {
				positions = new ArrayList<>();
				mapPrefixesRemovedReverse.put(sbStr, positions);
			}
			positions.add(i);
		}

		// rebuild the per-column lists from the distinct normalized strings
		prefixesRemovedReverse[c] = new ArrayList<>();
		prefixesRemoved[c] = new ArrayList<>();
		prefixesRemovedReverseSort[c] = new ArrayList<>();
		// each distinct string is paired with the position of its first occurrence
		for(Map.Entry<String, ArrayList<Integer>> entry : mapPrefixesRemovedReverse.entrySet())
			prefixesRemovedReverseSort[c].add(new Pair<>(entry.getKey(), entry.getValue().get(0)));

		prefixesRemovedReverseSort[c].sort(AscendingPairStringComparator);
		for(Pair<String, Integer> pair : prefixesRemovedReverseSort[c]) {
			prefixesRemovedReverse[c].add(pair.getKey());
			// un-reverse the normalized string
			prefixesRemoved[c].add(new StringBuilder(pair.getKey()).reverse().toString());
		}
	}

	// PASS 2: build patterns
	for(int c : colIndexes) {
		// a single distinct pattern is used directly as the key list
		if(prefixesRemoved[c].size() == 1) {
			keys[c] = new ArrayList<>();
			if(prefixesRemoved[c].get(0).length() == 0 || prefixesRemoved[c].get(0).equals(Lop.OPERAND_DELIMITOR))
				keys[c].add("");
			keys[c].addAll(splitKeyTokens(prefixesRemoved[c].get(0)));
			continue;
		}

		String firstKey;
		// STEP 1: find first key, i.e. the trailing characters shared by all patterns.
		// The token is seeded with the last character of the first pattern (not compared
		// against the others) and extended backwards while all other patterns agree.
		// The loop bound (i < length) never examines the first character of the first pattern.
		// NOTE(review): charAt(length - 1) assumes the first pattern is non-empty, and
		// str.charAt(str.length() - i) assumes every other pattern is at least i characters
		// long - presumably guaranteed by the sort order of PASS 1; confirm.
		String selectedString = prefixesRemoved[c].get(0);
		boolean flag = true;
		StringBuilder sbToken = new StringBuilder();
		sbToken.append(selectedString.charAt(selectedString.length() - 1));
		for(int i = 2; i < selectedString.length() && flag; i++) {
			char ch = selectedString.charAt(selectedString.length() - i);
			for(int j = 1; j < prefixesRemoved[c].size() && flag; j++) {
				String str = prefixesRemoved[c].get(j);
				flag = str.charAt(str.length() - i) == ch;
			}
			if(flag)
				sbToken.append(ch);
		}
		// characters were collected back-to-front
		firstKey = sbToken.reverse().toString();

		// accept the first key if the pattern match ends exactly at the end of every prefix
		flag = true;
		ArrayList<String> tmpList = splitKeyTokens(firstKey);
		for(int i = 0; i < prefixes[c].size() && flag; i++)
			flag = getIndexOfKeyPatternOnString(prefixes[c].get(i), tmpList, 0) == prefixes[c].get(i).length();
		if(flag) {
			keys[c] = tmpList;
			continue;
		}

		// STEP 2: add other keys
		int indexI = 0;
		int indexJ = 0;
		Set<String> refineKeysStep = new HashSet<>();
		// NOTE(review): once the pair search below is exhausted (indexI == size - 1) nothing
		// visible in this method changes between iterations, so if getNewRefineKeys keeps
		// returning an empty key set this loop looks like it may not terminate, and the
		// empty-set case after the loop would be unreachable - confirm against getNewRefineKeys.
		do {
			// search pairs (indexI, indexJ) of patterns until one pair yields refine keys;
			// both indexes are advanced once more by the loop headers after a hit
			for(; indexI < prefixesRemovedReverseSort[c].size() - 1 && refineKeysStep.size() == 0; indexI++) {
				String str1 = prefixesRemoved[c].get(indexI);
				String psStr1 = prefixes[c].get(prefixesRemovedReverseSort[c].get(indexI).getValue());
				for(indexJ = indexI + 1;
					indexJ < prefixesRemovedReverseSort[c].size() && refineKeysStep.size() == 0;
					indexJ++) {
					String str2 = prefixesRemoved[c].get(indexJ);
					String psStr2 = prefixes[c].get(prefixesRemovedReverseSort[c].get(indexJ).getValue());
					refineKeysStep = getRefineKeysStep(lcs, str1, str2, psStr1, psStr2, firstKey);
				}
			}
			if(indexI < prefixesRemovedReverse[c].size() - 1 && indexJ < prefixesRemovedReverse[c].size())
				break;

			// repeat the refinement until it reports no new keys
			do {
				Pair<Set<String>, Set<String>> pair = getNewRefineKeys(lcs, firstKey, prefixesRemoved[c], prefixes[c], refineKeysStep);
				refineKeysStep = pair.getKey();
				if(pair.getValue().size() == 0)
					break;
				else
					refineKeysStep.addAll(pair.getValue());
			}
			while(true);
		}
		while(refineKeysStep.size() == 0);

		// TODO: when refineKeysStep is empty we have to apply tokenizer (keys[c] is left as is)
		if(!refineKeysStep.isEmpty()) {
			// take the last candidate after sorting with AscendingStringLengthComparator;
			// for a single candidate this is that candidate
			ArrayList<String> sortedStrings = new ArrayList<>(refineKeysStep);
			Collections.sort(sortedStrings, AscendingStringLengthComparator);
			String selectedRefineKey = sortedStrings.get(sortedStrings.size() - 1);
			// the refine key precedes the first key in the final pattern
			keys[c] = splitKeyTokens(selectedRefineKey + Lop.OPERAND_DELIMITOR + firstKey);
		}
	}

	// PASS 3: clean up keys (reduce the key list if possible), set static flags, build suffixes
	for(int c : colIndexes) {
		ArrayList<String> cleanUPKeys = cleanUPKey(keys[c], prefixes[c]);

		// set static col flag: the rows are only scanned when the number of prefixes differs
		// from nrows; the flag turns false at the first raw row where the reduced keys are not found
		boolean flagFixCol = true;
		for(int r = 0; r < nrows && flagFixCol && prefixes[c].size() != nrows; r++) {
			String rawStr = sampleRawIndexes[r].getRaw();
			flagFixCol = getIndexOfKeyPatternOnString(rawStr, cleanUPKeys, 0) != -1;
		}
		staticColIndexes.set(c, flagFixCol);

		if(!flagFixCol && cleanUPKeys.size() < keys[c].size()) {
			// try to keep the key that directly precedes the retained tail of the key list
			String extraKey = keys[c].get(keys[c].size() - cleanUPKeys.size() - 1);
			if(checkExtraKeyForCol(cleanUPKeys, extraKey, prefixes[c])) {
				keys[c] = new ArrayList<>();
				keys[c].add(extraKey);
				keys[c].addAll(cleanUPKeys);
			}
			else
				keys[c] = cleanUPKeys;
		}
		else
			keys[c] = cleanUPKeys;

		// Build suffixes
		Set<String> setSuffix = new HashSet<>();
		TextTrie suffixTrie = new TextTrie();
		for(String su : suffixes[c]) {
			String[] suffixesList = su.split(Lop.OPERAND_DELIMITOR, -1);
			// split(.., -1) always yields at least one token; an entry without a delimiter has
			// no second token, so it is skipped instead of failing on suffixesList[1]
			if(suffixesList.length < 2)
				continue;
			// keep at most suffixStringLength characters of the second token
			if(suffixesList[1].length() < suffixStringLength)
				setSuffix.add(suffixesList[1]);
			else
				setSuffix.add(suffixesList[1].substring(0, suffixStringLength));
		}
		if(setSuffix.size() == 0) {
			colSuffixes[c] = new HashSet<>();
			continue;
		}

		// every distinct suffix is inserted under its own running index
		int rowIndexSuffix = 0;
		for(String ss : setSuffix) {
			suffixTrie.insert(ss, rowIndexSuffix++);
		}

		HashSet<String> colSuffixe = new HashSet<>();
		ArrayList<Pair<String, Set<Integer>>> allSuffixes = suffixTrie.getAllKeys();
		// if the first trie key is associated with all inserted suffixes it is sufficient alone
		if(allSuffixes.get(0).getValue().size() == setSuffix.size())
			colSuffixe.add(allSuffixes.get(0).getKey());
		else {
			// otherwise keep, in trie order, every key that covers at least one new index
			Set<Integer> coveredRowIndexes = new HashSet<>();
			for(Pair<String, Set<Integer>> p : allSuffixes) {
				int currentSize = coveredRowIndexes.size();
				coveredRowIndexes.addAll(p.getValue());
				if(currentSize != coveredRowIndexes.size())
					colSuffixe.add(p.getKey());
			}
		}
		colSuffixes[c] = colSuffixe;
	}
	return new Pair<>(keys, colSuffixes);
}

/**
 * Splits a key pattern on {@code Lop.OPERAND_DELIMITOR} and returns its
 * non-empty tokens in their original order.
 *
 * @param pattern delimiter-separated key pattern
 * @return new list with the non-empty tokens of {@code pattern}
 */
private ArrayList<String> splitKeyTokens(String pattern) {
	ArrayList<String> tokens = new ArrayList<>();
	for(String token : pattern.split(Lop.OPERAND_DELIMITOR))
		if(token.length() > 0)
			tokens.add(token);
	return tokens;
}