in src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java [68:299]
/**
 * Analyses the sampled input held by {@code mappingValues} and fills the {@code properties}
 * field with the format description derived from it.
 *
 * <p>Steps performed, in order:
 * <ol>
 * <li>copy the value maps, raw indexes and size counters from {@code mappingValues} into the
 * corresponding fields and reset {@code staticColIndexes};</li>
 * <li>derive the row- and column-index structures via {@code getRowIndexStructure()} and
 * {@code getColIndexStructure()} and wrap them in a new {@code CustomProperties};</li>
 * <li>depending on the record properties (single-line or not) and on the index properties,
 * compute the key patterns / delimiters for the matching case of Table 1 below;</li>
 * <li>flag the result as sparse when the row or the column index is {@code CellWiseExist}.</li>
 * </ol>
 *
 * @throws DMLRuntimeException if, in case #2, no column of the first sample row is mapped
 */
private void runIdentification() {
	/* Index properties:
	 1. Identity:
	 2. Exist:
	 3. Sequential Scattered:
	 4. Array:

	 Table 1: supported formats by row and column indexes:
	 #  | row        | col      | Value                   | example
	 ---------------------------------------------------------------
	 1  | Identity   | Identity | Exist                   | csv, JSON/XML L                      single-line
	 2  | Identity   | Exist    | Exist                   | LibSVM                               single
	 3  | Identity   | Exist    | Not-Exist               | LibSVM+Pattern                       single
	 4  | Exist      | Exist    | Exist                   | MM Coordinate General                multi
	 5  | Array      | Array    | Exist                   | MM Array                             multi
	 6  | Exist      | Exist    | Partially-Exist         | MM Coordinate Symmetric              multi
	 7  | Exist      | Exist    | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric         multi
	 8  | Exist      | Exist    | Not-Exist               | MM Coordinate Pattern                multi
	 9  | Exist      | Exist    | Not-Exist+Pattern       | MM Coordinate Symmetric Pattern      multi
	 10 | SEQSCATTER | Identity | Exist                   | JSON/XML Multi Line, AMiner          multi

	 Strategy for checking the structure of indexes and values:
	 1. map values:
	    1.a values fully exist in the source
	    1.b values partially exist in the dataset (we have to check Symmetric, Skew-Symmetric, and so on)
	    1.c values do not exist in the source; in this case we have to check static value(s)
	 2. map indexes:
	    2.a after finding the value properties, the next step is looking for index maps; the row index comes first
	    2.b column index mapping
	 */

	// value mapping
	mapRow = mappingValues.getMapRow();
	mapCol = mappingValues.getMapCol();
	mapLen = mappingValues.getMapLen();
	mappingProperties = mappingValues.getMappingProperties();

	// line-by-line index of the raw strings (index for Int, Long, Float, Double, String, Boolean)
	sampleRawIndexes = mappingValues.getSampleRawIndexes();

	// matrix/frame properties used for the analysis and for creating data structures
	nrows = mappingValues.getNrows();
	ncols = mappingValues.getNcols();
	nlines = mappingValues.getNlines();
	actualValueCount = mappingValues.getActualValueCount();
	staticColIndexes = new BitSet(ncols);

	// collect custom properties
	// 1. properties of row-index
	RowIndexStructure rowIndexStructure = getRowIndexStructure();

	// 2. properties of column-index
	ColIndexStructure colIndexStructure = getColIndexStructure();

	properties = new CustomProperties(mappingProperties, rowIndexStructure, colIndexStructure);
	properties.setNcols(ncols);

	// ref to Table 1:
	if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) {
		// #1: identity row index and identity column index
		if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity &&
			colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) {
			Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow();
			properties.setColKeyPatterns(bckpsr.getKey());
			properties.setEndWithValueStrings(bckpsr.getValue());
		}
		// #2: identity row index, column index stored next to every cell
		else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity &&
			colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {
			// find cell-index and value separators; only the first sample row (index 0) is inspected
			RawIndex raw = null;
			for(int c = 0; c < ncols; c++) {
				if(mapCol[0][c] != -1) {
					raw = sampleRawIndexes[mapRow[0][c]];
					raw.cloneReservedPositions();
					break;
				}
			}
			if(raw == null)
				throw new DMLRuntimeException("Invalid raw");

			// occurrences of every candidate index delimiter; the most frequent one is selected
			HashMap<String, Long> indexDelimCount = new HashMap<>();
			String valueDelim = null;
			String indexDelim = null;
			long maxCount = 0L;
			int begin = colIndexStructure.getColIndexBegin();
			for(int c = 0; c < ncols; c++) {
				if(mapCol[0][c] == -1)
					continue;

				// candidate index delimiter: text between the position obtained from findValue
				// (key + value of the returned pair) and the position of the mapped value
				Pair<Integer, Integer> pair = raw.findValue(c + begin);
				String tmpIndexDelim = raw.getSubString(pair.getKey() + pair.getValue(), mapCol[0][c]);

				Long previousCount = indexDelimCount.get(tmpIndexDelim);
				long count = (previousCount == null) ? 1L : previousCount + 1L;
				indexDelimCount.put(tmpIndexDelim, count);
				// strict comparison: on ties the earliest delimiter reaching the count wins
				if(maxCount < count) {
					maxCount = count;
					indexDelim = tmpIndexDelim;
				}

				// value delimiter: taken once, from the first mapped value that is followed by
				// another numeric position inside the raw line
				if(valueDelim == null) {
					int valueEnd = mapCol[0][c] + mapLen[0][c];
					int nextPos = raw.getNextNumericPosition(valueEnd);
					if(nextPos < raw.getRawLength()) {
						valueDelim = raw.getSubString(valueEnd, nextPos);
					}
				}
			}
			// update properties
			colIndexStructure.setIndexDelim(indexDelim);
			colIndexStructure.setValueDelim(valueDelim);
		}
	}
	else {
		// # 4, 6, 7, 8, 9: row and column index are both stored next to every cell
		if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist &&
			colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {

			// a value pattern is only needed when the values are present in the source
			if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) {
				Pair<ArrayList<String>, HashSet<String>> bvkpsr = buildValueKeyPattern();
				HashSet<String>[] endWithValueStrings = new HashSet[1];
				endWithValueStrings[0] = bvkpsr.getValue();
				properties.setValueKeyPattern(bvkpsr.getKey());
				properties.setEndWithValueStrings(endWithValueStrings);
			}

			int beginRowIndex = rowIndexStructure.getRowIndexBegin();
			int beginColIndex = colIndexStructure.getColIndexBegin();

			// build pattern for row-index
			Pair<ArrayList<String>, HashSet<String>> rowIndexPattern = buildIndexKeyPattern(true, beginRowIndex);
			rowIndexStructure.setKeyPattern(rowIndexPattern.getKey());
			rowIndexStructure.setEndWithValueString(rowIndexPattern.getValue());

			// build pattern for col-index
			Pair<ArrayList<String>, HashSet<String>> colIndexPattern = buildIndexKeyPattern(false, beginColIndex);
			colIndexStructure.setKeyPattern(colIndexPattern.getKey());
			colIndexStructure.setEndWithValueString(colIndexPattern.getValue());
		}

		// #10 sequential scattered
		if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter) {
			ArrayList<Pair<String, String>> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false);
			ArrayList<Pair<String, Set<Integer>>> keys;

			// The prefix of the first record seeds the trie. For every other record, each
			// substring that starts with the same character as that first prefix is inserted,
			// capped to the (at most 80 characters long) length of the first prefix.
			String firstPrefix = prefixSuffixBeginEndCells.get(0).getKey();
			TextTrie textTrie = new TextTrie();
			textTrie.insert(firstPrefix, 0);
			char startChar = firstPrefix.charAt(0);
			int minSubStringLength = Math.min(80, firstPrefix.length());
			for(int i = 1; i < prefixSuffixBeginEndCells.size(); i++) {
				String prefix = prefixSuffixBeginEndCells.get(i).getKey();
				for(int j = 0; j < prefix.length(); j++) {
					if(startChar == prefix.charAt(j))
						textTrie.insert(prefix.substring(j, j + Math.min(minSubStringLength, prefix.length() - j)),
							i);
				}
			}

			// scoring the prefix tree
			keys = textTrie.getAllKeys();
			String beginString = null;
			String endString = null;

			// only the homogeneous case is handled: the first key must be associated with nrows entries
			if(keys.get(0).getValue().size() == nrows) {
				// the begin string is the first key, cut at its first line break (if any)
				String firstKey = keys.get(0).getKey();
				int index = firstKey.indexOf("\n");
				if(index == -1)
					beginString = firstKey;
				else
					beginString = firstKey.substring(0, index);

				// recompute suffix strings to find end of string
				int minSuffixStringLength = prefixSuffixBeginEndCells.get(0).getValue().length();
				String reverseBeginString = new StringBuilder(beginString).reverse().toString();
				ArrayList<String> suffixes = new ArrayList<>();
				// the suffix of the last record is deliberately not visited by this loop
				for(int i = 0; i < prefixSuffixBeginEndCells.size() - 1; i++) {
					String str = new StringBuilder(prefixSuffixBeginEndCells.get(i).getValue()).reverse()
						.toString();
					int indexBeginString = str.indexOf(reverseBeginString);
					if(indexBeginString != -1) {
						// extend the cut position by the line breaks directly following the match
						for(int j = indexBeginString + reverseBeginString.length(); j < str.length(); j++) {
							if(str.charAt(j) == '\n')
								indexBeginString++;
							else
								break;
						}
						minSuffixStringLength = Math.min(minSuffixStringLength, indexBeginString);
						suffixes.add(new StringBuilder(
							str.substring(0, indexBeginString + reverseBeginString.length())).reverse().toString());
					}
					else {
						// NOTE(review): this branch stores the reversed string, whereas the branch
						// above reverses back before storing - confirm this is intended.
						suffixes.add(str);
					}
				}

				// Longest common leading part of all collected suffixes, stopped at the first line
				// break. The scan is skipped when no suffix was collected (single record) and the
				// comparison is bounds-checked, because suffixes stored without a begin-string
				// match are not reflected in minSuffixStringLength and may be shorter than it.
				StringBuilder sbEndString = new StringBuilder();
				if(!suffixes.isEmpty()) {
					String firstSuffix = suffixes.get(0);
					int limit = Math.min(minSuffixStringLength, firstSuffix.length());
					for(int i = 0; i < limit; i++) {
						char intersectChar = firstSuffix.charAt(i);
						if(intersectChar == '\n')
							break;
						boolean sharedByAll = true;
						for(String ss : suffixes) {
							if(i >= ss.length() || ss.charAt(i) != intersectChar) {
								sharedByAll = false;
								break;
							}
						}
						if(!sharedByAll)
							break;
						sbEndString.append(intersectChar);
					}
				}

				// without a common end marker the begin string also delimits the end of a record
				if(sbEndString.length() == 0)
					endString = beginString;
				else
					endString = sbEndString.toString();

				updateMapsAndExtractAllSuffixStringsOfColsMultiLine(beginString, endString);
				rowIndexStructure.setSeqBeginString(beginString);
				rowIndexStructure.setSeqEndString(endString);
				Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow();
				properties.setColKeyPatterns(bckpsr.getKey());
				properties.setEndWithValueStrings(bckpsr.getValue());
			}
			// TODO: extend sequential scattered format algorithm for heterogeneous structures
		}
	}

	// any cell-wise stored index marks the format as sparse
	if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ||
		colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {
		properties.setSparse(true);
	}
}