private void runIdentification()

in src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java [68:299]


	/**
	 * Identifies the structure of the sampled raw input and stores the result in {@code properties}.
	 *
	 * <p>The method copies the value-mapping results from {@code mappingValues} into the instance fields,
	 * derives the row-index and column-index structures, creates the {@code CustomProperties} object and
	 * then, depending on the record layout (single-line or not) and on the index properties, collects the
	 * key patterns, delimiters and begin/end strings that describe the format (see Table 1 below).
	 * Finally the result is flagged as sparse when the row index or the column index is
	 * {@code CellWiseExist}.
	 *
	 * @throws DMLRuntimeException if, in the single-line Identity/CellWiseExist case, the first sample row
	 *                             has no mapped column from which a raw line could be selected
	 */
	private void runIdentification() {

		/* Index properties:
		 1. Identity:
		 2. Exist:
		 3. Sequential Scattered:
		 4. Array:

		Table 1: supported formats by row and column indexes:
		 #  |  row      |  col     | Value                   |  example
		 ------------------------------------------------------------------------------------------------
		 1  | Identity  | Identity | Exist                   | csv, JSON/XML L                 single-line
		 2  | Identity  | Exist    | Exist                   | LibSVM                          single
		 3  | Identity  | Exist    | Not-Exist               | LibSVM+Pattern                  single
		 4  | Exist     | Exist    | Exist                   | MM Coordinate General           multi
		 5  | Array     | Array    | Exist                   | MM Array                        multi
		 6  | Exist     | Exist    | Partially-Exist         | MM Coordinate Symmetric         multi
		 7  | Exist     | Exist    | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric    multi
		 8  | Exist     | Exist    | Not-Exist               | MM Coordinate Pattern           multi
		 9  | Exist     | Exist    | Not-Exist+Pattern       | MM Coordinate Symmetric Pattern multi
		 10 | SEQSCATTER| Identity | Exist                   | JSON/XML Multi Line, AMiner     multi

		Strategy for checking the structure of indexes and values:
			1. map values:
				1.a values fully exist in the source
				1.b values partially exist in the dataset (we have to check Symmetric, Skew-Symmetric, and so on)
				1.c values do not exist in the source; in this case we have to check static value(s)
			2. map indexes:
				2.a after finding the value properties the next step is looking for index maps, row index first
				2.b column index mapping
		 */

		// value mapping
		mapRow = mappingValues.getMapRow();
		mapCol = mappingValues.getMapCol();
		mapLen = mappingValues.getMapLen();
		mappingProperties = mappingValues.getMappingProperties();

		// line-by-line index of the raw strings (index for Int, Long, Float, Double, String, Boolean)
		sampleRawIndexes = mappingValues.getSampleRawIndexes();

		// matrix/frame properties used for the analysis and for creating the data structures
		nrows = mappingValues.getNrows();
		ncols = mappingValues.getNcols();
		nlines = mappingValues.getNlines();
		actualValueCount = mappingValues.getActualValueCount();
		staticColIndexes = new BitSet(ncols);

		// collect custom properties
		// 1. properties of row-index
		RowIndexStructure rowIndexStructure = getRowIndexStructure();

		// 2. properties of column-index
		ColIndexStructure colIndexStructure = getColIndexStructure();

		properties = new CustomProperties(mappingProperties, rowIndexStructure, colIndexStructure);
		properties.setNcols(ncols);

		// ref to Table 1:
		if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) {
			// #1
			if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity &&
				colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) {
				Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow();
				properties.setColKeyPatterns(bckpsr.getKey());
				properties.setEndWithValueStrings(bckpsr.getValue());
			}

			// #2
			else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity &&
				colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {
				// find cell-index and value separators; the raw line of the first mapped column of
				// sample row 0 is used as the reference line
				RawIndex raw = null;
				for(int c = 0; c < ncols; c++) {
					if(mapCol[0][c] != -1) {
						raw = sampleRawIndexes[mapRow[0][c]];
						raw.cloneReservedPositions();
						break;
					}
				}
				if(raw == null)
					throw new DMLRuntimeException("Invalid raw");

				// occurrence count of every candidate index delimiter; the most frequent one wins
				HashMap<String, Long> indexDelimCount = new HashMap<>();
				String valueDelim = null;
				String indexDelim = null;
				long maxCount = 0L;
				int begin = colIndexStructure.getColIndexBegin();
				for(int c = 0; c < ncols; c++) {
					if(mapCol[0][c] == -1)
						continue;

					// presumably the pair is (position, length) of the column-index token - TODO confirm;
					// the text between its end and the mapped value is the index delimiter candidate
					Pair<Integer, Integer> pair = raw.findValue(c + begin);
					String tmpIndexDelim = raw.getSubString(pair.getKey() + pair.getValue(), mapCol[0][c]);

					Long previousCount = indexDelimCount.get(tmpIndexDelim);
					long count = (previousCount == null) ? 1L : previousCount + 1;
					indexDelimCount.put(tmpIndexDelim, count);
					if(maxCount < count) {
						maxCount = count;
						indexDelim = tmpIndexDelim;
					}

					// the value delimiter is taken once: the text between the end of the first usable
					// value and the next numeric position on the line
					if(valueDelim == null) {
						int nextPos = raw.getNextNumericPosition(mapCol[0][c] + mapLen[0][c]);
						if(nextPos < raw.getRawLength()) {
							valueDelim = raw.getSubString(mapCol[0][c] + mapLen[0][c], nextPos);
						}
					}
				}
				// update properties
				colIndexStructure.setIndexDelim(indexDelim);
				colIndexStructure.setValueDelim(valueDelim);
			}

		}
		else {
			// # 4, 6, 7, 8, 9
			if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist &&
				colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {

				// a value key pattern is only built when values are present in the source
				if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) {
					Pair<ArrayList<String>, HashSet<String>> bvkpsr = buildValueKeyPattern();
					HashSet<String>[] endWithValueStrings = new HashSet[1];
					endWithValueStrings[0] = bvkpsr.getValue();
					properties.setValueKeyPattern(bvkpsr.getKey());
					properties.setEndWithValueStrings(endWithValueStrings);
				}

				int beginRowIndex = rowIndexStructure.getRowIndexBegin();
				int beginColIndex = colIndexStructure.getColIndexBegin();
				// build pattern for row-index
				Pair<ArrayList<String>, HashSet<String>> rowIndexPattern = buildIndexKeyPattern(true, beginRowIndex);
				rowIndexStructure.setKeyPattern(rowIndexPattern.getKey());
				rowIndexStructure.setEndWithValueString(rowIndexPattern.getValue());

				// build pattern for col-index
				Pair<ArrayList<String>, HashSet<String>> colIndexPattern = buildIndexKeyPattern(false, beginColIndex);
				colIndexStructure.setKeyPattern(colIndexPattern.getKey());
				colIndexStructure.setEndWithValueString(colIndexPattern.getValue());

			}
			// #10 sequential scattered
			if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter) {
				ArrayList<Pair<String, String>> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false);

				ArrayList<Pair<String, Set<Integer>>> keys;
				TextTrie textTrie = new TextTrie();
				String firstPrefix = prefixSuffixBeginEndCells.get(0).getKey();
				textTrie.insert(firstPrefix, 0);
				char startChar = firstPrefix.charAt(0);

				// every substring of the remaining prefixes that starts with the first character of the
				// first prefix is inserted (capped at 80 characters / the length of the first prefix)
				int minSubStringLength = Math.min(80, firstPrefix.length());
				for(int i = 1; i < prefixSuffixBeginEndCells.size(); i++) {
					String prefix = prefixSuffixBeginEndCells.get(i).getKey();
					for(int j = 0; j < prefix.length(); j++) {
						if(startChar == prefix.charAt(j))
							textTrie.insert(prefix.substring(j, j + Math.min(minSubStringLength, prefix.length() - j)),
								i);
					}
				}
				// scoring the prefix tree
				keys = textTrie.getAllKeys();
				String beginString = null;
				String endString = null;
				Pair<String, Set<Integer>> topKey = keys.get(0);
				if(topKey.getValue().size() == nrows) {
					// the begin string is the first key, cut at its first line break (if any)
					int index = topKey.getKey().indexOf("\n");
					if(index == -1)
						beginString = topKey.getKey();
					else
						beginString = topKey.getKey().substring(0, index);

					// recompute suffix strings to find end of string
					int minSuffixStringLength = prefixSuffixBeginEndCells.get(0).getValue().length();
					String reverseBeginString = new StringBuilder(beginString).reverse().toString();
					ArrayList<String> suffixes = new ArrayList<>();
					// the suffix of the last entry is deliberately left out by the loop bound
					for(int i = 0; i < prefixSuffixBeginEndCells.size() - 1; i++) {
						String str = new StringBuilder(prefixSuffixBeginEndCells.get(i).getValue()).reverse()
							.toString();
						int indexBeginString = str.indexOf(reverseBeginString);
						if(indexBeginString != -1) {
							// count the line breaks that directly follow the (reversed) begin string
							for(int j = indexBeginString + reverseBeginString.length(); j < str.length(); j++) {
								if(str.charAt(j) == '\n')
									indexBeginString++;
								else
									break;
							}
							minSuffixStringLength = Math.min(minSuffixStringLength, indexBeginString);
							suffixes.add(new StringBuilder(
								str.substring(0, indexBeginString + reverseBeginString.length())).reverse().toString());
						}
						else {
							// NOTE(review): str is still reversed here, whereas the branch above stores the
							// re-reversed text - confirm this is intended
							suffixes.add(str);
						}
					}

					// The common prefix can never be longer than the shortest collected suffix. Bounding the
					// scan by it (and by zero when no suffix was collected) prevents out-of-range reads for
					// suffixes that did not contribute to minSuffixStringLength and for single-entry samples.
					int scanLimit = suffixes.isEmpty() ? 0 : minSuffixStringLength;
					for(String ss : suffixes)
						scanLimit = Math.min(scanLimit, ss.length());

					// longest common prefix of all suffixes, stopped at the first line break
					StringBuilder sbEndString = new StringBuilder();
					for(int i = 0; i < scanLimit; i++) {
						char intersectChar = suffixes.get(0).charAt(i);
						if(intersectChar == '\n')
							break;
						boolean flag = true;
						for(String ss : suffixes) {
							if(ss.charAt(i) != intersectChar) {
								flag = false;
								break;
							}
						}
						if(!flag)
							break;
						sbEndString.append(intersectChar);
					}

					// without a common end string the begin string doubles as the end marker
					if(sbEndString.length() == 0)
						endString = beginString;
					else
						endString = sbEndString.toString();
					updateMapsAndExtractAllSuffixStringsOfColsMultiLine(beginString, endString);
					rowIndexStructure.setSeqBeginString(beginString);
					rowIndexStructure.setSeqEndString(endString);

					Pair<ArrayList<String>[], HashSet<String>[]> bckpsr = buildColsKeyPatternSingleRow();
					properties.setColKeyPatterns(bckpsr.getKey());
					properties.setEndWithValueStrings(bckpsr.getValue());
				}
				// TODO: extend sequential scattered format algorithm for heterogeneous structures
				// (nothing is derived when the first key does not cover all rows)
			}
		}

		// cell-wise indexes on either dimension imply a sparse representation
		if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ||
			colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) {
			properties.setSparse(true);
		}
	}