in flink-ml-lib/src/main/java/org/apache/flink/ml/feature/stringindexer/StringIndexer.java [239:293]
public StringIndexerModelData map(Map<String, Long>[] value) {
    // Converts the per-column maps in `value` into per-column ordered string arrays.
    // Row i of the result holds the keys of value[i], ordered according to
    // `stringOrderType`. The Long values are only used as sort keys by the two
    // FREQUENCY_* order types (presumably occurrence counts -- confirm with the caller).
    // Throws UnsupportedOperationException for an unknown `stringOrderType`.
    int numCols = value.length;
    String[][] stringArrays = new String[numCols][];
    // One buffer is reused for every column to avoid re-allocating it per column.
    ArrayList<Tuple2<String, Long>> stringsAndCnts = new ArrayList<>();
    for (int i = 0; i < numCols; i++) {
        stringsAndCnts.clear();
        stringsAndCnts.ensureCapacity(value[i].size());
        for (Map.Entry<String, Long> entry : value[i].entrySet()) {
            stringsAndCnts.add(Tuple2.of(entry.getKey(), entry.getValue()));
        }
        // List.sort is stable, so entries with equal sort keys keep the iteration
        // order of value[i].entrySet().
        switch (stringOrderType) {
            case ALPHABET_ASC_ORDER:
                stringsAndCnts.sort(Comparator.comparing(valAndCnt -> valAndCnt.f0));
                break;
            case ALPHABET_DESC_ORDER:
                // Descending order is expressed by swapping the operands rather than
                // negating the compareTo result, which is not overflow-safe in general.
                stringsAndCnts.sort(
                        (valAndCnt1, valAndCnt2) -> valAndCnt2.f0.compareTo(valAndCnt1.f0));
                break;
            case FREQUENCY_ASC_ORDER:
                stringsAndCnts.sort(Comparator.comparing(valAndCnt -> valAndCnt.f1));
                break;
            case FREQUENCY_DESC_ORDER:
                stringsAndCnts.sort(
                        (valAndCnt1, valAndCnt2) -> valAndCnt2.f1.compareTo(valAndCnt1.f1));
                if (stringsAndCnts.size() > maxIndexNum) {
                    // Reserves the last index for infrequent element: only the first
                    // (maxIndexNum - 1) entries are kept. The bound is clamped at zero so
                    // that a non-positive value yields an empty result instead of an
                    // exception from subList.
                    int numFrequent = Math.max(maxIndexNum - 1, 0);
                    stringsAndCnts.subList(numFrequent, stringsAndCnts.size()).clear();
                }
                break;
            case ARBITRARY_ORDER:
                // Keeps whatever order the entry set iteration produced.
                break;
            default:
                throw new UnsupportedOperationException(
                        "Unsupported "
                                + STRING_ORDER_TYPE
                                + " type: "
                                + stringOrderType
                                + ".");
        }
        // Only the strings go into the model data; the Long values are dropped here.
        stringArrays[i] = stringsAndCnts.stream().map(x -> x.f0).toArray(String[]::new);
    }
    return new StringIndexerModelData(stringArrays);
}