in athena-udfs-textanalytics/src/main/java/com/amazonaws/athena/udf/textanalytics/TextAnalyticsUDFHandler.java [1061:1100]
/**
 * Partitions the input rows into batches suitable for the downstream text analytics calls.
 * <p>
 * Rows are accumulated into multi-row batches of at most {@code multiRowBatchSize} rows.
 * A row whose UTF-8 length reaches {@code maxTextBytes} is either truncated in place
 * (when {@code splitLongText} is false) or split into segments that are emitted as a
 * dedicated single-row batch (when {@code splitLongText} is true).
 *
 * @param input              the rows to batch; entries may be truncated IN PLACE when
 *                           {@code splitLongText} is false
 * @param multiRowBatchSize  maximum number of rows per multi-row batch
 * @param maxTextBytes       maximum allowed UTF-8 byte length for a single row
 * @param splitLongText      true to split over-long rows into their own batch,
 *                           false to truncate them to {@code maxTextBytes}
 * @return list of {@code Object[2]} entries: element 0 is a {@code String[]} of rows
 *         (or text segments), element 1 is the batch-type tag
 *         {@code "MULTI_ROW_BATCH"} or {@code "TEXT_SPLIT_BATCH"}
 * @throws Exception propagated from the UTF-8 helper methods
 */
private List<Object[]> getBatches(String[] input, int multiRowBatchSize, int maxTextBytes, boolean splitLongText)
    throws Exception
{
    List<Object[]> batches = new ArrayList<Object[]>();
    int start = 0; // index of the first row in the currently open multi-row batch
    int c = 0;     // number of rows accumulated in the currently open multi-row batch
    for (int i = 0; i < input.length; i++) {
        if (c++ >= multiRowBatchSize) {
            // current multi-row batch is full: emit rows [start, i) and open a new batch at row i
            batches.add(new Object[] {Arrays.copyOfRange(input, start, i), "MULTI_ROW_BATCH"});
            start = i;
            c = 1; // row i is the first row of the new batch
        }
        int textLength = getUtf8StringLength(input[i]);
        boolean tooLong = textLength >= maxTextBytes;
        if (tooLong && !splitLongText) {
            // truncate this row in place; it remains part of the current multi-row batch
            System.out.println("Truncating long text field (" + textLength + " bytes) to " + maxTextBytes + " bytes");
            input[i] = truncateUtf8(input[i], maxTextBytes);
        }
        if (tooLong && splitLongText) {
            // close off current multi-record batch before making new single record batch
            if (start < i) {
                batches.add(new Object[] {Arrays.copyOfRange(input, start, i), "MULTI_ROW_BATCH"});
            }
            // split this row and add the text splits as a new *TEXT_SPLIT_BATCH* batch
            String[] textSplit = splitLongText(input[i], maxTextBytes);
            System.out.println("Split long text field (" + textLength + " bytes) into " + textSplit.length + " segments of under " + maxTextBytes + " bytes");
            batches.add(new Object[] {textSplit, "TEXT_SPLIT_BATCH"});
            // next multi-row batch starts AFTER this row with ZERO rows accumulated.
            // (Previously c was reset to 1 here, which counted a nonexistent pending row
            // and made every multi-row batch following a split one row smaller than
            // multiRowBatchSize allows.)
            start = i + 1;
            c = 0;
        }
    }
    // flush the final (possibly partial) multi-row batch
    if (start < input.length) {
        batches.add(new Object[] {Arrays.copyOfRange(input, start, input.length), "MULTI_ROW_BATCH"});
    }
    return batches;
}