private List getBatches()

in athena-udfs-textanalytics/src/main/java/com/amazonaws/athena/udf/textanalytics/TextAnalyticsUDFHandler.java [1103:1147]


    /**
     * Partitions the input rows into an ordered list of batches.
     *
     * <p>Each batch is an {@code Object[]} of three elements:
     * <ol>
     *   <li>a {@code String[]} holding the text of the batch,</li>
     *   <li>the batch type, either {@code "MULTI_ROW_BATCH"} (a contiguous run of input rows) or
     *       {@code "TEXT_SPLIT_BATCH"} (the segments of one over-long input row),</li>
     *   <li>the language code shared by every row in the batch.</li>
     * </ol>
     *
     * <p>A multi-row batch is closed when it already holds {@code multiRowBatchSize} rows, when the
     * language code of the next row differs, or when the next row has to be split. Batches are emitted
     * in input order and a zero-length batch is never emitted.
     *
     * <p>Rows longer than {@code maxTextBytes} (as measured by {@code getUtf8StringLength}) are either
     * split into their own {@code TEXT_SPLIT_BATCH} (when {@code splitLongText} is true) or truncated.
     * Truncation overwrites the corresponding element of {@code input} in place.
     *
     * @param input             text rows; over-long elements may be overwritten with their truncated form
     * @param languageCodes     language code for each row, indexed in parallel with {@code input}
     * @param multiRowBatchSize maximum number of rows in a {@code MULTI_ROW_BATCH}
     * @param maxTextBytes      maximum size of a single text, in bytes
     * @param splitLongText     true to split over-long rows, false to truncate them
     * @return the batches in input order; empty when {@code input} is empty
     * @throws Exception declared by the signature; anything thrown by the text helper methods propagates
     */
    private List<Object[]> getBatches(String[] input, String[] languageCodes, int multiRowBatchSize, int maxTextBytes, boolean splitLongText)
        throws Exception
    {
        List<Object[]> batches = new ArrayList<Object[]>();
        if (input.length == 0) {
            // nothing to batch; also avoids indexing languageCodes[0] on an empty array
            return batches;
        }
        String languageCode = languageCodes[0];
        int start = 0;        // index of the first row not yet emitted in any batch
        int rowsInBatch = 0;  // number of rows accumulated in the currently open multi-row batch
        for (int i = 0; i < input.length; i++) {
            if (rowsInBatch >= multiRowBatchSize || !languageCode.equals(languageCodes[i])) {
                // close the open batch (not including current row); skip it when it holds no rows,
                // which happens when the previous row was emitted as a split batch
                if (start < i) {
                    batches.add(new Object[] {Arrays.copyOfRange(input, start, i), "MULTI_ROW_BATCH", languageCode});
                }
                languageCode = languageCodes[i];
                start = i;
                rowsInBatch = 0;
            }
            rowsInBatch++;
            int textLength = getUtf8StringLength(input[i]);
            boolean tooLong = textLength > maxTextBytes;
            if (tooLong && !splitLongText) {
                // truncate this row in place; it stays part of the open multi-row batch
                System.out.println("Truncating long text field (" + textLength + " bytes) to " + maxTextBytes + " bytes");
                input[i] = truncateUtf8(input[i], maxTextBytes);
            }
            else if (tooLong) {
                // close off current multi-record batch before making new single record batch
                if (start < i) {
                    batches.add(new Object[] {Arrays.copyOfRange(input, start, i), "MULTI_ROW_BATCH", languageCode});
                }
                // split this row and add the text splits as a new *TEXT_SPLIT_BATCH* batch
                String[] textSplit = splitLongText(input[i], maxTextBytes);
                System.out.println("Split long text field (" + textLength + " bytes) into " + textSplit.length + " segments of under " + maxTextBytes + " bytes");
                batches.add(new Object[] {textSplit, "TEXT_SPLIT_BATCH", languageCode});
                // the next multi-row batch starts empty at the following row; its language code is
                // picked up by the roll-over check at the top of the next iteration
                start = i + 1;
                rowsInBatch = 0;
            }
        }
        // last multi-record batch
        if (start < input.length) {
            batches.add(new Object[] {Arrays.copyOfRange(input, start, input.length), "MULTI_ROW_BATCH", languageCode});
        }
        return batches;
    }