opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java [349:400]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  private List<Tokens> tokenize(final String text) {

    final List<Tokens> t = new LinkedList<>();

    // Following the approach in the article below, we segment long input into smaller chunks
    // and feed each chunk through BERT separately. Each chunk is documentSplitSize words long
    // (e.g. 200), and adjacent chunks overlap by splitOverlapSize words (e.g. 50).
    // https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd
    final String[] whitespaceTokenized = text.split("\\s+");

    for (int start = 0; start < whitespaceTokenized.length;
         start = start + inferenceOptions.getDocumentSplitSize()) {

      // Clamp the end of the chunk to the array length so Arrays.copyOfRange
      // doesn't throw an ArrayIndexOutOfBoundsException on the final chunk.
      int end = start + inferenceOptions.getDocumentSplitSize();
      if (end > whitespaceTokenized.length) {
        end = whitespaceTokenized.length;
      }

      // The group is the chunk of text covering words [start, end).
      final String group = String.join(" ", Arrays.copyOfRange(whitespaceTokenized, start, end));

      // Overlap adjacent chunks: move start back by splitOverlapSize words before the
      // loop increment so the next chunk repeats the tail of this one.
      start = start - inferenceOptions.getSplitOverlapSize();

      // Tokenize the chunk and look up each token's id in the vocabulary.
      final String[] tokens = tokenizer.tokenize(group);

      final int[] ids = new int[tokens.length];

      for (int x = 0; x < tokens.length; x++) {
        ids[x] = vocab.get(tokens[x]);
      }
      }

      // Widen the ids to longs for the model's input tensor.
      final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray();

      // Attention mask: all 1s because the chunk contains no padding.
      final long[] mask = new long[ids.length];
      Arrays.fill(mask, 1);

      // Token type (segment) ids: all 0s for a single-sequence input.
      final long[] types = new long[ids.length];
      Arrays.fill(types, 0);

      t.add(new Tokens(tokens, lids, mask, types));

    }

    return t;

  }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
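
The split-with-overlap arithmetic above is easy to get wrong, so here is a minimal,
self-contained sketch of the same loop in isolation. The ChunkDemo class and its
parameters are hypothetical, chosen small so the overlap is visible; the real code
reads the sizes from inferenceOptions.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Hypothetical standalone demo of the split-with-overlap loop above.
public class ChunkDemo {

  static List<String> chunk(final String text, final int splitSize, final int overlap) {
    final String[] words = text.split("\\s+");
    final List<String> chunks = new ArrayList<>();
    for (int start = 0; start < words.length; start = start + splitSize) {
      // Clamp the end so the final chunk doesn't run past the array.
      final int end = Math.min(start + splitSize, words.length);
      chunks.add(String.join(" ", Arrays.copyOfRange(words, start, end)));
      // Move back by the overlap so the next chunk repeats the tail of this one.
      start = start - overlap;
    }
    return chunks;
  }

  public static void main(String[] args) {
    // Six words, split size 4, overlap 1: the chunks cover words [0,4) and [3,6),
    // so "w3" appears at the end of the first chunk and the start of the second.
    for (final String c : chunk("w0 w1 w2 w3 w4 w5", 4, 1)) {
      System.out.println(c);
    }
  }
}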



opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java [261:312]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  private List<Tokens> tokenize(final String text) {

    final List<Tokens> t = new LinkedList<>();

    // Following the approach in the article below, we segment long input into smaller chunks
    // and feed each chunk through BERT separately. Each chunk is documentSplitSize words long
    // (e.g. 200), and adjacent chunks overlap by splitOverlapSize words (e.g. 50).
    // https://medium.com/analytics-vidhya/text-classification-with-bert-using-transformers-for-long-text-inputs-f54833994dfd
    final String[] whitespaceTokenized = text.split("\\s+");

    for (int start = 0; start < whitespaceTokenized.length;
         start = start + inferenceOptions.getDocumentSplitSize()) {

      // Clamp the end of the chunk to the array length so Arrays.copyOfRange
      // doesn't throw an ArrayIndexOutOfBoundsException on the final chunk.
      int end = start + inferenceOptions.getDocumentSplitSize();
      if (end > whitespaceTokenized.length) {
        end = whitespaceTokenized.length;
      }

      // The group is the chunk of text covering words [start, end).
      final String group = String.join(" ", Arrays.copyOfRange(whitespaceTokenized, start, end));

      // Overlap adjacent chunks: move start back by splitOverlapSize words before the
      // loop increment so the next chunk repeats the tail of this one.
      start = start - inferenceOptions.getSplitOverlapSize();

      // Tokenize the chunk and look up each token's id in the vocabulary.
      final String[] tokens = tokenizer.tokenize(group);

      final int[] ids = new int[tokens.length];

      for (int x = 0; x < tokens.length; x++) {
        ids[x] = vocab.get(tokens[x]);
      }
      }

      // Widen the ids to longs for the model's input tensor.
      final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray();

      // Attention mask: all 1s because the chunk contains no padding.
      final long[] mask = new long[ids.length];
      Arrays.fill(mask, 1);

      // Token type (segment) ids: all 0s for a single-sequence input.
      final long[] types = new long[ids.length];
      Arrays.fill(types, 0);

      t.add(new Tokens(tokens, lids, mask, types));

    }

    return t;

  }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
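
The two methods above are identical apart from their enclosing class, so the duplication
could be removed by extracting the loop into a shared helper that NameFinderDL and
DocumentCategorizerDL both delegate to. The sketch below is one hypothetical shape for
that helper; the SplitTokenizer name and constructor are assumptions, not existing
opennlp-dl API, and it borrows the Tokens class and the tokenizer/vocab fields from the
surrounding code.

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import opennlp.tools.tokenize.Tokenizer;

// Hypothetical shared helper; Tokens is the class used by both files above.
public class SplitTokenizer {

  private final Tokenizer tokenizer;
  private final Map<String, Integer> vocab;
  private final int documentSplitSize;
  private final int splitOverlapSize;

  public SplitTokenizer(final Tokenizer tokenizer, final Map<String, Integer> vocab,
                        final int documentSplitSize, final int splitOverlapSize) {
    this.tokenizer = tokenizer;
    this.vocab = vocab;
    this.documentSplitSize = documentSplitSize;
    this.splitOverlapSize = splitOverlapSize;
  }

  public List<Tokens> tokenize(final String text) {
    final List<Tokens> t = new LinkedList<>();
    final String[] whitespaceTokenized = text.split("\\s+");

    for (int start = 0; start < whitespaceTokenized.length; start = start + documentSplitSize) {
      // Clamp the end of the chunk, then move start back to create the overlap.
      final int end = Math.min(start + documentSplitSize, whitespaceTokenized.length);
      final String group = String.join(" ", Arrays.copyOfRange(whitespaceTokenized, start, end));
      start = start - splitOverlapSize;

      final String[] tokens = tokenizer.tokenize(group);

      // Map tokens straight to long ids for the model input.
      final long[] lids = new long[tokens.length];
      for (int x = 0; x < tokens.length; x++) {
        lids[x] = vocab.get(tokens[x]);
      }

      final long[] mask = new long[lids.length];
      Arrays.fill(mask, 1);

      // Java zero-initializes arrays, so the all-0 token type ids need no fill.
      final long[] types = new long[lids.length];

      t.add(new Tokens(tokens, lids, mask, types));
    }

    return t;
  }
}

Each class would then construct a SplitTokenizer once and replace its private tokenize
method with a call to the shared instance.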



