in community/document-parsers/spring-ai-alibaba-starter-document-parser-bibtex/src/main/java/com/alibaba/cloud/ai/parser/bibtex/BibtexDocumentParser.java [90:161]
/**
 * Parses a BibTeX stream and produces one {@link Document} per BibTeX entry.
 *
 * <p>For every entry the metadata map holds the entry type mapped to the entry key,
 * plus each field name mapped to its user-readable value. The document text starts
 * with the {@code abstract} field (empty when absent). When the entry has a
 * {@code file} field, each referenced file is loaded from the classpath, handed to
 * the configured {@code parser}, and the text of the first resulting document is
 * appended. The text is finally cut to {@code maxContentChars} when that limit is
 * set to a positive value.
 *
 * <p>When {@code maxDocs} is set to a positive value, only the first {@code maxDocs}
 * entries (in the iteration order of the parsed database) are converted, and the
 * returned list keeps that order.
 * @param inputStream the BibTeX content; read using {@code charsetName}
 * @return the parsed documents, empty when the database contains no entries
 * @throws RuntimeException if reading or parsing the stream fails; the original
 * exception is attached as the cause
 */
public List<Document> parse(InputStream inputStream) {
    try (Reader reader = new InputStreamReader(inputStream, charsetName)) {
        List<Document> documentList = new ArrayList<>(10);
        BibTeXParser bibtexParser = new BibTeXParser();
        BibTeXDatabase database = bibtexParser.parse(reader);
        Map<Key, BibTeXEntry> entries = database.getEntries();
        if (entries.isEmpty()) {
            return documentList;
        }

        // Truncate into a List rather than re-collecting into a HashMap: a HashMap
        // gives no ordering guarantee, so the documents would no longer follow the
        // iteration order of the parsed entries.
        long entryLimit = (maxDocs != null && maxDocs > 0) ? maxDocs : entries.size();
        List<BibTeXEntry> selectedEntries = entries.values()
            .stream()
            .limit(entryLimit)
            .collect(Collectors.toList());

        // One loader is enough for the whole call; it is only used for classpath lookups.
        DefaultResourceLoader resourceLoader = new DefaultResourceLoader();

        for (BibTeXEntry entry : selectedEntries) {
            Map<String, Object> metadata = new HashMap<>();
            metadata.put(entry.getType().getValue(), entry.getKey());
            for (Map.Entry<Key, Value> field : entry.getFields().entrySet()) {
                metadata.put(field.getKey().getValue(), field.getValue().toUserString());
            }

            List<String> fileNames = new ArrayList<>();
            if (metadata.containsKey("file")) {
                String fileValue = metadata.get("file").toString();
                if (!Objects.isNull(filePattern)) {
                    // A configured pattern decides what counts as a file name.
                    Matcher matcher = filePattern.matcher(fileValue);
                    while (matcher.find()) {
                        fileNames.add(matcher.group());
                    }
                }
                else {
                    // Default: names separated by semicolons, commas or whitespace.
                    Collections.addAll(fileNames, fileValue.split("[;,\\s]+"));
                }
            }

            StringBuilder content = new StringBuilder(metadata.getOrDefault("abstract", "").toString());
            for (String fileName : fileNames) {
                // split() yields an empty token for an empty value or a leading
                // delimiter; looking that up would resolve "classpath:/" itself.
                if (fileName.trim().isEmpty()) {
                    continue;
                }
                try (InputStream fileInputStream = resourceLoader.getResource("classpath:/" + fileName)
                    .getInputStream()) {
                    List<Document> docs = parser.parse(fileInputStream);
                    if (!docs.isEmpty()) {
                        content.append(docs.get(0).getText());
                    }
                }
                catch (IOException e) {
                    // Log the exception and continue with the next file
                    logger.warn("Failed to read file: {}", fileName, e);
                }
            }

            if (maxContentChars != null && maxContentChars > 0 && content.length() > maxContentChars) {
                content.setLength(maxContentChars);
            }
            documentList.add(new Document(content.toString(), metadata));
        }
        return documentList;
    }
    catch (Exception e) {
        logger.error("Error parsing input stream", e);
        throw new RuntimeException("Error parsing input stream", e);
    }
}