in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java [388:534]
/**
 * Handles a single record from the workbook stream by dispatching on its sid.
 * <p>
 * Depending on the record type this starts or finishes a sheet, remembers
 * sheet names and the shared string table, or turns the record's content
 * into a cell via {@code addCell}/{@code addTextCell}. Record types without
 * a matching case are ignored.
 * <p>
 * After every record, {@code previousSid} is updated to the sid of this
 * record, and {@code stringFormulaRecord} is cleared unless it was set to
 * this very record (so it only survives until the immediately following
 * record).
 *
 * @param record the record to process
 * @throws SAXException  may be propagated from the helper methods invoked here
 * @throws TikaException may be propagated from the helper methods invoked here
 * @throws IOException   may be propagated from the helper methods invoked here
 */
private void internalProcessRecord(Record record)
        throws SAXException, TikaException, IOException {
    switch (record.getSid()) {
        case BOFRecord.sid: // start of workbook, worksheet etc. records
            BOFRecord bof = (BOFRecord) record;
            if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
                currentSheetIndex = -1;
            } else if (bof.getType() == BOFRecord.TYPE_CHART) {
                if (previousSid == EOFRecord.sid) {
                    // This is a sheet which contains only a chart
                    newSheet();
                } else {
                    // This is a chart within a normal sheet
                    // Handling of this is a bit hacky...
                    if (currentSheet != null) {
                        // Flush what has been collected so far, then step the
                        // index back before newSheet() so the chart content is
                        // collected under the same sheet index
                        // NOTE(review): assumes newSheet() advances
                        // currentSheetIndex -- confirm against its definition
                        processSheet();
                        currentSheetIndex--;
                        newSheet();
                    }
                }
            } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
                newSheet();
            }
            break;
        case EOFRecord.sid: // end of workbook, worksheet etc. records
            if (currentSheet != null) {
                processSheet();
            }
            currentSheet = null;
            break;
        case BoundSheetRecord.sid: // Worksheet index record
            BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
            sheetNames.add(boundSheetRecord.getSheetname());
            break;
        case SSTRecord.sid: // holds all the strings for LabelSSTRecords
            sstRecord = (SSTRecord) record;
            break;
        case FormulaRecord.sid: // Cell value from a formula
            FormulaRecord formula = (FormulaRecord) record;
            if (formula.hasCachedResultString()) {
                // The String itself should be the next record; remember the
                // formula so the StringRecord case can attach the text to it
                stringFormulaRecord = formula;
            } else {
                addTextCell(record, formatListener.formatNumberDateCell(formula));
            }
            break;
        case StringRecord.sid:
            if (previousSid == FormulaRecord.sid) {
                // Cached string value of a string formula
                StringRecord sr = (StringRecord) record;
                addTextCell(stringFormulaRecord, sr.getString());
            } else {
                // Some other string not associated with a cell, skip
            }
            break;
        case LabelRecord.sid: // strings stored directly in the cell
            LabelRecord label = (LabelRecord) record;
            addTextCell(record, label.getValue());
            break;
        case LabelSSTRecord.sid: // Ref. a string in the shared string table
            LabelSSTRecord sst = (LabelSSTRecord) record;
            // sstRecord is only assigned once an SSTRecord has been seen, so a
            // LabelSSTRecord arriving earlier has nothing to be resolved against
            UnicodeString unicode =
                    (sstRecord == null) ? null : sstRecord.getString(sst.getSSTIndex());
            if (unicode != null) {
                String cellString = null;
                if (officeParserConfig.isConcatenatePhoneticRuns()) {
                    // Append the phonetic text only when it is present and non-blank
                    String phonetic = (unicode.getExtendedRst() != null &&
                            unicode.getExtendedRst().getPhoneticText() != null &&
                            !unicode.getExtendedRst().getPhoneticText().isBlank()) ?
                            unicode.getExtendedRst().getPhoneticText() : "";
                    cellString = unicode.getString() + " " + phonetic;
                } else {
                    cellString = unicode.getString();
                }
                addTextCell(record, cellString);
            }
            // else: the string could not be resolved; skip this cell instead of
            // failing the whole extraction with a NullPointerException
            break;
        case NumberRecord.sid: // Contains a numeric cell value
            NumberRecord number = (NumberRecord) record;
            addTextCell(record, formatListener.formatNumberDateCell(number));
            break;
        case RKRecord.sid: // Excel internal number record
            RKRecord rk = (RKRecord) record;
            addCell(record, new NumberCell(rk.getRKNumber(), format));
            break;
        case HyperlinkRecord.sid: // holds a URL associated with a cell
            if (currentSheet != null) {
                HyperlinkRecord link = (HyperlinkRecord) record;
                // Look up the cell already collected at the link's first column/row
                Point point = new Point(link.getFirstColumn(), link.getFirstRow());
                Cell cell = currentSheet.get(point);
                if (cell != null) {
                    String address = link.getAddress();
                    if (address != null) {
                        addCell(record, new LinkedCell(cell, address));
                    } else {
                        addCell(record, cell);
                    }
                }
            }
            break;
        case TextObjectRecord.sid:
            if (extractor.officeParserConfig.isIncludeShapeBasedContent()) {
                TextObjectRecord tor = (TextObjectRecord) record;
                addTextCell(record, tor.getStr().getString());
            }
            break;
        case SeriesTextRecord.sid: // Chart label or title
            SeriesTextRecord str = (SeriesTextRecord) record;
            addTextCell(record, str.getText());
            break;
        case DrawingGroupRecord.sid:
            // Collect this now, we'll process later when all
            // the continue records are in
            drawingGroups.add((DrawingGroupRecord) record);
            break;
        case HeaderRecord.sid:
            if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                HeaderRecord headerRecord = (HeaderRecord) record;
                addTextCell(record, headerRecord.getText());
            }
            break;
        case FooterRecord.sid:
            if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                FooterRecord footerRecord = (FooterRecord) record;
                addTextCell(record, footerRecord.getText());
            }
            break;
    }
    // Remember what was just seen; the BOF and StringRecord cases depend on it
    previousSid = record.getSid();
    // Keep the pending string formula only if this record just set it, so it
    // is available to exactly the next record and dropped afterwards
    if (stringFormulaRecord != record) {
        stringFormulaRecord = null;
    }
}