in pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java [96:319]
/**
 * Parses the next token of the content stream.
 *
 * <p>Leading spaces are skipped and the first remaining character decides what is read:
 * <ul>
 * <li>{@code <<} starts a dictionary, a single {@code <} or a {@code (} starts a string</li>
 * <li>{@code [} starts an array, {@code /} starts a name</li>
 * <li>{@code n}, {@code t} and {@code f} start one of the keywords {@code null}, {@code true},
 * {@code false}, or otherwise an operator</li>
 * <li>a digit, {@code +}, {@code -} or {@code .} starts a number</li>
 * <li>{@code B} starts an operator; for the begin-inline-image operator the image parameters
 * and the image data are collected as well</li>
 * <li>{@code I} starts the inline image data operator, which carries the raw image bytes</li>
 * <li>a stray {@code ]} yields {@link COSNull#NULL}</li>
 * <li>anything else is read as an operator</li>
 * </ul>
 *
 * <p>The parser is closed when the end of the source is reached, when an invalid dictionary or
 * array is encountered, and when a token starting with {@code I} is not the inline image data
 * operator.
 *
 * @return the token read, or {@code null} if the source is closed or at EOF, if an invalid
 * dictionary or array stopped the parsing, or if the operator that was read is empty.
 * @throws IOException if reading from the source fails or if a token starting with {@code I}
 * is not the inline image data operator.
 */
public Object parseNextToken() throws IOException
{
    if (source.isClosed())
    {
        return null;
    }
    skipSpaces();
    if (source.isEOF())
    {
        close();
        return null;
    }
    char c = (char) source.peek();
    switch (c)
    {
        case '<':
            return parseDictionaryOrStringToken();
        case '[':
            return parseArrayToken();
        case '(':
            // string
            return parseCOSString();
        case '/':
            // name
            return parseCOSName();
        case 'n':
        case 't':
        case 'f':
            return parseKeywordOrOperatorToken();
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '-':
        case '+':
        case '.':
            return parseNumberToken(c);
        case 'B':
            return parseInlineImageToken();
        case 'I':
            return parseInlineImageDataToken();
        case ']':
            // some ']' around without its previous '['
            // this means a PDF is somewhat corrupt but we will continue to parse.
            source.read();
            // must be a better solution than null...
            return COSNull.NULL;
        default:
            // we must be an operator
            String operator = readOperator().trim();
            return operator.isEmpty() ? null : Operator.getOperator(operator);
    }
}

/**
 * Handles a token starting with {@code <}: a dictionary if a second {@code <} follows, a string
 * otherwise.
 *
 * @return the dictionary or string, or {@code null} if the dictionary could not be parsed; in
 * that case the parser has been closed.
 * @throws IOException if reading from the source fails.
 */
private Object parseDictionaryOrStringToken() throws IOException
{
    // pull off first left bracket
    source.read();
    // check for second left bracket
    char second = (char) source.peek();
    // put back first bracket
    source.rewind(1);
    if (second != '<')
    {
        return parseCOSString();
    }
    try
    {
        return parseCOSDictionary(true);
    }
    catch (IOException exception)
    {
        // the exception is handed to the logger so that the reason for giving up is not lost
        LOG.warn("Stop reading invalid dictionary from content stream at offset {}",
                source.getPosition(), exception);
        close();
        return null;
    }
}

/**
 * Handles a token starting with {@code [}.
 *
 * @return the array, or {@code null} if the array could not be parsed; in that case the parser
 * has been closed.
 * @throws IOException if reading from the source fails.
 */
private Object parseArrayToken() throws IOException
{
    try
    {
        return parseCOSArray();
    }
    catch (IOException exception)
    {
        // the exception is handed to the logger so that the reason for giving up is not lost
        LOG.warn("Stop reading invalid array from content stream at offset {}",
                source.getPosition(), exception);
        close();
        return null;
    }
}

/**
 * Handles a token starting with {@code n}, {@code t} or {@code f}.
 *
 * @return {@link COSNull#NULL}, {@link COSBoolean#TRUE} or {@link COSBoolean#FALSE} for the
 * keywords {@code null}, {@code true} and {@code false}; otherwise the operator with the name
 * that was read.
 * @throws IOException if reading from the source fails.
 */
private Object parseKeywordOrOperatorToken() throws IOException
{
    String token = readString();
    if ("null".equals(token))
    {
        return COSNull.NULL;
    }
    if ("true".equals(token))
    {
        return COSBoolean.TRUE;
    }
    if ("false".equals(token))
    {
        return COSBoolean.FALSE;
    }
    return Operator.getOperator(token);
}

/**
 * Handles a token starting with a digit, a sign or a dot. Only one "." is accepted and a "-"
 * after the first character is consumed but not added to the number.
 *
 * @param first the first character of the token, which has been peeked but not yet consumed.
 * @return the number, or {@link COSNull#NULL} if the token consists of an isolated "+".
 * @throws IOException if reading from the source fails or the collected text cannot be
 * converted by {@code COSNumber.get}.
 */
private Object parseNumberToken(char first) throws IOException
{
    StringBuilder digits = new StringBuilder();
    digits.append(first);
    source.read();
    // Ignore double negative (this is consistent with Adobe Reader)
    if (first == '-' && source.peek() == first)
    {
        source.read();
    }
    boolean dotAllowed = first != '.';
    char next = (char) source.peek();
    while (Character.isDigit(next) || (dotAllowed && next == '.') || next == '-')
    {
        if (next == '.')
        {
            // a dot only gets here while it is still allowed, so this is the one and only dot
            dotAllowed = false;
        }
        if (next != '-')
        {
            // PDFBOX-4064: ignore "-" in the middle of a number
            digits.append(next);
        }
        source.read();
        next = (char) source.peek();
    }
    String number = digits.toString();
    if ("+".equals(number))
    {
        // PDFBOX-5906
        LOG.warn("isolated '+' is ignored");
        return COSNull.NULL;
    }
    return COSNumber.get(number);
}

/**
 * Handles a token starting with {@code B}. For the begin-inline-image operator the following
 * name/value pairs are stored as image parameters, and if the token ending the pairs is an
 * operator, its image data is copied to the returned operator.
 *
 * @return the operator with the name that was read.
 * @throws IOException if reading from the source fails.
 */
private Operator parseInlineImageToken() throws IOException
{
    String operatorName = readString();
    Operator beginImageOP = Operator.getOperator(operatorName);
    if (!operatorName.equals(OperatorName.BEGIN_INLINE_IMAGE))
    {
        return beginImageOP;
    }
    COSDictionary imageParams = new COSDictionary();
    beginImageOP.setImageParameters(imageParams);
    Object token = parseNextToken();
    while (token instanceof COSName)
    {
        Object value = parseNextToken();
        if (!(value instanceof COSBase))
        {
            LOG.warn("Unexpected token in inline image dictionary at offset {}",
                    source.isClosed() ? "EOF" : source.getPosition());
            // token stays a COSName here, so no image data is attached below
            break;
        }
        imageParams.setItem((COSName) token, (COSBase) value);
        token = parseNextToken();
    }
    //final token will be the image data, maybe??
    if (token instanceof Operator)
    {
        Operator imageData = (Operator) token;
        if (imageData.getImageData() == null || imageData.getImageData().length == 0)
        {
            LOG.warn("empty inline image at stream offset {}",
                    source.getPosition());
        }
        beginImageOP.setImageData(imageData.getImageData());
    }
    return beginImageOP;
}

/**
 * Handles a token starting with {@code I}, which has to be the inline image data operator. The
 * bytes following the operator are collected and stored as image data of the returned operator.
 *
 * @return the inline image data operator holding the collected bytes.
 * @throws IOException if the two characters read are not the inline image data operator (the
 * parser is closed before throwing) or if reading from the source fails.
 */
private Operator parseInlineImageDataToken() throws IOException
{
    //Special case for ID operator
    String id = Character.toString((char) source.read()) + (char) source.read();
    if (!id.equals(OperatorName.BEGIN_INLINE_IMAGE_DATA))
    {
        long currentPosition = source.getPosition();
        close();
        throw new IOException( "Error: Expected operator 'ID' actual='" + id +
                "' at stream offset " + currentPosition);
    }
    ByteArrayOutputStream imageData = new ByteArrayOutputStream();
    if (isWhitespace())
    {
        //pull off the whitespace character
        source.read();
    }
    int lastByte = source.read();
    int currentByte = source.read();
    // PDF spec is kinda unclear about this. Should a whitespace
    // always appear before EI? Not sure, so that we just read
    // until EI<whitespace>.
    // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561
    while (!(lastByte == 'E' &&
             currentByte == 'I' &&
             hasNextSpaceOrReturn() &&
             hasNoFollowingBinData()) &&
           !isEOF())
    {
        imageData.write(lastByte);
        lastByte = currentByte;
        currentByte = source.read();
    }
    // the EI operator isn't unread, as it won't be processed anyway
    // NOTE(review): the two bytes read last are never written; if the loop ended at EOF instead
    // of at EI, these are dropped as well - confirm this is intended for truncated streams
    Operator beginImageDataOP = Operator
            .getOperator(OperatorName.BEGIN_INLINE_IMAGE_DATA);
    // save the image data to the operator, so that it can be accessed later
    beginImageDataOP.setImageData(imageData.toByteArray());
    return beginImageDataOP;
}