in daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala [540:665]
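  // Converts the byte(s) starting at startingBytePos0b into one printable character
  // representation. Returns a triple of (glyph string, number of bytes consumed,
  // number of display columns the glyph occupies).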
  private def convertToCharRepr(
    startingBytePos0b: Long,
    endingBytePos0b: Long,
    byteBuffer: ByteBuffer,
    decoder: Option[JavaCharsetDecoder]
  ): (String, Int, Int) = {
    Assert.invariant(decoder.map { d => Misc.isAsciiBased(d.charset()) }.getOrElse(true))
    decoder match {
      case Some(dec) => {
        val bb = ByteBuffer.allocate(6)
        var cb = CharBuffer.allocate(1)
        var cr = CoderResult.OVERFLOW
        var nConsumedBytes = 0
        var remapped = ""
        var nCols = 0
        val INVALID_CODEPOINT = -1
        val lastAvailableBytePos0b = scala.math.min(
          endingBytePos0b,
          startingBytePos0b + 5
        ) // widest possible char representation is 6 bytes.
        val nBytes = (lastAvailableBytePos0b - startingBytePos0b).toInt + 1
        Assert.invariant(nBytes > 0) // have to have at least 1 byte left
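        // Copy nBytes bytes into the 6-byte scratch buffer. Positions past the end of
        // byteBuffer (the IndexOutOfBoundsException case) are zero-filled so the decoder
        // always sees a full window, even near the end of the data.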
        (0 until nBytes).foreach { i =>
          val thePos = (startingBytePos0b + i).toInt
          Assert.invariant(thePos >= 0)
          val theByte =
            try {
              byteBuffer.get(thePos)
            } catch {
              case e: IndexOutOfBoundsException => 0.toByte
            }
          bb.put(theByte)
        }
        bb.flip()
        Assert.invariant(bb.remaining > 0)
        while (cr.isOverflow && nConsumedBytes == 0 && cb.capacity <= bb.capacity) {
          // An OVERFLOW result means the decoder started decoding at least one character
          // sequence but ran out of room in the output buffer. Either the one-char CharBuffer
          // filled up (we got our character and simply left input bytes behind), or the decoded
          // character needs more than one char (as with a 4-byte sequence that decodes to a
          // surrogate pair) and nothing could be written at all. In the first case some bytes
          // were consumed and we proceed; the leftover bytes are picked up on the next call.
          // In the second case nothing was consumed, so we retry with a larger CharBuffer until
          // we consume something or the output buffer reaches the input buffer's capacity.
          cr = dec.decode(bb, cb, true)
          nConsumedBytes = bb.position()
          if (cr.isOverflow && nConsumedBytes == 0) {
            cb = CharBuffer.allocate(cb.capacity + 1)
          }
        }
        // Once we leave the loop, we have either consumed bytes to process (possibly with
        // leftover bytes we don't care about), or a malformed/unmappable result with no
        // consumed bytes, which we do care about: in that case we do a manual replacement and
        // set the consumed-byte count ourselves. We should not use automatic replacement, as
        // it creates ambiguity between malformed/unmappable bytes and successfully consumed
        // bytes in our current one-decoded-character-at-a-time approach.
        // We should never see an underflow with no bytes consumed, since that would mean the
        // decoder needs more input than we provided. Even if we provide only 1 byte of a
        // 4-byte sequence, it returns a malformed result (of length 1).
        Assert.invariant(!(cr.isUnderflow && nConsumedBytes == 0))
        if ((cr.isMalformed || cr.isUnmappable) && nConsumedBytes == 0) {
          // do manual replacement
          remapped = dec.replacement()
          // grab the malformed/unmappable byte count so we can keep decoding past it
          nConsumedBytes = cr.length
          nCols = charNColumns(remapped(0))
        } else {
          // An overflow, at this point, means that we got our one character, but there were
          // more bytes available that could be decoded. We're not interested in those right now.
          //
          // An underflow means that we got our one character, but the bytes were exactly used
          // up by constructing that one character.
          //
          // Either way, we got our one character.
          Assert.invariant(nConsumedBytes > 0)
          Assert.invariant(cb.hasArray)
          val allChars = cb.array
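          // Two decoded chars should form a surrogate pair, which is combined into a single
          // code point below; otherwise each char is remapped and measured individually.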
          val uCodePoint: Int =
            if (allChars.length > 1) {
              if (UCharacter.isSurrogatePair(allChars(0), allChars(1))) {
                UCharacter.getCodePoint(allChars(0), allChars(1))
              } else {
                INVALID_CODEPOINT
              }
            } else allChars(0).toInt
          val (r: String, n: Int) =
            if (allChars.length > 1) {
              if (uCodePoint == INVALID_CODEPOINT) {
                allChars.map(c => homogenizeChars(c)).foldLeft(("", 0)) {
                  (accForRemappedAndNcols, tupResultRemappedAndNcols) =>
                    (
                      accForRemappedAndNcols._1 + tupResultRemappedAndNcols._1, // concat remapped value for each char
                      accForRemappedAndNcols._2 + tupResultRemappedAndNcols._2 // add width value for each char
                    )
                }
              } else {
                homogenizeChars(uCodePoint)
              }
            } else {
              homogenizeChars(Misc.remapControlOrLineEndingToVisibleGlyphs(allChars(0)))
            }
          remapped = r
          nCols = n
        }
        (remapped, nConsumedBytes, nCols)
      }
      case None => {
        // No encoding, so use the general mapping based on windows-1252, where
        // every byte corresponds to a character with a glyph.
        val byteValue =
          try {
            byteBuffer.get(startingBytePos0b.toInt)
          } catch {
            case e: IndexOutOfBoundsException => 0.toByte
          }
        // Decoding with a charset decoder might produce C0 or C1 control characters or other
        // whitespace characters, but for those we want visible glyphs no matter what.
        //
        // FIXME: This will be really broken for EBCDIC-based encodings. Pass the encoding
        // so that the glyph routine can be ascii/ebcdic sensitive.
        val remapped = Misc.remapOneByteToVisibleGlyph(byteValue)
        (remapped.toString, 1, 1)
      }
    }
  }
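
The overflow-retry loop in the Some(dec) branch is the subtle part: a 1-char CharBuffer cannot hold a character that decodes to a surrogate pair, so the decoder reports OVERFLOW without consuming any input until the output buffer is grown. The following standalone sketch is not part of Dump.scala; the object name is made up, and it uses the JDK's Character.isSurrogatePair/Character.toCodePoint in place of ICU's UCharacter, but it exercises the same java.nio.charset retry pattern on a 4-byte UTF-8 sequence:

import java.nio.{ ByteBuffer, CharBuffer }
import java.nio.charset.{ Charset, CoderResult }

object DecodeOneCharSketch {
  def main(args: Array[String]): Unit = {
    // U+1F600 is a 4-byte sequence in UTF-8 and a surrogate pair (2 chars) in UTF-16.
    val bytes = new String(Character.toChars(0x1F600)).getBytes("UTF-8")
    val bb = ByteBuffer.wrap(bytes)
    val dec = Charset.forName("UTF-8").newDecoder()

    var cb = CharBuffer.allocate(1)
    var cr: CoderResult = CoderResult.OVERFLOW
    var nConsumedBytes = 0
    // Same retry pattern as convertToCharRepr: OVERFLOW with nothing consumed
    // means the single decoded character did not fit, so grow the output buffer.
    while (cr.isOverflow && nConsumedBytes == 0 && cb.capacity <= bb.capacity) {
      cr = dec.decode(bb, cb, true)
      nConsumedBytes = bb.position()
      if (cr.isOverflow && nConsumedBytes == 0) {
        cb = CharBuffer.allocate(cb.capacity + 1)
      }
    }
    cb.flip()
    val chars = cb.array
    println(s"consumed $nConsumedBytes bytes into ${chars.length} chars") // 4 bytes, 2 chars
    if (chars.length > 1 && Character.isSurrogatePair(chars(0), chars(1))) {
      // The two chars combine back into the original code point U+1F600.
      println(f"code point: U+${Character.toCodePoint(chars(0), chars(1))}%04X")
    }
  }
}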