private def convertToCharRepr()

in daffodil-io/src/main/scala/org/apache/daffodil/io/Dump.scala [540:665]


  /**
   * Builds the displayable representation of the one character that begins at
   * `startingBytePos0b` in `byteBuffer`.
   *
   * When a decoder is supplied, at most 6 bytes (and never any byte beyond
   * `endingBytePos0b`) are copied into a scratch buffer and decoded only far enough to
   * obtain a single character. Positions the buffer refuses to supply are read as zero
   * bytes. When no decoder is supplied, exactly one byte is turned into a visible glyph.
   *
   * @param startingBytePos0b zero-based position of the first byte to examine
   * @param endingBytePos0b zero-based position of the last byte that may be examined
   * @param byteBuffer the data to read; only absolute (index-based) reads are performed
   * @param decoder optional decoder; if present, its charset must be ASCII-based
   * @return a triple of (text to display, number of bytes consumed, number of columns
   *         occupied by that text)
   */
  private def convertToCharRepr(
    startingBytePos0b: Long,
    endingBytePos0b: Long,
    byteBuffer: ByteBuffer,
    decoder: Option[JavaCharsetDecoder]
  ): (String, Int, Int) = {

    Assert.invariant(decoder.map { d => Misc.isAsciiBased(d.charset()) }.getOrElse(true))

    // Absolute read that yields a zero byte for any index the buffer rejects.
    def byteAtOrZero(index: Int): Byte =
      try {
        byteBuffer.get(index)
      } catch {
        case _: IndexOutOfBoundsException => 0.toByte
      }

    decoder match {
      case None =>
        // Without an encoding, fall back to the general windows-1252 based mapping in
        // which every byte value has a glyph. Control and whitespace characters must
        // still show up as something visible.
        //
        // FIXME: This will be really broken for EBCDIC-based encodings. Pass the encoding
        // so that the glyph routine can be ascii/ebcdic sensitive.
        val glyph = Misc.remapOneByteToVisibleGlyph(byteAtOrZero(startingBytePos0b.toInt))
        (glyph.toString, 1, 1)

      case Some(dec) =>
        // The widest character representation handled here is 6 bytes, so the window of
        // candidate bytes is capped at that size.
        val bb = ByteBuffer.allocate(6)
        val windowEnd0b = endingBytePos0b.min(startingBytePos0b + 5)
        val nBytes = (windowEnd0b - startingBytePos0b).toInt + 1
        Assert.invariant(nBytes > 0) // there must be at least one byte to look at

        var offset = 0
        while (offset < nBytes) {
          val thePos = (startingBytePos0b + offset).toInt
          Assert.invariant(thePos >= 0)
          bb.put(byteAtOrZero(thePos))
          offset += 1
        }
        bb.flip()
        Assert.invariant(bb.remaining > 0)

        // Start with room for a single char. An overflow that consumed nothing means the
        // first decodable sequence did not fit in the output (e.g. a 4-byte sequence that
        // needs more than one char), so retry with one more char of room until something
        // is consumed or the output has grown past the input capacity. An overflow that
        // did consume bytes simply means there was more input than we care about.
        var cb = CharBuffer.allocate(1)
        var cr = CoderResult.OVERFLOW
        var nConsumedBytes = 0
        while (cr.isOverflow && nConsumedBytes == 0 && cb.capacity <= bb.capacity) {
          cr = dec.decode(bb, cb, true)
          nConsumedBytes = bb.position()
          if (cr.isOverflow && nConsumedBytes == 0) {
            cb = CharBuffer.allocate(cb.capacity + 1)
          }
        }

        // An underflow with nothing consumed would mean the decoder wants more input than
        // it was given; even a lone first byte of a multi-byte sequence is expected to be
        // reported as malformed instead.
        Assert.invariant(!(cr.isUnderflow && nConsumedBytes == 0))

        val nothingDecoded = (cr.isMalformed || cr.isUnmappable) && nConsumedBytes == 0
        if (nothingDecoded) {
          // Substitute the decoder's replacement text ourselves (rather than letting the
          // decoder replace automatically) and report the offending bytes as consumed so
          // the caller can continue past them.
          val substitute = dec.replacement()
          (substitute, cr.length, charNColumns(substitute(0)))
        } else {
          // Overflow here: one character was produced and further bytes were left over.
          // Underflow here: one character was produced and it used up every byte.
          // Either way exactly one character's worth of output is available.
          Assert.invariant(nConsumedBytes > 0)
          Assert.invariant(cb.hasArray)
          val decoded = cb.array

          val (text: String, width: Int) =
            if (decoded.length <= 1) {
              homogenizeChars(Misc.remapControlOrLineEndingToVisibleGlyphs(decoded(0)))
            } else if (UCharacter.isSurrogatePair(decoded(0), decoded(1))) {
              // A proper surrogate pair is treated as the single code point it encodes.
              homogenizeChars(UCharacter.getCodePoint(decoded(0), decoded(1)))
            } else {
              // Not a surrogate pair: handle each char separately, concatenating the
              // resulting text and summing the column counts.
              decoded
                .map(c => homogenizeChars(c))
                .foldLeft(("", 0)) { case ((accText, accCols), (charText, charCols)) =>
                  (accText + charText, accCols + charCols)
                }
            }
          (text, nConsumedBytes, width)
        }
    }
  }