in daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/PUARemappers.scala [46:104]
/**
 * Maps one UTF-16 code unit to the code to emit in its place, moving XML-illegal
 * characters into the Unicode Private Use Area (PUA, U+E000..U+F8FF).
 *
 * Mapping rules, in the order they are tried:
 *  - TAB (0x9) and LF (0xA) are returned unchanged.
 *  - CR (0xD) becomes LF when `replaceCRWithLF` is set, otherwise U+E00D.
 *    For a CR that is immediately followed by LF, with `replaceCRWithLF` set,
 *    the result is the negated value -0xA.
 *  - Any other code below 0x20 is shifted up by 0xE000.
 *  - A surrogate that has its matching partner (high followed by low, or low
 *    preceded by high) is returned unchanged; an isolated surrogate is shifted
 *    up by 0x1000.
 *  - A code already in U+E000..U+F8FF is returned unchanged, unless
 *    `checkForExistingPUA` is set, in which case an exception is thrown.
 *  - U+FFFE and U+FFFF map to U+F0FE and U+F0FF respectively.
 *  - Everything else is returned unchanged.
 *
 * @param prev the code unit preceding `curr`; consulted only for surrogate pairing
 * @param curr the code unit being remapped
 * @param next the code unit following `curr`; consulted for CRLF detection and
 *             surrogate pairing
 * @return the replacement code; negative (-0xA) only in the CRLF => LF case
 * @throws RemapPUACharDetected if `curr` is in U+E000..U+F8FF and
 *                              `checkForExistingPUA` is true
 */
override protected def remap(prev: Char, curr: Char, next: Char): Int = {
  curr match {
    case 0x9 => curr
    case 0xa => curr
    case 0xd =>
      if (!replaceCRWithLF) {
        // Remap CR to preserve it, whether isolated or part of CRLF.
        // In the CRLF case the LF is left alone.
        0xe00d
      } else if (next == 0xa) {
        // CRLF => LF, standard XML behavior. Note negated.
        // NOTE(review): the negation is presumably the signal that tells the
        // caller to drop the following LF -- confirm against the base class.
        -0xa
      } else {
        // isolated CR => LF, standard XML behavior. Note NOT negated.
        0xa
      }
    case _ if (curr < 0x20) => curr + 0xe000 // ascii c0 controls
    // no remapping for the so called C1 controls (0x80-0x9F) Those are not XML illegal.
    case _ if Character.isSurrogate(curr) => {
      val isPaired =
        (Character.isHighSurrogate(curr) && Character.isLowSurrogate(next)) ||
          (Character.isLowSurrogate(curr) && Character.isHighSurrogate(prev))
      if (isPaired) {
        // well formed surrogate pairs are preserved
        curr
      } else {
        // curr is an isolated surrogate, so to preserve we must remap to PUA
        // (0xD800..0xDFFF + 0x1000 lands in 0xE800..0xEFFF)
        curr + 0x1000
      }
    }
    case _ if (curr >= 0xe000 && curr <= 0xf8ff) => { // Unicode PUA is E000 to F8FF.
      if (checkForExistingPUA)
        throw new RemapPUACharDetected(curr)
      else curr
    }
    case _ if (curr < 0xfffe) => curr
    // 0xFFFE and 0xFFFF are regular Unicode chars, but XML illegal.
    // (XML only allows up to 0xFFFD)
    // They can't remap into the PUA by the basic techniques of adding
    // 0xE000 or 0x1000 like with control chars or unpaired surrogate code points.
    // So we just pick two adhoc, but recognizable, PUA code points to use by subtracting
    // 0x0F00 from them.
    case 0xfffe =>
      0xf0fe // U+FFFE is not a legal XML char. Can't remap to PUA the regular way.
    case 0xffff => 0xf0ff // U+FFFF is not a legal XML char
    case bad =>
      // $COVERAGE-OFF$
      // This is a final class, so this only gets called with characters by the
      // base class remap(s: String) method. Those chars are only
      // taken from Scala/Java strings, hence, the char codes cannot be beyond 0xFFFF
      //
      // The value is widened to Int before formatting: %X does not accept a
      // boxed Character and would throw IllegalFormatConversionException,
      // hiding the real failure. %04X prints it as zero-padded 4-digit hex.
      Assert.impossibleCase(
        "Scala/Java character code cannot be beyond 0xFFFF but was 0x%04X".format(bad.toInt)
      )
    // $COVERAGE-ON$
  }
}