in daffodil-lib/src/main/scala/org/apache/daffodil/lib/util/Misc.scala [486:543]
/**
 * Maps a character that normally has no visible rendering (control codes,
 * spaces, invisible separators and joiners, bi-di controls, non-characters)
 * onto a stand-in character that does have a glyph.
 *
 * Characters that match none of the rules below are returned unchanged.
 *
 * @param c the character to remap
 * @param replaceControlPictures when true, an incoming character that is
 *   already in the control-picture range this function maps onto
 *   (U+2400 through U+2423, inclusive) is replaced by the substitution glyph
 *   U+2426. When false (the default) such characters are passed through
 *   unchanged, because some callers depend on them being preserved.
 * @return the visible stand-in for `c`, or `c` itself if no rule applies
 */
def remapControlOrLineEndingToVisibleGlyphs(
  c: Char,
  replaceControlPictures: Boolean = false
): Char = {
  // Unicode control picture character for substitution
  // (also looks like an arabic question mark).
  val substituteGlyph = 0x2426
  val code: Int = c.toInt match {
    //
    // C0 Control pictures
    case n if (n <= 0x1f) => n + 0x2400
    case 0x20 => 0x2423 // for space we use ␣ (Unicode OPEN BOX) rather than the SP picture
    case 0x7f => 0x2421 // DEL pic isn't at 0x247F, it's at 0x2421
    //
    // C1 controls: we remap these into the Unicode Latin Extended B
    // codepoints by adding 0x100 to their basic value.
    //
    case n if (n >= 0x80 && n <= 0x9f) => n + 0x100
    case 0xa0 => 0x2422 // non-break space => ␢ (blank symbol or little b with stroke)
    case 0xad => 0x002d // soft hyphen => hyphen
    //
    // Unicode separators & joiners
    case 0x200b => substituteGlyph // zero width space
    case 0x2028 => substituteGlyph // line separator
    case 0x2029 => substituteGlyph // paragraph separator
    case 0x200c => substituteGlyph // zero width non-joiner
    case 0x200d => substituteGlyph // zero width joiner
    case 0x2060 => substituteGlyph // word joiner
    // bi-di controls
    case 0x200e | 0x200f => substituteGlyph
    case b if (b >= 0x202a && b <= 0x202e) => substituteGlyph
    // byte order mark
    //
    // The BOM / zero width no-break space is U+FEFF. U+FFFE is what a BOM
    // looks like when read with the wrong byte order; it is a non-character.
    // Neither has a glyph, so both are substituted.
    case 0xfeff => substituteGlyph // ZWNBSP aka Byte Order Mark
    case 0xfffe => substituteGlyph // non-character FFFE (byte-swapped BOM)
    case 0xffff => substituteGlyph // non-character FFFF
    // we assume surrogate codepoints all have a glyph (depends on font used of course)
    //
    // TODO: this could go on and on. There's a flock of 'space' characters (EM SPACE)
    // all over the place in Unicode.
    //
    // TODO: combining characters,
    // all whitespace, zero-width, and combining/joining characters would be
    // represented by a separate glyph-character.
    //
    // Probably could be done by checking the character against some
    // unicode regex character classes like \p{M} which is the class
    // of combining mark characters
    //
    //
    // Special case - if incoming character is one of the glyph
    // characters we're remapping onto, then we could issue
    // a substitution character, but there are things that depend
    // on these being preserved. So we have a flag to control this.
    //
    // The bounds are inclusive: U+2400 is the target for NUL and U+2423 is
    // the target for space, so both ends are glyphs we remap onto.
    //
    case n if (replaceControlPictures && n >= 0x2400 && n <= 0x2423) => substituteGlyph
    case x => x
  }
  code.toChar
}