def remapControlOrLineEndingToVisibleGlyphs()

in daffodil-lib/src/main/scala/org/apache/daffodil/lib/util/Misc.scala [486:543]


  /**
   * Maps a character that has no visible rendering (control characters,
   * spaces, line/paragraph separators, zero-width and bi-di formatting
   * characters, the byte order mark) onto a character that does have a
   * visible glyph. Characters not covered by any rule are returned unchanged.
   *
   * @param c the character to remap
   * @param replaceControlPictures when true, an incoming character that is
   *   itself in the control-picture range this function maps onto
   *   (U+2400 through U+2423 inclusive) is replaced by the substitution
   *   glyph U+2426, so it cannot be mistaken for a remapped control
   *   character. When false (the default) such characters are preserved.
   * @return the visible stand-in for `c`, or `c` itself if no rule applies
   */
  def remapControlOrLineEndingToVisibleGlyphs(
    c: Char,
    replaceControlPictures: Boolean = false
  ): Char = {
    val URC =
      0x2426 // Unicode control picture character for substitution (also looks like arabic q-mark)
    val code: Int = c.toInt match {
      //
      // C0 Control pictures
      case n if (n <= 0x1f) => n + 0x2400
      case 0x20 => 0x2423 // For space (SP) we use ␣ (Unicode OPEN BOX)
      case 0x7f => 0x2421 // DEL pic isn't at 0x247F, it's at 0x2421
      //
      // C1 controls have no control pictures of their own.
      // We remap these into the Unicode Latin Extended B codepoints by
      // adding 0x100 to their basic value.
      //
      case n if (n >= 0x80 && n <= 0x9f) =>
        n + 0x100
      case 0xa0 => 0x2422 // non-break space => ␢ (blank symbol or little b with stroke)
      case 0xad => 0x002d // soft hyphen => hyphen
      //
      // Unicode separators & joiners
      case 0x200b => URC // zero width space
      case 0x2028 => URC // line separator
      case 0x2029 => URC // paragraph separator
      case 0x200c => URC // zero width non-joiner
      case 0x200d => URC // zero width joiner
      case 0x2060 => URC // word joiner
      // bi-di controls
      case 0x200e | 0x200f => URC
      case b if (b >= 0x202a && b <= 0x202e) => URC
      // byte order mark
      case 0xfeff => URC // ZWNBS aka Byte Order Mark
      case 0xfffe => URC // non-character FFFE (what a BOM looks like in the wrong byte order)
      case 0xffff => URC // non-character FFFF
      // we assume surrogate codepoints all have a glyph (depends on font used of course)
      //
      // TODO: this could go on and on. There's a flock of 'space' characters (EM SPACE)
      // all over the place in Unicode.
      //
      // TODO: combining characters,
      // all whitespace, zero-width, and combining/joining characters would be
      // represented by a separate glyph-character.
      //
      // Probably could be done by checking the character against some
      // unicode regex character classes like \p{M} which is the class
      // of combining mark characters
      //
      //
      // Special case - if incoming character is one of the glyph
      // characters we're remapping onto, then we could issue
      // a substitution character, but there are things that depend
      // on these being preserved. So we have a flag to control this.
      //
      // The bounds are inclusive: 0x2400 is the picture NUL maps onto and
      // 0x2423 is the picture space maps onto, so both must be covered.
      //
      case n if (replaceControlPictures && n >= 0x2400 && n <= 0x2423) => URC
      case x => x
    }
    code.toChar
  }