rust/fury-core/src/meta/meta_string.rs (384 lines of code) (raw):
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use anyhow::anyhow;
use crate::ensure;
use crate::error::Error;
use crate::meta::string_util;
// equal to "std::i16::MAX"
const SHORT_MAX_VALUE: usize = 32767;
#[derive(Debug, PartialEq)]
pub enum Encoding {
Utf8 = 0x00,
LowerSpecial = 0x01,
LowerUpperDigitSpecial = 0x02,
FirstToLowerSpecial = 0x03,
AllToLowerSpecial = 0x04,
}
#[derive(Debug, PartialEq)]
pub struct MetaString {
pub original: String,
pub encoding: Encoding,
pub bytes: Vec<u8>,
pub strip_last_char: bool,
}
impl MetaString {
pub fn new(original: String, encoding: Encoding, bytes: Vec<u8>) -> Result<Self, Error> {
let mut strip_last_char = false;
if encoding != Encoding::Utf8 {
ensure!(!bytes.is_empty(), anyhow!("Encoded data cannot be empty"));
strip_last_char = (bytes[0] & 0x80) != 0;
}
Ok(MetaString {
original,
encoding,
bytes,
strip_last_char,
})
}
}
pub struct MetaStringDecoder {}
impl Default for MetaStringDecoder {
fn default() -> Self {
Self::new()
}
}
pub struct MetaStringEncoder {}
impl Default for MetaStringEncoder {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
struct StringStatistics {
digit_count: usize,
upper_count: usize,
can_lower_upper_digit_special_encoded: bool,
can_lower_special_encoded: bool,
}
impl MetaStringEncoder {
pub fn new() -> Self {
MetaStringEncoder {}
}
fn is_latin(&self, s: &str) -> bool {
string_util::is_latin(s)
}
pub fn encode(&self, input: &str) -> Result<MetaString, Error> {
if input.is_empty() {
return MetaString::new(input.to_string(), Encoding::Utf8, vec![]);
}
ensure!(
input.len() < SHORT_MAX_VALUE,
anyhow!(
"Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
input.len()
)
);
if !self.is_latin(input) {
return MetaString::new(input.to_string(), Encoding::Utf8, input.as_bytes().to_vec());
}
let encoding = self.compute_encoding(input);
self.encode_with_encoding(input, encoding)
}
fn compute_encoding(&self, input: &str) -> Encoding {
let statistics = self.compute_statistics(input);
if statistics.can_lower_special_encoded {
return Encoding::LowerSpecial;
}
if statistics.can_lower_upper_digit_special_encoded {
if statistics.digit_count != 0 {
return Encoding::LowerUpperDigitSpecial;
}
let upper_count: usize = statistics.upper_count;
if upper_count == 1 && input.chars().next().unwrap().is_uppercase() {
return Encoding::FirstToLowerSpecial;
}
if ((input.len() + upper_count) * 5) < (input.len() * 6) {
return Encoding::AllToLowerSpecial;
}
return Encoding::LowerUpperDigitSpecial;
}
Encoding::Utf8
}
fn compute_statistics(&self, chars: &str) -> StringStatistics {
let mut can_lower_upper_digit_special_encoded = true;
let mut can_lower_special_encoded = true;
let mut digit_count = 0;
let mut upper_count = 0;
for c in chars.chars() {
if can_lower_upper_digit_special_encoded
&& !(c.is_lowercase()
|| c.is_uppercase()
|| c.is_ascii_digit()
|| c == '.'
|| c == '_')
{
can_lower_upper_digit_special_encoded = false;
}
if can_lower_special_encoded
&& !(c.is_lowercase() || matches!(c, '.' | '_' | '$' | '|'))
{
can_lower_special_encoded = false;
}
if c.is_ascii_digit() {
digit_count += 1;
}
if c.is_uppercase() {
upper_count += 1;
}
}
StringStatistics {
digit_count,
upper_count,
can_lower_upper_digit_special_encoded,
can_lower_special_encoded,
}
}
pub fn encode_with_encoding(
&self,
input: &str,
encoding: Encoding,
) -> Result<MetaString, Error> {
if input.is_empty() {
return MetaString::new(input.to_string(), Encoding::Utf8, vec![]);
}
ensure!(
input.len() < SHORT_MAX_VALUE,
anyhow!(
"Meta string is too long, max:{SHORT_MAX_VALUE}, current:{}",
input.len()
)
);
ensure!(
encoding == Encoding::Utf8 || self.is_latin(input),
anyhow!("Non-ASCII characters in meta string are not allowed")
);
if input.is_empty() {
return MetaString::new(input.to_string(), Encoding::Utf8, vec![]);
};
match encoding {
Encoding::LowerSpecial => {
let encoded_data = self.encode_lower_special(input)?;
MetaString::new(input.to_string(), encoding, encoded_data)
}
Encoding::LowerUpperDigitSpecial => {
let encoded_data = self.encode_lower_upper_digit_special(input)?;
MetaString::new(input.to_string(), encoding, encoded_data)
}
Encoding::FirstToLowerSpecial => {
let encoded_data = self.encode_first_to_lower_special(input)?;
MetaString::new(input.to_string(), encoding, encoded_data)
}
Encoding::AllToLowerSpecial => {
let upper_count = input.chars().filter(|c| c.is_uppercase()).count();
let encoded_data = self.encode_all_to_lower_special(input, upper_count)?;
MetaString::new(input.to_string(), encoding, encoded_data)
}
Encoding::Utf8 => {
let encoded_data = input.as_bytes().to_vec();
MetaString::new(input.to_string(), Encoding::Utf8, encoded_data)
}
}
}
fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result<Vec<u8>, Error> {
let total_bits: usize = input.len() * bits_per_char as usize + 1;
let byte_length: usize = (total_bits + 7) / 8;
let mut bytes = vec![0; byte_length];
let mut current_bit = 1;
for c in input.chars() {
let value = self.char_to_value(c, bits_per_char)?;
for i in (0..bits_per_char).rev() {
if (value & (1 << i)) != 0 {
let byte_pos: usize = current_bit / 8;
let bit_pos: usize = current_bit % 8;
bytes[byte_pos] |= 1 << (7 - bit_pos);
}
current_bit += 1;
}
}
if byte_length * 8 >= total_bits + bits_per_char as usize {
bytes[0] |= 0x80;
}
Ok(bytes)
}
pub fn encode_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
self.encode_generic(input, 5)
}
pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result<Vec<u8>, Error> {
self.encode_generic(input, 6)
}
pub fn encode_first_to_lower_special(&self, input: &str) -> Result<Vec<u8>, Error> {
let mut chars: Vec<char> = input.chars().collect();
chars[0] = chars[0].to_lowercase().next().unwrap();
self.encode_generic(&chars.iter().collect::<String>(), 5)
}
pub fn encode_all_to_lower_special(
&self,
input: &str,
upper_count: usize,
) -> Result<Vec<u8>, Error> {
let mut new_chars = Vec::with_capacity(input.len() + upper_count);
for c in input.chars() {
if c.is_uppercase() {
new_chars.push('|');
new_chars.push(c.to_lowercase().next().unwrap());
} else {
new_chars.push(c);
}
}
self.encode_generic(&new_chars.iter().collect::<String>(), 5)
}
fn char_to_value(&self, c: char, bits_per_char: u8) -> Result<u8, Error> {
match bits_per_char {
5 => match c {
'a'..='z' => Ok(c as u8 - b'a'),
'.' => Ok(26),
'_' => Ok(27),
'$' => Ok(28),
'|' => Ok(29),
_ => Err(anyhow!(
"Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {c}"
))?,
},
6 => match c {
'a'..='z' => Ok(c as u8 - b'a'),
'A'..='Z' => Ok(c as u8 - b'A' + 26),
'0'..='9' => Ok(c as u8 - b'0' + 52),
_ => {
if c == '.' {
Ok(62)
} else if c == '_' {
Ok(63)
} else {
Err(anyhow!(
"Invalid character value for LOWER_SPECIAL decoding: {c:?}"
))?
}
}
},
_ => unreachable!(),
}
}
}
impl MetaStringDecoder {
pub fn new() -> Self {
MetaStringDecoder {}
}
pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result<String, Error> {
if encoded_data.is_empty() {
return Ok("".to_string());
}
match encoding {
Encoding::LowerSpecial => self.decode_lower_special(encoded_data),
Encoding::LowerUpperDigitSpecial => self.decode_lower_upper_digit_special(encoded_data),
Encoding::FirstToLowerSpecial => self.decode_rep_first_lower_special(encoded_data),
Encoding::AllToLowerSpecial => self.decode_rep_all_to_lower_special(encoded_data),
Encoding::Utf8 => Ok(String::from_utf8_lossy(encoded_data).into_owned()),
}
}
fn decode_lower_special(&self, data: &[u8]) -> Result<String, Error> {
let mut decoded = String::new();
let total_bits: usize = data.len() * 8;
let strip_last_char = (data[0] & 0x80) != 0;
let bit_mask: usize = 0b11111;
let mut bit_index = 1;
while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits))
{
let byte_index = bit_index / 8;
let intra_byte_index = bit_index % 8;
let char_value: usize = if intra_byte_index > 3 {
((((data[byte_index] as usize) << 8)
| if byte_index + 1 < data.len() {
data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
} else {
0
})
>> (11 - intra_byte_index))
& bit_mask
} else {
((data[byte_index] as usize) >> (3 - intra_byte_index)) & bit_mask
};
bit_index += 5;
decoded.push(self.decode_lower_special_char(char_value as u8)?);
}
Ok(decoded)
}
fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result<String, Error> {
let mut decoded = String::new();
let num_bits = data.len() * 8;
let strip_last_char = (data[0] & 0x80) != 0;
let mut bit_index = 1;
let bit_mask: usize = 0b111111;
while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 > num_bits)) {
let byte_index = bit_index / 8;
let intra_byte_index = bit_index % 8;
let char_value: usize = if intra_byte_index > 2 {
((((data[byte_index] as usize) << 8)
| if byte_index + 1 < data.len() {
data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF
} else {
0
})
>> (10 - intra_byte_index))
& bit_mask
} else {
((data[byte_index] as usize) >> (2 - intra_byte_index)) & bit_mask
};
bit_index += 6;
decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?);
}
Ok(decoded)
}
fn decode_lower_special_char(&self, char_value: u8) -> Result<char, Error> {
match char_value {
0..=25 => Ok((b'a' + char_value) as char),
26 => Ok('.'),
27 => Ok('_'),
28 => Ok('$'),
29 => Ok('|'),
_ => Err(anyhow!(
"Invalid character value for LOWER_SPECIAL decoding: {char_value}"
))?,
}
}
fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result<char, Error> {
match char_value {
0..=25 => Ok((b'a' + char_value) as char),
26..=51 => Ok((b'A' + char_value - 26) as char),
52..=61 => Ok((b'0' + char_value - 52) as char),
62 => Ok('.'),
63 => Ok('_'),
_ => Err(anyhow!(
"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {char_value}"
))?,
}
}
fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result<String, Error> {
let decoded_str = self.decode_lower_special(data)?;
let mut chars = decoded_str.chars();
match chars.next() {
Some(first_char) => {
let mut result = first_char.to_ascii_uppercase().to_string();
result.extend(chars);
Ok(result)
}
None => Ok(decoded_str),
}
}
fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result<String, Error> {
let decoded_str = self.decode_lower_special(data)?;
let mut result = String::new();
let mut skip = false;
for (i, char) in decoded_str.chars().enumerate() {
if skip {
skip = false;
continue;
}
// Encounter a '|', capitalize the next character
// and skip the following character.
if char == '|' {
if let Some(next_char) = decoded_str.chars().nth(i + 1) {
result.push(next_char.to_ascii_uppercase());
}
skip = true;
} else {
result.push(char);
}
}
Ok(result)
}
}