in candle-core/src/quantized/k_quants.rs [1643:1713]
fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
if xs.len() != ys.len() * Self::BLCK_SIZE {
crate::bail!(
"quantize_row_q6k: size mismatch {} {} {}",
xs.len(),
ys.len(),
Self::BLCK_SIZE
)
}
let mut l = [0i8; QK_K];
let mut scales = [0f32; QK_K / 16];
let mut x = xs.as_ptr();
let l = l.as_mut_ptr();
unsafe {
for y in ys.iter_mut() {
let mut max_scale = 0f32;
let mut max_abs_scale = 0f32;
for (ib, scale_) in scales.iter_mut().enumerate() {
let scale = make_qx_quants(16, 32, x.add(16 * ib), l.add(16 * ib), 1);
*scale_ = scale;
let abs_scale = scale.abs();
if abs_scale > max_abs_scale {
max_abs_scale = abs_scale;
max_scale = scale
}
}
let iscale = -128f32 / max_scale;
y.d = f16::from_f32(1.0 / iscale);
for (y_scale, scale) in y.scales.iter_mut().zip(scales.iter()) {
*y_scale = nearest_int(iscale * scale).min(127) as i8
}
for (j, &y_scale) in y.scales.iter().enumerate() {
let d = y.d.to_f32() * y_scale as f32;
if d == 0. {
continue;
}
for ii in 0..16 {
let ll = nearest_int(*x.add(16 * j + ii) / d).clamp(-32, 31);
*l.add(16 * j + ii) = (ll + 32) as i8
}
}
let mut ql = y.ql.as_mut_ptr();
let mut qh = y.qh.as_mut_ptr();
for j in (0..QK_K).step_by(128) {
for l_idx in 0..32 {
let q1 = *l.add(j + l_idx) & 0xF;
let q2 = *l.add(j + l_idx + 32) & 0xF;
let q3 = *l.add(j + l_idx + 64) & 0xF;
let q4 = *l.add(j + l_idx + 96) & 0xF;
*ql.add(l_idx) = (q1 | (q3 << 4)) as u8;
*ql.add(l_idx + 32) = (q2 | (q4 << 4)) as u8;
*qh.add(l_idx) = ((*l.add(j + l_idx) >> 4)
| ((*l.add(j + l_idx + 32) >> 4) << 2)
| ((*l.add(j + l_idx + 64) >> 4) << 4)
| ((*l.add(j + l_idx + 96) >> 4) << 6))
as u8;
}
ql = ql.add(64);
qh = qh.add(32);
}
x = x.add(QK_K)
}
}
Ok(())
}