backends/candle/src/alibi.rs (47 lines of code) (raw):
// coding=utf-8
// Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
// Copyright (c) 2023 Jina AI GmbH. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use candle::{DType, Device, Result, Tensor};
/// Returns `n` geometrically decaying slopes.
///
/// The common ratio is `2^(-2^(-(log2(n) - 3)))`, and the `i`-th entry
/// (0-based) is that ratio raised to the power `i + 1`. For `n == 8` this
/// gives `1/2, 1/4, ..., 1/256`.
///
/// The name suggests `n` is expected to be a power of two; this is not
/// checked here. `n == 0` yields an empty vector.
fn get_slopes_power_of_2(n: usize) -> Vec<f64> {
    let log_n = (n as f64).log2();
    let exponent = 2_f64.powf(-(log_n - 3_f64));
    let ratio = 2_f64.powf(-exponent);

    let mut slopes = Vec::with_capacity(n);
    for power in 0..n {
        // Kept as `ratio * ratio^power` so the rounding matches term by term.
        slopes.push(ratio * ratio.powi(power as i32));
    }
    slopes
}
/// Returns one slope per attention head (`num_attention_heads` values).
///
/// * If `num_attention_heads` is a power of two, the slopes are exactly those
///   of `get_slopes_power_of_2(num_attention_heads)`.
/// * Otherwise the slopes for the largest power of two below
///   `num_attention_heads` come first, and the remaining heads are filled with
///   every other slope (indices 0, 2, 4, ...) of the sequence computed for
///   twice that power of two.
///
/// `num_attention_heads == 0` yields an empty vector.
pub fn alibi_head_slopes(num_attention_heads: usize) -> Vec<f64> {
    if num_attention_heads.is_power_of_two() {
        return get_slopes_power_of_2(num_attention_heads);
    }

    // Not a power of two, so `next_power_of_two` is strictly greater than
    // `num_attention_heads` and its half is the largest power of two below it.
    // For 0 heads this evaluates to 0 and everything below stays empty.
    let closest_power_of_2 = num_attention_heads.next_power_of_two() / 2;

    let mut slopes = get_slopes_power_of_2(closest_power_of_2);
    slopes.extend(
        get_slopes_power_of_2(2 * closest_power_of_2)
            .into_iter()
            // Keep the even-indexed slopes only.
            .step_by(2)
            // Just enough to reach `num_attention_heads` entries in total.
            .take(num_attention_heads - closest_power_of_2),
    );
    slopes
}
/// Builds the ALiBi bias tensor of shape
/// `[1, num_heads, num_positions, num_positions]`.
///
/// Entry `[0, h, i, j]` is `-slope[h] * |j - i|`, where `slope` comes from
/// `alibi_head_slopes(num_heads)`. All intermediate tensors are created on the
/// CPU; the result is converted to `dtype` and then moved to `device` as the
/// final steps.
///
/// # Errors
///
/// Propagates any error returned by the underlying tensor operations.
pub fn build_alibi_tensor(
    num_positions: usize,
    num_heads: usize,
    device: &Device,
    dtype: DType,
) -> Result<Tensor> {
    let cpu = Device::Cpu;

    // [num_heads, 1, 1] -- negated per-head slopes
    let head_slopes = Tensor::from_vec(alibi_head_slopes(num_heads), (num_heads, 1, 1), &cpu)?;
    let negated_slopes = (head_slopes * -1_f64)?;

    // [num_positions] -- shared by the row and column axes
    let positions = Tensor::arange(0.0, num_positions as f64, &cpu)?;
    let rows = positions.unsqueeze(1)?;
    let cols = positions.unsqueeze(0)?;

    // [num_positions, num_positions] -- absolute distance |j - i|
    let distances = cols.broadcast_sub(&rows)?.abs()?;

    // [num_heads, num_positions, num_positions]
    let distances = distances
        .unsqueeze(0)?
        .expand((num_heads, num_positions, num_positions))?;
    let bias = distances.broadcast_mul(&negated_slopes)?;

    // [1, num_heads, num_positions, num_positions]
    let bias = bias.reshape((1, num_heads, num_positions, num_positions))?;
    bias.to_dtype(dtype)?.to_device(device)
}