Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Support for mixed quantized BitNet Architecture Inference #2683

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ serde = { version = "1.0.171", features = ["derive"] }
serde_plain = "1.0.2"
serde_json = "1.0.99"
thiserror = "1"
tokenizers = { version = "0.19.1", default-features = false }
tokenizers = { version = "0.21.0", default-features = false }
tracing = "0.1.37"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.7"
Expand Down
12 changes: 12 additions & 0 deletions candle-core/src/quantized/ggml_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,18 @@ pub fn qtensor_from_ggml(
GgmlDType::Q6K => {
from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q8K => {
from_raw_data::<k_quants::BlockQ8K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q2b0 => {
from_raw_data::<k_quants::BlockQ2b0>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::QI8 => {
from_raw_data::<k_quants::BlockQI8>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q2b1 => {
from_raw_data::<k_quants::BlockQ2b1>(raw_data, size_in_bytes, dims, device)
}
_ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
}
}
Expand Down
22 changes: 21 additions & 1 deletion candle-core/src/quantized/gguf_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,26 @@ impl Value {
}
}

/// Creates a value of the `U8` variant that holds `v`.
pub fn from_u8(v: u8) -> Self {
Self::U8(v)
}

/// Creates a value of the `U64` variant that holds `v`.
pub fn from_u64(v: u64) -> Self {
Self::U64(v)
}

/// Creates a value of the `U32` variant that holds `v`.
pub fn from_u32(v: u32) -> Self {
Self::U32(v)
}

/// Creates a value of the `F32` variant that holds `v`.
pub fn from_f32(v: f32) -> Self {
Self::F32(v)
}

/// Creates a value of the `String` variant, taking ownership of `v`.
pub fn from_string(v: String) -> Self {
Self::String(v)
}

pub fn to_u8(&self) -> Result<u8> {
match self {
Self::U8(v) => Ok(*v),
Expand Down Expand Up @@ -489,7 +509,7 @@ fn write_string<W: std::io::Write>(w: &mut W, str: &str) -> Result<()> {

pub fn write<W: std::io::Seek + std::io::Write>(
w: &mut W,
metadata: &[(&str, &Value)],
metadata: &[(&str, Value)],
tensors: &[(&str, &QTensor)],
) -> Result<()> {
w.write_u32::<LittleEndian>(0x46554747)?;
Expand Down
Loading