From e768d9a329f111990e52b6d82d46546162cde159 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 1 Aug 2024 13:06:23 +0800 Subject: [PATCH] =?UTF-8?q?refactor(tokenizer):=20=E6=95=B4=E7=90=86=20tok?= =?UTF-8?q?enizer=EF=BC=8C=E6=B7=BB=E5=8A=A0=E6=96=B0=20bpe=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- Cargo.lock | 1 - tokenizer/Cargo.toml | 1 - tokenizer/src/bpe.rs | 71 +++++++++++++++++++------------------- tokenizer/src/lib.rs | 7 ++-- tokenizer/src/new_bpe.rs | 65 ++++++++++++++++++++++++++++++++++ tokenizer/src/vocab_txt.rs | 3 +- 6 files changed, 106 insertions(+), 42 deletions(-) create mode 100644 tokenizer/src/new_bpe.rs diff --git a/Cargo.lock b/Cargo.lock index 3d9c50fd..0f65cf6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1727,7 +1727,6 @@ dependencies = [ name = "tokenizer" version = "0.0.0" dependencies = [ - "common 0.0.0", "memmap2", "patricia_tree", ] diff --git a/tokenizer/Cargo.toml b/tokenizer/Cargo.toml index 8d421151..8e21281b 100644 --- a/tokenizer/Cargo.toml +++ b/tokenizer/Cargo.toml @@ -7,6 +7,5 @@ authors = ["YdrMaster "] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -common = { path = "../common" } memmap2.workspace = true patricia_tree = "0.8" diff --git a/tokenizer/src/bpe.rs b/tokenizer/src/bpe.rs index e7292275..56e3bd4a 100644 --- a/tokenizer/src/bpe.rs +++ b/tokenizer/src/bpe.rs @@ -1,5 +1,4 @@ -use crate::{decode_with_ascii, Tokenizer}; -use common::utok; +use crate::{decode_with_ascii, utok, Tokenizer}; use std::{io::Result, path::Path}; /// 由 tokenizer.model 文件定义的 bpe 分词器。 @@ -131,37 +130,37 @@ impl Tokenizer for BPE { } } -#[test] -fn read_tokenizer() { - let Some(model_dir) = common::test_model::find() else { - return; - }; - println!("model_dir: {}", model_dir.display()); - - if let Ok(bpe) = BPE::from_model_file(model_dir.join("tokenizer.model")) { - for i in 0..bpe.offsets.len() { - println!("{}: {}", bpe.get_piece(i as utok), bpe.get_score(i as utok)); - } - } -} - -#[test] -fn once_upon_a_time() { - let Some(model_dir) = common::test_model::find() else { - return; - }; - println!("model_dir: {}", model_dir.display()); - - use std::time::Instant; - if let Ok(bpe) = BPE::from_model_file(model_dir.join("tokenizer.model")) { - const PROMPT: &str = "▁Once▁upon▁a▁time,"; - let tokens = bpe.encode(PROMPT); - let t0 = Instant::now(); - for _ in 0..10000 { - let _tokens = bpe.encode(PROMPT); - } - let t1 = Instant::now(); - println!("{:?}", t1 - t0); - assert_eq!(tokens, &[9038, 2501, 263, 931, 29892]); - } -} +// #[test] +// fn read_tokenizer() { +// let Some(model_dir) = common::test_model::find() else { +// return; +// }; +// println!("model_dir: {}", model_dir.display()); + +// if let Ok(bpe) = BPE::from_model_file(model_dir.join("tokenizer.model")) { +// for i in 0..bpe.offsets.len() { +// println!("{}: {}", bpe.get_piece(i as utok), bpe.get_score(i as utok)); +// } +// } +// } + +// #[test] +// fn once_upon_a_time() { +// let Some(model_dir) = common::test_model::find() else { +// return; +// }; +// println!("model_dir: {}", model_dir.display()); + +// use std::time::Instant; +// if let Ok(bpe) = BPE::from_model_file(model_dir.join("tokenizer.model")) { +// const PROMPT: &str = "▁Once▁upon▁a▁time,"; +// let tokens = bpe.encode(PROMPT); +// let t0 = Instant::now(); +// for _ in 0..10000 { +// let _tokens = bpe.encode(PROMPT); +// } +// let t1 = Instant::now(); +// println!("{:?}", t1 - t0); +// assert_eq!(tokens, &[9038, 2501, 263, 931, 29892]); +// } +// } diff --git a/tokenizer/src/lib.rs b/tokenizer/src/lib.rs index 9f6d9baf..74ff9a9a 100644 --- a/tokenizer/src/lib.rs +++ b/tokenizer/src/lib.rs @@ -1,8 +1,11 @@ mod bpe; +mod new_bpe; mod normalizer; mod vocab_txt; -use common::utok; +/// `utok` for token id. +#[allow(non_camel_case_types)] +pub type utok = u32; pub trait Tokenizer { fn vocab_size(&self) -> usize; @@ -14,7 +17,7 @@ pub use bpe::BPE; pub use normalizer::{BPECommonNormalizer, Normalizer}; pub use vocab_txt::VocabTxt; -const fn decode_with_ascii<'a>(piece: &'a str) -> &'a str { +const fn decode_with_ascii(piece: &str) -> &str { // 预填充 ASCII 码表的所有字符 const BYTES: [u8; 256] = { let mut ans = [0; 256]; diff --git a/tokenizer/src/new_bpe.rs b/tokenizer/src/new_bpe.rs new file mode 100644 index 00000000..a6b9d8cf --- /dev/null +++ b/tokenizer/src/new_bpe.rs @@ -0,0 +1,65 @@ +#![allow(unused)] + +use crate::utok; +use std::{pin::Pin, ptr::NonNull}; + +pub struct NewBpe { + vocabs: Pin>, + token_to_piece: Box<[Token]>, + piece_to_token: Box<[utok]>, +} + +struct Token { + ptr: NonNull, + len: u32, + score: f32, +} + +impl AsRef for Token { + #[inline] + fn as_ref(&self) -> &str { + use std::{slice::from_raw_parts, str::from_utf8_unchecked}; + unsafe { from_utf8_unchecked(from_raw_parts(self.ptr.as_ptr(), self.len as _)) } + } +} + +impl NewBpe { + pub fn new<'a>( + vocabs: impl IntoIterator, + scores: impl Iterator, + vocab_size_hint: usize, + ) -> Self { + let mut text_buf = Vec::with_capacity(vocab_size_hint * 4); + let mut token_to_piece = Vec::<(usize, usize)>::with_capacity(vocab_size_hint); + + for vocab in vocabs.into_iter() { + let vocab = vocab.as_bytes(); + let off = text_buf.len(); + let len = vocab.len(); + text_buf.extend_from_slice(vocab); + token_to_piece.push((off, len)); + } + let vocab_size = token_to_piece.len(); + + let vocabs = unsafe { Pin::new_unchecked(text_buf.into_boxed_slice()) }; + let token_to_piece = token_to_piece + .into_iter() + .zip(scores) + .map(|((off, len), score)| Token { + ptr: unsafe { NonNull::new_unchecked(vocabs.as_ptr().add(off).cast_mut()) }, + len: len as _, + score, + }) + .collect::>(); + assert_eq!(token_to_piece.len(), vocab_size); + + let mut piece_to_token = (0..vocab_size as utok).collect::>(); + piece_to_token.sort_by_key(|&i| token_to_piece[i as usize].as_ref()); + + Self { + vocabs, + token_to_piece, + piece_to_token, + } + } +} diff --git a/tokenizer/src/vocab_txt.rs b/tokenizer/src/vocab_txt.rs index 02bbb22e..6c609fe1 100644 --- a/tokenizer/src/vocab_txt.rs +++ b/tokenizer/src/vocab_txt.rs @@ -1,5 +1,4 @@ -use crate::{decode_with_ascii, Tokenizer}; -use common::utok; +use crate::{decode_with_ascii, utok, Tokenizer}; use memmap2::Mmap; use patricia_tree::PatriciaMap; use std::{fs::File, io::Result, path::Path};