Documentation Pass for Models #2617

Merged: 17 commits, Nov 15, 2024
7 changes: 3 additions & 4 deletions candle-transformers/src/models/based.rs
@@ -1,10 +1,9 @@
//! The Based model from the Stanford Hazy Research group.
//!
//! See "Simple linear attention language models balance the recall-throughput tradeoff", Arora et al. 2024
//! <https://arxiv.org/abs/2402.18668>

//! Original code:
//! https://github.com/HazyResearch/based
//! - [Arxiv](https://arxiv.org/abs/2402.18668)
//! - [Github](https://github.com/HazyResearch/based)
//!

use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
use candle_nn::{
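The recall-throughput paper cited above is about linear attention. As a companion to the doc links, here is a minimal sketch of the non-causal linear-attention identity the model builds on, using a first-order feature map as a stand-in (Based actually uses a second-order Taylor approximation of the exponential); `linear_attention` is a hypothetical helper, not an item in this module.

```rust
use candle::{Result, Tensor};

/// Non-causal linear attention: with a feature map phi, softmax attention's
/// O(n^2) cost becomes O(n) via phi(Q) (phi(K)^T V), normalized per query.
/// Here phi(x) = 1 + x stands in for Based's second-order Taylor map.
fn linear_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
    let phi_q = (q + 1.0)?; // (n, d)
    let phi_k = (k + 1.0)?; // (n, d)
    let kv = phi_k.t()?.matmul(v)?; // (d, dv): summed key-value outer products
    let num = phi_q.matmul(&kv)?; // (n, dv)
    let den = phi_q.matmul(&phi_k.t()?.sum_keepdim(1)?)?; // (n, 1) normalizer
    num.broadcast_div(&den)
}
```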
7 changes: 7 additions & 0 deletions candle-transformers/src/models/beit.rs
@@ -1,3 +1,10 @@
//! Based on the BEiT vision model.
//!
//! See "BEIT: BERT Pre-Training of Image Transformers", Bao et al. 2021
//! - [Arxiv](https://arxiv.org/abs/2106.08254)
//! - [Github](https://github.com/microsoft/unilm/tree/master/beit)
//!

use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, Linear, Module, VarBuilder};

6 changes: 6 additions & 0 deletions candle-transformers/src/models/bert.rs
@@ -1,3 +1,9 @@
//! BERT (Bidirectional Encoder Representations from Transformers)
//!
//! See "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", Devlin et al. 2018
//! - [Arxiv](https://arxiv.org/abs/1810.04805)
//! - [Github](https://github.com/google-research/bert)
//!
use super::with_tracing::{layer_norm, linear, LayerNorm, Linear};
use candle::{DType, Device, Result, Tensor};
use candle_nn::{embedding, Embedding, Module, VarBuilder};
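Since the new docs only cite the paper, a hedged usage sketch may help readers. It assumes a local `config.json`/`model.safetensors` pair and the two-argument `forward` of this crate around the time of this PR; newer candle versions also accept an optional attention mask, so treat the signature as an assumption.

```rust
use anyhow::Result;
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::models::bert::{BertModel, Config};

fn embed(tokens: &[u32]) -> Result<Tensor> {
    let device = Device::Cpu;
    // Placeholder local files; in practice these come from the HF hub.
    let config: Config = serde_json::from_str(&std::fs::read_to_string("config.json")?)?;
    let vb = unsafe {
        VarBuilder::from_mmaped_safetensors(&["model.safetensors"], DType::F32, &device)?
    };
    let model = BertModel::load(vb, &config)?;
    let input_ids = Tensor::new(tokens, &device)?.unsqueeze(0)?;
    let token_type_ids = input_ids.zeros_like()?;
    // Returns the per-token hidden states, shape (1, seq_len, hidden_size).
    let hidden = model.forward(&input_ids, &token_type_ids)?;
    Ok(hidden)
}
```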
7 changes: 7 additions & 0 deletions candle-transformers/src/models/bigcode.rs
@@ -1,3 +1,10 @@
//! BigCode implementation in Rust based on the GPT-BigCode model.
//!
//! See "StarCoder: A State-of-the-Art LLM for Code", Mukherjee et al. 2023
//! - [Arxiv](https://arxiv.org/abs/2305.06161)
//! - [Github](https://github.com/bigcode-project/starcoder)
//!

use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{embedding, linear_b as linear, Embedding, LayerNorm, Linear, Module, VarBuilder};

7 changes: 7 additions & 0 deletions candle-transformers/src/models/blip.rs
@@ -1,3 +1,10 @@
//! Based on the BLIP paper from Salesforce Research.
//!
//! See "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation"
//! - [Arxiv](https://arxiv.org/abs/2201.12086)
//! - [Github](https://github.com/salesforce/BLIP)
//!

use super::blip_text;
use super::with_tracing::{conv2d, linear, Conv2d, Linear};
use candle::{Module, Result, Tensor, D};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/blip_text.rs
@@ -1,3 +1,9 @@
//! Implementation of BLIP text encoder/decoder.
//!
//! See "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation"
//! - [Arxiv](https://arxiv.org/abs/2201.12086)
//!

use super::with_tracing::{linear, Embedding, Linear};
use candle::{Module, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, VarBuilder};
7 changes: 7 additions & 0 deletions candle-transformers/src/models/chatglm.rs
@@ -1,3 +1,10 @@
//! Implementation of the ChatGLM2/3 models from THUDM.
//!
//! See:
//! - ChatGLM3: ["ChatGLM3: Advancing Multilingual Conversational Language Models with High-Quality Data"](https://github.com/THUDM/ChatGLM3)
//! - ChatGLM2: ["ChatGLM2: An Open Bilingual Chat LLM"](https://github.com/THUDM/ChatGLM2-6B)
//!

use crate::models::with_tracing::{linear_b as linear, Linear};
use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
use candle_nn::VarBuilder;
5 changes: 3 additions & 2 deletions candle-transformers/src/models/chinese_clip/mod.rs
@@ -3,8 +3,9 @@
//! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
//! pairs of images with related texts.
//!
//! https://github.com/OFA-Sys/Chinese-CLIP
//! https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py
//! - [GH Link](https://github.com/OFA-Sys/Chinese-CLIP)
//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
//!

use candle::{Module, Result, Tensor, D};
use candle_nn as nn;
5 changes: 3 additions & 2 deletions candle-transformers/src/models/clip/mod.rs
@@ -3,8 +3,9 @@
//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
//! pairs of images with related texts.
//!
//! https://github.com/openai/CLIP
//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip
//! - [GH Link](https://github.com/openai/CLIP)
//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip)

use self::{
text_model::{Activation, ClipTextTransformer},
vision_model::ClipVisionTransformer,
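CLIP scores image/text pairs by cosine similarity of their embeddings. Below is a small sketch of that scoring step using only tensor ops from this crate; `clip_scores` is a hypothetical helper, and the real model additionally multiplies by a learned logit temperature.

```rust
use candle::{Result, Tensor};

/// Cosine-similarity scores between a batch of image embeddings (n, d) and
/// text embeddings (m, d): L2-normalize both, then take the inner products.
fn clip_scores(image_emb: &Tensor, text_emb: &Tensor) -> Result<Tensor> {
    let img = image_emb.broadcast_div(&image_emb.sqr()?.sum_keepdim(1)?.sqrt()?)?;
    let txt = text_emb.broadcast_div(&text_emb.sqr()?.sum_keepdim(1)?.sqrt()?)?;
    img.matmul(&txt.t()?) // (n, m) similarity matrix
}
```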
7 changes: 7 additions & 0 deletions candle-transformers/src/models/codegeex4_9b.rs
@@ -1,3 +1,10 @@
//! CodeGeeX4 - A multi-language code generation model
//!
//! See "CodeGeeX: A Pre-Trained Model For Code Generation with Multilingual Evaluations on HumanEval-X", Qian et al. 2023
//! - [Arxiv](https://arxiv.org/abs/2303.17568)
//! - [Github](https://github.com/THUDM/CodeGeeX)
//!

use crate::models::with_tracing::{linear_b as linear, Linear};
use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
use candle_nn::VarBuilder;
5 changes: 5 additions & 0 deletions candle-transformers/src/models/colpali.rs
@@ -1,3 +1,8 @@
//! ColPali model for text/image similarity scoring.
//!
//! ColPali combines a vision encoder with an efficient language model for retrieving content from document page images.
//!

use candle::{Module, Result, Tensor};
use candle_nn::VarBuilder;
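ColPali retrieves with late interaction: each query-token embedding is matched against every page-patch embedding, and the per-token maxima are summed. A sketch of that MaxSim score with plain tensor ops; `maxsim` is illustrative, not part of this module.

```rust
use candle::{Result, Tensor};

/// Late-interaction (MaxSim) score: `query` is (nq, d) token embeddings,
/// `doc` is (np, d) patch embeddings. For every query token take the best
/// matching patch, then sum over query tokens; returns a scalar tensor.
fn maxsim(query: &Tensor, doc: &Tensor) -> Result<Tensor> {
    let sim = query.matmul(&doc.t()?)?; // (nq, np) dot products
    sim.max(1)?.sum(0) // max over patches, summed over query tokens
}
```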

7 changes: 7 additions & 0 deletions candle-transformers/src/models/convmixer.rs
@@ -1,3 +1,10 @@
//! ConvMixer implementation.
//!
//! See "Patches Are All You Need?" by Trockman et al. 2022
//! - [Arxiv](https://arxiv.org/abs/2201.09792)
//! - [Github](https://github.com/locuslab/convmixer)
//!

use candle::Result;
use candle_nn::{batch_norm, Conv2dConfig, Module, VarBuilder};
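The ConvMixer paper's block is just a residual depthwise convolution followed by a pointwise one. Here is a compact candle sketch of one such block, with the paper's BatchNorm layers omitted for brevity, so this is an approximation rather than the crate's implementation.

```rust
use candle::{Module, Result, Tensor};
use candle_nn::{conv2d, Conv2d, Conv2dConfig, VarBuilder};

/// One ConvMixer block: depthwise conv with a residual connection, then a
/// pointwise conv, each followed by GELU. BatchNorm omitted for brevity.
struct MixerBlock {
    depthwise: Conv2d,
    pointwise: Conv2d,
}

impl MixerBlock {
    fn new(dim: usize, kernel: usize, vb: VarBuilder) -> Result<Self> {
        let dw_cfg = Conv2dConfig {
            padding: kernel / 2,
            groups: dim, // depthwise: one filter per channel
            ..Default::default()
        };
        let depthwise = conv2d(dim, dim, kernel, dw_cfg, vb.pp("dw"))?;
        let pointwise = conv2d(dim, dim, 1, Default::default(), vb.pp("pw"))?;
        Ok(Self { depthwise, pointwise })
    }
}

impl Module for MixerBlock {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let residual = xs;
        let xs = (self.depthwise.forward(xs)?.gelu()? + residual)?;
        self.pointwise.forward(&xs)?.gelu()
    }
}
```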

14 changes: 6 additions & 8 deletions candle-transformers/src/models/convnext.rs
@@ -1,15 +1,13 @@
//! ConvNeXt implementation.
//!
//! See "A ConvNet for the 2020s" Liu et al. 2022
//! <https://arxiv.org/abs/2201.03545>
//! See ["A ConvNet for the 2020s" Liu et al. 2022](https://arxiv.org/abs/2201.03545)
//! and
//! "ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders" Woo et al. 2023
//! <https://arxiv.org/abs/2301.00808>

//! ["ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders" Woo et al. 2023](https://arxiv.org/abs/2301.00808)
//!
//! Original code:
//! https://github.com/facebookresearch/ConvNeXt/
//! https://github.com/facebookresearch/ConvNeXt-V2/
//! timm: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py
//! - [ConvNeXt](https://github.com/facebookresearch/ConvNeXt/)
//! - [ConvNeXt-V2](https://github.com/facebookresearch/ConvNeXt-V2/)
//! - [timm](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py)

use candle::shape::ShapeWithOneHole;
use candle::{Result, D};
7 changes: 6 additions & 1 deletion candle-transformers/src/models/dac.rs
@@ -1,4 +1,9 @@
/// Adapted from https://github.com/descriptinc/descript-audio-codec
//! Implementation of the Descript Audio Codec (DAC) model
//!
//! See: [Descript Audio Codec](https://github.com/descriptinc/descript-audio-codec)
//!
//! An efficient neural codec for compressing/decompressing audio.
//!
use crate::models::encodec;
use candle::{IndexOp, Result, Tensor, D};
use candle_nn::{Conv1d, Conv1dConfig, ConvTranspose1d, ConvTranspose1dConfig, VarBuilder};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/depth_anything_v2.rs
@@ -1,3 +1,9 @@
//! Implementation of the Depth Anything model, a monocular depth estimation network.
//!
//! See:
//! - ["Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data"](https://github.com/LiheYoung/Depth-Anything)
//!

use candle::D::Minus1;
use candle::{Module, Result, Tensor};
use candle_nn::ops::Identity;
5 changes: 5 additions & 0 deletions candle-transformers/src/models/dinov2.rs
@@ -1,3 +1,8 @@
//! Implementation of the DINOv2 models from Meta Research.
//!
//! See:
//! - DINOv2: ["DINOv2: Learning Robust Visual Features without Supervision"](https://github.com/facebookresearch/dinov2)
//!
use candle::{IndexOp, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, Linear, Module, VarBuilder};
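Vision backbones like DINOv2 expect ImageNet-normalized CHW input. A sketch of that preprocessing with plain candle ops, assuming HWC `u8` RGB data; the mean/std constants are the standard ImageNet ones.

```rust
use candle::{DType, Device, Result, Tensor};

/// Scale u8 RGB data to [0, 1], normalize with the ImageNet mean/std, and
/// add a batch dimension. `data` is HWC u8; the result is (1, 3, H, W).
fn preprocess(data: &[u8], h: usize, w: usize, device: &Device) -> Result<Tensor> {
    let img = Tensor::from_slice(data, (h, w, 3), device)?
        .permute((2, 0, 1))? // HWC -> CHW
        .to_dtype(DType::F32)?
        / 255.0;
    let mean = Tensor::new(&[0.485f32, 0.456, 0.406], device)?.reshape((3, 1, 1))?;
    let std = Tensor::new(&[0.229f32, 0.224, 0.225], device)?.reshape((3, 1, 1))?;
    img?.broadcast_sub(&mean)?.broadcast_div(&std)?.unsqueeze(0)
}
```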

7 changes: 7 additions & 0 deletions candle-transformers/src/models/dinov2reg4.rs
@@ -1,3 +1,10 @@
//! Implementation of the DINOv2 revision with register tokens.
//!
//! See:
//! - DINOv2: ["DINOv2: Learning Robust Visual Features without Supervision"](https://github.com/facebookresearch/dinov2)
//! - Registers: ["Vision Transformers Need Registers"](https://arxiv.org/abs/2309.16588)
//!
//! This code implements the variant with 4 register tokens.
//!
use candle::{IndexOp, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, Linear, Module, VarBuilder};

5 changes: 5 additions & 0 deletions candle-transformers/src/models/distilbert.rs
@@ -1,3 +1,8 @@
//! Implementation of DistilBert, a distilled version of BERT.
//!
//! See:
//! - ["DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter"](https://arxiv.org/abs/1910.01108)
//!
use super::with_tracing::{layer_norm, linear, LayerNorm, Linear};
use candle::{DType, Device, Result, Tensor};
use candle_nn::{Embedding, Module, VarBuilder};
5 changes: 5 additions & 0 deletions candle-transformers/src/models/efficientnet.rs
@@ -1,3 +1,8 @@
//! Implementation of EfficientNet, a family of efficient convolutional networks for computer vision tasks.
//!
//! See:
//! - ["EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks"](https://arxiv.org/abs/1905.11946)
//!
use candle::{Result, Tensor, D};
use candle_nn as nn;
use nn::{Module, VarBuilder};
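EfficientNet's defining idea is compound scaling: depth, width, and resolution grow together from one coefficient phi, with the paper's constants alpha = 1.2, beta = 1.1, gamma = 1.15 chosen so that alpha * beta^2 * gamma^2 is roughly 2. A one-liner makes the arithmetic concrete:

```rust
/// EfficientNet compound scaling: returns the (depth, width, resolution)
/// multipliers for a given compound coefficient `phi`, using the constants
/// from the paper (alpha = 1.2, beta = 1.1, gamma = 1.15).
fn compound_scaling(phi: f64) -> (f64, f64, f64) {
    (1.2f64.powf(phi), 1.1f64.powf(phi), 1.15f64.powf(phi))
}

fn main() {
    // EfficientNet-B1 corresponds to phi = 1: ~1.2x deeper, 1.1x wider,
    // and 1.15x higher input resolution than B0.
    let (d, w, r) = compound_scaling(1.0);
    println!("depth x{d:.2}, width x{w:.2}, resolution x{r:.2}");
}
```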
7 changes: 3 additions & 4 deletions candle-transformers/src/models/efficientvit.rs
@@ -1,9 +1,8 @@
//! EfficientViT (MSRA) inference implementation based on timm.
//!
//! See "EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention"
//! https://arxiv.org/abs/2305.07027

//! https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/efficientvit_msra.py
//! See ["EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention"](https://arxiv.org/abs/2305.07027)
//!
//! Based on implementation from [pytorch-image-models](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/efficientvit_msra.py)

use candle::{Result, Tensor, D};
use candle_nn::{
6 changes: 6 additions & 0 deletions candle-transformers/src/models/encodec.rs
@@ -1,3 +1,9 @@
//! EnCodec neural audio codec from Meta AI.
//!
//! See ["High Fidelity Neural Audio Compression"](https://arxiv.org/abs/2210.13438)
//!
//! Based on implementation from [huggingface/transformers](https://github.com/huggingface/transformers/blob/main/src/transformers/models/encodec/modeling_encodec.py)

#![allow(unused)]
use candle::{DType, IndexOp, Layout, Module, Result, Shape, Tensor, D};
use candle_nn::{conv1d, Conv1d, Conv1dConfig, ConvTranspose1d, VarBuilder};
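EnCodec quantizes latents with a stack of residual vector quantizers. A sketch of a single nearest-codebook step, written with generic tensor ops rather than this module's actual types; `rvq_step` is a hypothetical helper.

```rust
use candle::{Result, Tensor};

/// One residual-VQ step: find the nearest codebook row for every frame,
/// return the code indices and the residual for the next quantizer.
/// `x` is (t, d) latent frames, `codebook` is (k, d) code vectors.
fn rvq_step(x: &Tensor, codebook: &Tensor) -> Result<(Tensor, Tensor)> {
    // argmin of ||x - c||^2 == argmin of ||c||^2 - 2 x.c (||x||^2 is constant).
    let dots = x.matmul(&codebook.t()?)?; // (t, k)
    let c2 = codebook.sqr()?.sum_keepdim(1)?.t()?; // (1, k)
    let dist = c2.broadcast_sub(&(dots * 2.0)?)?; // (t, k)
    let idx = dist.argmin(1)?; // (t,) code index per frame
    let quantized = codebook.index_select(&idx, 0)?; // (t, d)
    let residual = (x - &quantized)?;
    Ok((idx, residual))
}
```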
6 changes: 6 additions & 0 deletions candle-transformers/src/models/eva2.rs
@@ -1,3 +1,9 @@
//! EVA-2 inference implementation.
//!
//! See ["EVA-02: A Visual Representation for Neon Genesis"](https://arxiv.org/abs/2303.11331)
//!
//! Based on implementation from [pytorch-image-models](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/eva2.py)

use candle::{IndexOp, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, Linear, Module, VarBuilder};

6 changes: 6 additions & 0 deletions candle-transformers/src/models/falcon.rs
@@ -1,3 +1,9 @@
//! Falcon language model inference implementation
//!
//! See ["Falcon: a new approach to large language models"](https://huggingface.co/blog/falcon)
//!
//! Based on implementation from [Huggingface Transformers](https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon)

use candle::{DType, Device, Result, Tensor, D};
use candle_nn::{embedding, linear_b as linear, Embedding, LayerNorm, Linear, Module, VarBuilder};
use serde::Deserialize;
8 changes: 4 additions & 4 deletions candle-transformers/src/models/fastvit.rs
@@ -1,9 +1,9 @@
//! FastViT inference implementation based on timm
//! # FastViT inference implementation based on timm
//!
//! See "FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization"
//! https://arxiv.org/pdf/2303.14189
//! ## Description
//! See ["FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization"](https://arxiv.org/pdf/2303.14189)
//!
//! https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/fastvit.py
//! Implementation based on [timm model](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/fastvit.py)

use candle::{DType, Result, Tensor, D};
use candle_nn::{
7 changes: 7 additions & 0 deletions candle-transformers/src/models/flux/mod.rs
@@ -1,3 +1,10 @@
//! Flux Model
//!
//! Flux is a series of text-to-image generation models based on diffusion transformers.
//!
//! - [GH Link](https://github.com/black-forest-labs/flux)
//!
use candle::{Result, Tensor};

pub trait WithForward {
6 changes: 6 additions & 0 deletions candle-transformers/src/models/gemma.rs
@@ -1,3 +1,9 @@
//! Gemma inference implementation.
//!
//! See ["Gemma: Open Models Based on Gemini Technology"](https://blog.google/technology/developers/gemma-open-ai-model/)
//!
//! Based on implementations from Google and PyTorch

use std::sync::Arc;

use candle::{DType, Device, Module, Result, Tensor, D};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/gemma2.rs
@@ -1,3 +1,9 @@
//! Gemma 2 LLM architecture (Google) inference implementation.
//!
//! See ["Gemma: Open Models Based on Gemini Technology"](https://blog.google/technology/developers/gemma-open-models/)
//!
//! Based on implementations from Google and OpenLLM

use std::sync::Arc;

use candle::{DType, Device, Module, Result, Tensor, D};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/glm4.rs
@@ -1,3 +1,9 @@
//! GLM-4 inference implementation.
//!
//! An open bilingual (Chinese/English) language model from THUDM.
//!
//! Based on implementation from [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B)

use crate::models::with_tracing::{linear_b as linear, Linear};
use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
use candle_nn::VarBuilder;
7 changes: 7 additions & 0 deletions candle-transformers/src/models/granite.rs
@@ -1,3 +1,10 @@
//! Granite is a Long Context Transformer Language Model.
//!
//! A high-performance transformer model optimized for efficient processing
//! of very long context sequences.
//!
//! Based on implementation from [Nod.ai](https://github.com/nod-ai/granite)

use super::with_tracing::{linear_no_bias as linear, Linear, RmsNorm};
use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{embedding, Embedding, Module, VarBuilder};
8 changes: 4 additions & 4 deletions candle-transformers/src/models/hiera.rs
@@ -1,9 +1,9 @@
//! Hiera inference implementation based on timm.
//! [Hiera] inference implementation based on timm.
//!
//! See "Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles"
//! https://arxiv.org/abs/2306.00989
//! See "[Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles]"
//! [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles]: https://arxiv.org/abs/2306.00989
//!
//! https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/hiera.py
//! [Hiera]: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/hiera.py

use candle::{Result, D};
use candle_nn::{conv2d, layer_norm, linear, ops::softmax, Conv2dConfig, Func, VarBuilder};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/jina_bert.rs
@@ -1,3 +1,9 @@
//! # JinaBERT inference implementation
//!
//! Based on the Hugging Face implementation of Jina BERT and its variants
//!
//! See: [Jina Embeddings on HuggingFace](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)

use super::with_tracing::{linear, linear_no_bias, Embedding, Linear};
use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{layer_norm, LayerNorm, Module, VarBuilder};
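Embedding models like Jina BERT are typically mean-pooled over tokens to get one vector per input. A minimal sketch of that pooling, assuming (batch, seq, dim) hidden states:

```rust
use candle::{Result, Tensor};

/// Mean pooling over the sequence axis: turns per-token hidden states of
/// shape (batch, seq, dim) into one embedding of shape (batch, dim).
fn mean_pool(hidden: &Tensor) -> Result<Tensor> {
    let (_batch, seq, _dim) = hidden.dims3()?;
    hidden.sum(1)? / seq as f64
}
```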
6 changes: 6 additions & 0 deletions candle-transformers/src/models/llama.rs
@@ -1,3 +1,9 @@
//! Llama inference implementation.
//!
//! See ["LLaMA: Open and Efficient Foundation Language Models"](https://arxiv.org/abs/2302.13971)
//!
//! Implementation based on Hugging Face's [transformers](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py)

use super::with_tracing::{linear_no_bias as linear, Linear, RmsNorm};
use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{embedding, Embedding, Module, VarBuilder};
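A hedged sketch of driving this module for one forward pass, based on the `load`/`Cache::new`/`forward` entry points as they exist around this PR; the file name is a placeholder and the signatures may drift between candle versions.

```rust
use anyhow::Result;
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::models::llama::{Cache, Config, Llama};

fn next_token_logits(prompt: &[u32]) -> Result<Tensor> {
    let device = Device::Cpu;
    let config = Config::config_7b_v2(false); // false = no flash-attn
    let vb = unsafe {
        VarBuilder::from_mmaped_safetensors(&["model.safetensors"], DType::F32, &device)?
    };
    let model = Llama::load(vb, &config)?;
    let mut cache = Cache::new(true, DType::F32, &config, &device)?;
    let input = Tensor::new(prompt, &device)?.unsqueeze(0)?;
    // The first call processes the whole prompt at position 0; generation
    // then feeds one token at a time with an advancing index_pos.
    let logits = model.forward(&input, 0, &mut cache)?;
    Ok(logits)
}
```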
6 changes: 6 additions & 0 deletions candle-transformers/src/models/llama2_c.rs
@@ -1,3 +1,9 @@
//! Llama2 inference implementation.
//!
//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://arxiv.org/abs/2307.09288)
//!
//! Based on the [llama2.c](https://github.com/karpathy/llama2.c) implementation

use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::linear_no_bias as linear;
use candle_nn::{embedding, rms_norm, Embedding, Linear, Module, RmsNorm, VarBuilder};
6 changes: 6 additions & 0 deletions candle-transformers/src/models/llama2_c_weights.rs
@@ -1,3 +1,9 @@
//! Llama2 inference implementation.
//!
//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://arxiv.org/abs/2307.09288)
//!
//! Based on the [llama2.c](https://github.com/karpathy/llama2.c) implementation

use byteorder::{LittleEndian, ReadBytesExt};
use candle::{DType, Device, IndexOp, Result, Shape, Tensor};
use candle_nn::VarBuilder;
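llama2.c checkpoints are raw little-endian binaries: seven `i32` config fields, then `f32` weights, which is why this file pulls in `byteorder`. A sketch of reading the header under that assumption:

```rust
use anyhow::Result;
use byteorder::{LittleEndian, ReadBytesExt};
use std::fs::File;
use std::io::BufReader;

/// Reads the seven-field llama2.c header: dim, hidden_dim, n_layers,
/// n_heads, n_kv_heads, vocab_size, seq_len (all little-endian i32).
/// The f32 weight tensors follow immediately after.
fn read_header(path: &str) -> Result<[i32; 7]> {
    let mut reader = BufReader::new(File::open(path)?);
    let mut header = [0i32; 7];
    for field in header.iter_mut() {
        *field = reader.read_i32::<LittleEndian>()?;
    }
    Ok(header)
}
```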
10 changes: 10 additions & 0 deletions candle-transformers/src/models/llava/mod.rs
@@ -1,3 +1,13 @@
//! The LLaVA (Large Language and Vision Assistant) model.
//!
//! This provides the main model implementation combining a vision tower (CLIP) with a
//! language model (Llama) for multimodal capabilities.
//!
//! The architecture implements the training-free projection technique from the paper:
//! [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485).
//!
//! - [GH Link](https://github.com/haotian-liu/LLaVA/tree/main)
//!
pub mod config;
pub mod utils;
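The projection technique mentioned above maps CLIP patch features into the Llama embedding space with a small MLP (the `mlp2x_gelu` variant is linear-GELU-linear). A hedged sketch of such a projector; type and weight names here are illustrative, not the module's actual ones.

```rust
use candle::{Module, Result, Tensor};
use candle_nn::{linear, Linear, VarBuilder};

/// A 2-layer GELU MLP projecting vision features (dim `v`) into the
/// language model's embedding space (dim `d`).
struct Projector {
    fc1: Linear,
    fc2: Linear,
}

impl Projector {
    fn new(v: usize, d: usize, vb: VarBuilder) -> Result<Self> {
        Ok(Self {
            fc1: linear(v, d, vb.pp("linear_1"))?,
            fc2: linear(d, d, vb.pp("linear_2"))?,
        })
    }
}

impl Module for Projector {
    fn forward(&self, image_features: &Tensor) -> Result<Tensor> {
        self.fc1.forward(image_features)?.gelu()?.apply(&self.fc2)
    }
}
```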
