
Commit

Merge pull request #12 from vinlemon/feat-face-landmark
Add face landmark task
hydai authored Oct 2, 2024
2 parents 68462d2 + 3605103 commit 8afdca2
Showing 14 changed files with 3,797 additions and 3 deletions.
77 changes: 76 additions & 1 deletion README.md
@@ -33,7 +33,7 @@
* [x] Hand Landmark Detection
* [x] Image Embedding
* [x] Face Detection
* [ ] Face Landmark Detection
* [x] Face Landmark Detection
* [ ] Pose Landmark Detection
* [x] Audio Classification
* [x] Text Classification
@@ -85,6 +85,8 @@ Every task has three types: ```XxxBuilder```, ```Xxx```, ```XxxSession```. (``Xx
* image embedding: `ImageEmbedderBuilder` -> `ImageEmbedder` -> `ImageEmbedderSession`
* image segmentation: `ImageSegmenterBuilder` -> `ImageSegmenter` -> `ImageSegmenterSession`
* object detection: `ObjectDetectorBuilder` -> `ObjectDetector` -> `ObjectDetectorSession`
* face detection: `FaceDetectorBuilder` -> `FaceDetector` -> `FaceDetectorSession`
* face landmark detection: `FaceLandmarkerBuilder` -> `FaceLandmarker` -> `FaceLandmarkerSession`
* audio:
* audio classification: `AudioClassifierBuilder` -> `AudioClassifier` -> `AudioClassifierSession`
* text:
@@ -278,6 +280,79 @@ $ cargo run --release --example gesture_recognition -- ./assets/models/gesture_r
Index: 6
```

### Face Landmark Detection

```rust
use mediapipe_rs::postprocess::utils::DrawLandmarksOptions;
use mediapipe_rs::tasks::vision::{FaceLandmarkConnections, FaceLandmarkerBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
let (model_path, img_path, output_path) = parse_args()?;

let mut input_img = image::open(img_path)?;
let face_landmark_results = FaceLandmarkerBuilder::new()
.num_faces(1) // set max number of faces to detect
.min_face_detection_confidence(0.5)
.min_face_presence_confidence(0.5)
.min_tracking_confidence(0.5)
.output_face_blendshapes(true)
.build_from_file(model_path)? // create a face landmarker
.detect(&input_img)?; // do inference and generate results

// show formatted result message
println!("{}", face_landmark_results);

if let Some(output_path) = output_path {
// draw face landmarks result to image
let options = DrawLandmarksOptions::default()
.connections(FaceLandmarkConnections::get_connections(
&FaceLandmarkConnections::FacemeshTesselation,
))
.landmark_radius_percent(0.003);

for result in face_landmark_results.iter() {
result.draw_with_options(&mut input_img, &options);
}
// save output image
input_img.save(output_path)?;
}

Ok(())
}
```
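
To run the same model over many images, the task only needs to be built once. The following is a minimal sketch of that pattern; it assumes `FaceLandmarker` exposes a `new_session()` method and that the session's `detect()` behaves like the task-level call, following the `XxxBuilder` -> `Xxx` -> `XxxSession` convention listed above.

```rust
use mediapipe_rs::tasks::vision::FaceLandmarkerBuilder;

fn detect_many(model_path: String, img_paths: &[String]) -> Result<(), Box<dyn std::error::Error>> {
    // Build the task once; loading the model is the expensive step.
    let face_landmarker = FaceLandmarkerBuilder::new()
        .num_faces(1)
        .build_from_file(model_path)?;

    // Assumed API: reuse one session for every input image.
    let mut session = face_landmarker.new_session()?;
    for path in img_paths {
        let img = image::open(path)?;
        let results = session.detect(&img)?;
        println!("{}:\n{}", path, results);
    }
    Ok(())
}
```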

Example input (the image is downloaded from https://storage.googleapis.com/mediapipe-assets/portrait.jpg):

<img height="30%" src="https://storage.googleapis.com/mediapipe-assets/portrait.jpg" width="30%" alt="face_detection_full_range_image.jpg" />

Example output in console:

```console
$ cargo run --release --example face_landmark -- ./assets/models/face_landmark/face_landmarker.task ./assets/testdata/img/face.jpg ./assets/doc/face_landmark_output.jpg

Finished release [optimized] target(s) in 4.50s
Running `./scripts/wasmedge-runner.sh target/wasm32-wasi/release/examples/face_landmark.wasm ./assets/models/face_landmark/face_landmarker.task ./assets/testdata/img/face.jpg ./assets/doc/face_landmark_output.jpg`

FaceLandmarkResult #0
Landmarks:
Normalized Landmark #0:
x: 0.49687287
y: 0.24964334
z: -0.029807145
Normalized Landmark #1:
x: 0.49801534
y: 0.22689381
z: -0.05928771
Normalized Landmark #2:
x: 0.49707597
y: 0.23421054
z: -0.03364953
```

Example output image:
<img height="30%" src="./assets/doc/face_landmark_output.jpg" width="30%"/>

### Audio Input

Any audio type that implements the ```AudioData``` trait can be used as input to the audio tasks.
Binary file added assets/doc/face_landmark_output.jpg
49 changes: 49 additions & 0 deletions examples/face_landmark.rs
@@ -0,0 +1,49 @@
fn parse_args() -> Result<(String, String, Option<String>), Box<dyn std::error::Error>> {
let args: Vec<String> = std::env::args().collect();
if args.len() != 3 && args.len() != 4 {
return Err(format!(
"Usage {} model_path image_path [output image path]",
args[0]
)
.into());
}
Ok((args[1].clone(), args[2].clone(), args.get(3).cloned()))
}

use mediapipe_rs::postprocess::utils::DrawLandmarksOptions;
use mediapipe_rs::tasks::vision::{FaceLandmarkConnections, FaceLandmarkerBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
let (model_path, img_path, output_path) = parse_args()?;

let mut input_img = image::open(img_path)?;
let face_landmark_results = FaceLandmarkerBuilder::new()
.num_faces(1) // set max number of faces to detect
.min_face_detection_confidence(0.5)
.min_face_presence_confidence(0.5)
.min_tracking_confidence(0.5)
.output_face_blendshapes(true)
.build_from_file(model_path)? // create a face landmarker
.detect(&input_img)?; // do inference and generate results

// show formatted result message
println!("{}", face_landmark_results);

if let Some(output_path) = output_path {
// draw face landmarks result to image
let options = DrawLandmarksOptions::default()
.connections(FaceLandmarkConnections::get_connections(
&FaceLandmarkConnections::FacemeshTesselation,
))
.landmark_radius_percent(0.003);

for result in face_landmark_results.iter() {
result.draw_with_options(&mut input_img, &options);
}
// save output image
input_img.save(output_path)?;
}

Ok(())
}
25 changes: 23 additions & 2 deletions scripts/download-models.sh
@@ -39,8 +39,13 @@ image_classification_init() {
curl -sLO "${url}"
done

# for custom model downloaded from tf hub
curl -sL "https://tfhub.dev/google/lite-model/aiy/vision/classifier/birds_V1/3?lite-format=tflite" -o "lite-model_aiy_vision_classifier_birds_V1_3.tflite"
# for custom model downloaded from kaggle
bird_model_name="lite-model_aiy_vision_classifier_birds_V1_3"
kaggle_tflite_filename="3.tflite"
curl -sL "https://www.kaggle.com/api/v1/models/google/aiy/tfLite/vision-classifier-birds-v1/3/download" -o "${bird_model_name}.tar.gz"
tar -zxvf "${bird_model_name}.tar.gz"
mv "${kaggle_tflite_filename}" "${bird_model_name}.tflite"
rm -rf "${bird_model_name}.tar.gz"

popd
}
@@ -126,6 +131,21 @@ face_detection_init() {
popd
}

face_landmark_init() {
face_landmark_dir="${model_path}/face_landmark"
mkdir -p "${face_landmark_dir}"
pushd "${face_landmark_dir}"

model_urls=("https://storage.googleapis.com/mediapipe-tasks/face_landmarker/face_landmarker.task"
)

for url in "${model_urls[@]}"; do
curl -sLO "${url}"
done

popd
}


audio_classification_init() {
audio_classification_dir="${model_path}/audio_classification"
@@ -182,6 +202,7 @@ hand_landmark_detection_init
image_segmentation_init
image_embedding_init
face_detection_init
face_landmark_init
audio_classification_init
text_classification_init
text_embedding_init
6 changes: 6 additions & 0 deletions src/postprocess/utils/vision/draw_landmarks.rs
@@ -51,6 +51,12 @@ impl<'a, P: Pixel> DrawLandmarksOptions<'a, P> {
self.visibility_threshold = visibility_threshold;
self
}

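/// Set the radius used to draw each landmark, scaled relative to the image size.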
#[inline(always)]
pub fn landmark_radius_percent(mut self, landmark_radius_percent: f32) -> Self {
self.landmark_radius_percent = landmark_radius_percent;
self
}
}

impl<'a, P: Pixel + DefaultPixel> Default for DrawLandmarksOptions<'a, P> {
158 changes: 158 additions & 0 deletions src/tasks/common/options/face_landmark_options.rs
@@ -0,0 +1,158 @@
#[derive(Clone)]
pub(crate) struct FaceLandmarkOptions {
/// The maximum number of faces that can be detected by the FaceLandmarker.
pub num_faces: i32,

/// The minimum confidence score for the face detection to be considered successful.
pub min_face_detection_confidence: f32,

/// The minimum confidence score of face presence in face landmark detection.
pub min_face_presence_confidence: f32,

/// The minimum confidence score for the face tracking to be considered successful.
pub min_tracking_confidence: f32,

/// Whether Face Landmarker outputs face blendshapes.
/// Face blendshapes are used for rendering the 3D face model.
pub output_face_blendshapes: bool,

/// Whether FaceLandmarker outputs the facial transformation matrix.
/// FaceLandmarker uses the matrix to transform the face landmarks from a canonical face model
/// to the detected face, so users can apply effects on the detected landmarks.
pub output_facial_transformation_matrixes: bool,
}

impl Default for FaceLandmarkOptions {
#[inline(always)]
fn default() -> Self {
Self {
num_faces: 1,
min_face_detection_confidence: 0.5,
min_face_presence_confidence: 0.5,
min_tracking_confidence: 0.5,
output_face_blendshapes: false,
output_facial_transformation_matrixes: false,
}
}
}

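// Builder-style setter methods for FaceLandmarkOptions; expanded into the task builder.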
macro_rules! face_landmark_options_impl {
() => {
/// Set the maximum number of faces that can be detected by the FaceLandmarker.
#[inline(always)]
pub fn num_faces(mut self, num_faces: i32) -> Self {
self.face_landmark_options.num_faces = num_faces;
self
}

/// Set the minimum confidence score for the face detection to be considered successful.
#[inline(always)]
pub fn min_face_detection_confidence(mut self, min_face_detection_confidence: f32) -> Self {
self.face_landmark_options.min_face_detection_confidence =
min_face_detection_confidence;
self
}

/// Set the minimum confidence score of face presence in face landmark detection.
#[inline(always)]
pub fn min_face_presence_confidence(mut self, min_face_presence_confidence: f32) -> Self {
self.face_landmark_options.min_face_presence_confidence = min_face_presence_confidence;
self
}

/// Set the minimum confidence score for the face tracking to be considered successful.
#[inline(always)]
pub fn min_tracking_confidence(mut self, min_tracking_confidence: f32) -> Self {
self.face_landmark_options.min_tracking_confidence = min_tracking_confidence;
self
}

/// Set whether FaceLandmarker outputs face blendshapes.
pub fn output_face_blendshapes(mut self, output_face_blendshapes: bool) -> Self {
self.face_landmark_options.output_face_blendshapes = output_face_blendshapes;
self
}

/// Set whether FaceLandmarker outputs the facial transformation matrix.
pub fn output_facial_transformation_matrixes(
mut self,
output_facial_transformation_matrixes: bool,
) -> Self {
self.face_landmark_options.output_facial_transformation_matrixes =
output_facial_transformation_matrixes;
self
}
};
}

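// Validates the configured FaceLandmarkOptions, returning an ArgumentError for out-of-range values.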
macro_rules! face_landmark_options_check {
( $self:ident ) => {{
if $self.face_landmark_options.num_faces == 0 {
return Err(crate::Error::ArgumentError(
"The number of max faces cannot be zero".into(),
));
}
if $self.face_landmark_options.min_face_presence_confidence < 0.
|| $self.face_landmark_options.min_face_presence_confidence > 1.
{
return Err(crate::Error::ArgumentError(format!(
"The min_face_presence_confidence must in range [0.0, 1.0], but got `{}`",
$self.face_landmark_options.min_face_presence_confidence
)));
}
if $self.face_landmark_options.min_face_detection_confidence < 0.
|| $self.face_landmark_options.min_face_detection_confidence > 1.
{
return Err(crate::Error::ArgumentError(format!(
"The min_face_detection_confidence must in range [0.0, 1.0], but got `{}`",
$self.face_landmark_options.min_face_detection_confidence
)));
}
}};
}

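// Read-only getters exposing the FaceLandmarkOptions stored in the task's build options.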
macro_rules! face_landmark_options_get_impl {
() => {
/// Get the maximum number of faces that can be detected by the FaceLandmarker.
#[inline(always)]
pub fn num_faces(&self) -> i32 {
self.build_options.face_landmark_options.num_faces
}

/// Get the minimum confidence score for the face detection to be considered successful.
#[inline(always)]
pub fn min_face_detection_confidence(&self) -> f32 {
self.build_options
.face_landmark_options
.min_face_detection_confidence
}

/// Get the minimum confidence score of face presence in face landmark detection.
#[inline(always)]
pub fn min_face_presence_confidence(&self) -> f32 {
self.build_options
.face_landmark_options
.min_face_presence_confidence
}

/// Get the minimum confidence score for the face tracking to be considered successful.
#[inline(always)]
pub fn min_tracking_confidence(&self) -> f32 {
self.build_options
.face_landmark_options
.min_tracking_confidence
}

/// Get whether FaceLandmarker outputs face blendshapes.
pub fn output_face_blendshapes(&self) -> bool {
self.build_options.face_landmark_options.output_face_blendshapes
}

/// Get whether FaceLandmarker outputs the facial transformation matrix.
pub fn output_facial_transformation_matrixes(&self) -> bool {
self.build_options
.face_landmark_options
.output_facial_transformation_matrixes
}
};
}
5 changes: 5 additions & 0 deletions src/tasks/common/options/mod.rs
@@ -12,9 +12,14 @@ mod embedding_options;
#[macro_use]
#[cfg(feature = "vision")]
mod hand_landmark_options;
#[macro_use]
#[cfg(feature = "vision")]
mod face_landmark_options;

pub(crate) use base_task_options::BaseTaskOptions;
pub(crate) use classification_options::ClassificationOptions;
pub(crate) use embedding_options::EmbeddingOptions;
#[cfg(feature = "vision")]
pub(crate) use hand_landmark_options::HandLandmarkOptions;
#[cfg(feature = "vision")]
pub(crate) use face_landmark_options::FaceLandmarkOptions;